mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-17 07:17:06 +01:00
Compare commits
1 Commits
DIRAC-ITT-
...
feature/ei
Author | SHA1 | Date | |
---|---|---|---|
86a9cc8c27 |
@ -9,6 +9,11 @@ matrix:
|
|||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode8.3
|
osx_image: xcode8.3
|
||||||
compiler: clang
|
compiler: clang
|
||||||
|
env: PREC=single
|
||||||
|
- os: osx
|
||||||
|
osx_image: xcode8.3
|
||||||
|
compiler: clang
|
||||||
|
env: PREC=double
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- export GRIDDIR=`pwd`
|
- export GRIDDIR=`pwd`
|
||||||
@ -50,7 +55,7 @@ script:
|
|||||||
- make -j4
|
- make -j4
|
||||||
- make install
|
- make install
|
||||||
- cd $CWD/build
|
- cd $CWD/build
|
||||||
- ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
|
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
||||||
- make check
|
- make check
|
||||||
|
@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/perfmon/PerfCount.h>
|
#include <Grid/perfmon/PerfCount.h>
|
||||||
#include <Grid/util/Util.h>
|
#include <Grid/util/Util.h>
|
||||||
#include <Grid/log/Log.h>
|
#include <Grid/log/Log.h>
|
||||||
#include <Grid/allocator/Allocator.h>
|
#include <Grid/allocator/AlignedAllocator.h>
|
||||||
#include <Grid/simd/Simd.h>
|
#include <Grid/simd/Simd.h>
|
||||||
#include <Grid/threads/ThreadReduction.h>
|
#include <Grid/threads/Threads.h>
|
||||||
#include <Grid/serialisation/Serialisation.h>
|
#include <Grid/serialisation/Serialisation.h>
|
||||||
#include <Grid/util/Sha.h>
|
#include <Grid/util/Sha.h>
|
||||||
#include <Grid/communicator/Communicator.h>
|
#include <Grid/communicator/Communicator.h>
|
||||||
|
@ -6,7 +6,6 @@
|
|||||||
///////////////////
|
///////////////////
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include <memory>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@ -18,28 +18,21 @@
|
|||||||
#pragma push_macro("__CUDA_ARCH__")
|
#pragma push_macro("__CUDA_ARCH__")
|
||||||
#pragma push_macro("__NVCC__")
|
#pragma push_macro("__NVCC__")
|
||||||
#pragma push_macro("__CUDACC__")
|
#pragma push_macro("__CUDACC__")
|
||||||
#undef __CUDA_ARCH__
|
|
||||||
#undef __NVCC__
|
#undef __NVCC__
|
||||||
#undef __CUDACC__
|
#undef __CUDACC__
|
||||||
|
#undef __CUDA_ARCH__
|
||||||
#define __NVCC__REDEFINE__
|
#define __NVCC__REDEFINE__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* SYCL save and restore compile environment*/
|
/* SYCL save and restore compile environment*/
|
||||||
#ifdef GRID_SYCL
|
#ifdef __SYCL_DEVICE_ONLY__
|
||||||
#pragma push
|
#pragma push
|
||||||
#pragma push_macro("__SYCL_DEVICE_ONLY__")
|
#pragma push_macro("__SYCL_DEVICE_ONLY__")
|
||||||
#undef __SYCL_DEVICE_ONLY__
|
#undef __SYCL_DEVICE_ONLY__
|
||||||
|
#undef EIGEN_USE_SYCL
|
||||||
#define EIGEN_DONT_VECTORIZE
|
#define EIGEN_DONT_VECTORIZE
|
||||||
//#undef EIGEN_USE_SYCL
|
|
||||||
#define __SYCL__REDEFINE__
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* HIP save and restore compile environment*/
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
#pragma push
|
|
||||||
#pragma push_macro("__HIP_DEVICE_COMPILE__")
|
|
||||||
#endif
|
|
||||||
#define EIGEN_NO_HIP
|
|
||||||
|
|
||||||
#include <Grid/Eigen/Dense>
|
#include <Grid/Eigen/Dense>
|
||||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||||
@ -58,12 +51,6 @@
|
|||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*HIP restore*/
|
|
||||||
#ifdef __HIP__REDEFINE__
|
|
||||||
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
|
|
||||||
#pragma pop
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
@ -21,7 +21,7 @@ if BUILD_HDF5
|
|||||||
extra_headers+=serialisation/Hdf5Type.h
|
extra_headers+=serialisation/Hdf5Type.h
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all: version-cache Version.h
|
all: version-cache
|
||||||
|
|
||||||
version-cache:
|
version-cache:
|
||||||
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
|
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
|
||||||
@ -42,7 +42,7 @@ version-cache:
|
|||||||
fi;\
|
fi;\
|
||||||
rm -f vertmp
|
rm -f vertmp
|
||||||
|
|
||||||
Version.h: version-cache
|
Version.h:
|
||||||
cp version-cache Version.h
|
cp version-cache Version.h
|
||||||
|
|
||||||
.PHONY: version-cache
|
.PHONY: version-cache
|
||||||
|
@ -29,11 +29,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHMS_H
|
#ifndef GRID_ALGORITHMS_H
|
||||||
#define GRID_ALGORITHMS_H
|
#define GRID_ALGORITHMS_H
|
||||||
|
|
||||||
NAMESPACE_CHECK(algorithms);
|
|
||||||
#include <Grid/algorithms/SparseMatrix.h>
|
#include <Grid/algorithms/SparseMatrix.h>
|
||||||
#include <Grid/algorithms/LinearOperator.h>
|
#include <Grid/algorithms/LinearOperator.h>
|
||||||
#include <Grid/algorithms/Preconditioner.h>
|
#include <Grid/algorithms/Preconditioner.h>
|
||||||
NAMESPACE_CHECK(SparseMatrix);
|
|
||||||
|
|
||||||
#include <Grid/algorithms/approx/Zolotarev.h>
|
#include <Grid/algorithms/approx/Zolotarev.h>
|
||||||
#include <Grid/algorithms/approx/Chebyshev.h>
|
#include <Grid/algorithms/approx/Chebyshev.h>
|
||||||
@ -43,12 +41,10 @@ NAMESPACE_CHECK(SparseMatrix);
|
|||||||
#include <Grid/algorithms/approx/Forecast.h>
|
#include <Grid/algorithms/approx/Forecast.h>
|
||||||
#include <Grid/algorithms/approx/RemezGeneral.h>
|
#include <Grid/algorithms/approx/RemezGeneral.h>
|
||||||
#include <Grid/algorithms/approx/ZMobius.h>
|
#include <Grid/algorithms/approx/ZMobius.h>
|
||||||
NAMESPACE_CHECK(approx);
|
|
||||||
#include <Grid/algorithms/iterative/Deflation.h>
|
#include <Grid/algorithms/iterative/Deflation.h>
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
NAMESPACE_CHECK(ConjGrad);
|
|
||||||
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
||||||
NAMESPACE_CHECK(BiCGSTAB);
|
|
||||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
||||||
#include <Grid/algorithms/iterative/NormalEquations.h>
|
#include <Grid/algorithms/iterative/NormalEquations.h>
|
||||||
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
||||||
@ -66,9 +62,7 @@ NAMESPACE_CHECK(BiCGSTAB);
|
|||||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
#include <Grid/algorithms/iterative/PowerMethod.h>
|
#include <Grid/algorithms/iterative/PowerMethod.h>
|
||||||
|
|
||||||
NAMESPACE_CHECK(PowerMethod);
|
|
||||||
#include <Grid/algorithms/CoarsenedMatrix.h>
|
#include <Grid/algorithms/CoarsenedMatrix.h>
|
||||||
NAMESPACE_CHECK(CoarsendMatrix);
|
|
||||||
#include <Grid/algorithms/FFT.h>
|
#include <Grid/algorithms/FFT.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,3 +1,14 @@
|
|||||||
|
// blockZaxpy in blockPromote - 3s, 5%
|
||||||
|
// noncoalesced linalg in Preconditioner ~ 3s 5%
|
||||||
|
// Lanczos tuning or replace 10-20s ~ 25%, open ended
|
||||||
|
// setup tuning 5s ~ 8%
|
||||||
|
// -- e.g. ordermin, orderstep tunables.
|
||||||
|
// MdagM path without norm in LinOp code. few seconds
|
||||||
|
|
||||||
|
// Mdir calc blocking kernels
|
||||||
|
// Fuse kernels in blockMaskedInnerProduct
|
||||||
|
// preallocate Vectors in Cayley 5D ~ few percent few seconds
|
||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -80,7 +91,34 @@ public:
|
|||||||
}
|
}
|
||||||
directions [2*_d]=0;
|
directions [2*_d]=0;
|
||||||
displacements[2*_d]=0;
|
displacements[2*_d]=0;
|
||||||
|
|
||||||
|
//// report back
|
||||||
|
std::cout<<GridLogMessage<<"directions :";
|
||||||
|
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
|
||||||
|
std::cout<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"displacements :";
|
||||||
|
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
|
||||||
|
std::cout<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
// Original cleaner code
|
||||||
|
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
|
||||||
|
for(int d=0;d<dimension;d++){
|
||||||
|
directions[2*d ] = d;
|
||||||
|
directions[2*d+1] = d;
|
||||||
|
displacements[2*d ] = +1;
|
||||||
|
displacements[2*d+1] = -1;
|
||||||
|
}
|
||||||
|
directions [2*dimension]=0;
|
||||||
|
displacements[2*dimension]=0;
|
||||||
|
}
|
||||||
|
std::vector<int> GetDelta(int point) {
|
||||||
|
std::vector<int> delta(dimension,0);
|
||||||
|
delta[directions[point]] = displacements[point];
|
||||||
|
return delta;
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -111,7 +149,25 @@ public:
|
|||||||
CoarseScalar InnerProd(CoarseGrid);
|
CoarseScalar InnerProd(CoarseGrid);
|
||||||
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
|
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
|
||||||
blockOrthogonalise(InnerProd,subspace);
|
blockOrthogonalise(InnerProd,subspace);
|
||||||
|
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
|
||||||
|
// blockOrthogonalise(InnerProd,subspace);
|
||||||
|
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
|
||||||
|
// CheckOrthogonal();
|
||||||
}
|
}
|
||||||
|
void CheckOrthogonal(void){
|
||||||
|
CoarseVector iProj(CoarseGrid);
|
||||||
|
CoarseVector eProj(CoarseGrid);
|
||||||
|
for(int i=0;i<nbasis;i++){
|
||||||
|
blockProject(iProj,subspace[i],subspace);
|
||||||
|
eProj=Zero();
|
||||||
|
accelerator_for(ss, CoarseGrid->oSites(),1,{
|
||||||
|
eProj[ss](i)=CComplex(1.0);
|
||||||
|
});
|
||||||
|
eProj=eProj - iProj;
|
||||||
|
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
||||||
|
}
|
||||||
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
||||||
blockProject(CoarseVec,FineVec,subspace);
|
blockProject(CoarseVec,FineVec,subspace);
|
||||||
}
|
}
|
||||||
@ -119,6 +175,11 @@ public:
|
|||||||
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
||||||
blockPromote(CoarseVec,FineVec,subspace);
|
blockPromote(CoarseVec,FineVec,subspace);
|
||||||
}
|
}
|
||||||
|
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
||||||
|
for(int i=0;i<nbasis;i++){
|
||||||
|
random(RNG,subspace[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||||
|
|
||||||
@ -129,12 +190,12 @@ public:
|
|||||||
FineField Mn(FineGrid);
|
FineField Mn(FineGrid);
|
||||||
|
|
||||||
for(int b=0;b<nn;b++){
|
for(int b=0;b<nn;b++){
|
||||||
|
|
||||||
subspace[b] = Zero();
|
subspace[b] = Zero();
|
||||||
gaussian(RNG,noise);
|
gaussian(RNG,noise);
|
||||||
scale = std::pow(norm2(noise),-0.5);
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
noise=noise*scale;
|
noise=noise*scale;
|
||||||
|
|
||||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
for(int i=0;i<1;i++){
|
for(int i=0;i<1;i++){
|
||||||
@ -157,7 +218,7 @@ public:
|
|||||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||||
// and this is the best I found
|
// and this is the best I found
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#if 1
|
||||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
int nn,
|
int nn,
|
||||||
double hi,
|
double hi,
|
||||||
@ -219,10 +280,10 @@ public:
|
|||||||
|
|
||||||
hermop.HermOp(*Tn,y);
|
hermop.HermOp(*Tn,y);
|
||||||
|
|
||||||
autoView( y_v , y, AcceleratorWrite);
|
auto y_v = y.View();
|
||||||
autoView( Tn_v , (*Tn), AcceleratorWrite);
|
auto Tn_v = Tn->View();
|
||||||
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
|
auto Tnp_v = Tnp->View();
|
||||||
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
|
auto Tnm_v = Tnm->View();
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
||||||
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||||
@ -252,6 +313,201 @@ public:
|
|||||||
}
|
}
|
||||||
assert(b==nn);
|
assert(b==nn);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if 0
|
||||||
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
|
int nn,
|
||||||
|
double hi,
|
||||||
|
double lo,
|
||||||
|
int orderfilter,
|
||||||
|
int ordermin,
|
||||||
|
int orderstep,
|
||||||
|
double filterlo
|
||||||
|
) {
|
||||||
|
|
||||||
|
RealD scale;
|
||||||
|
|
||||||
|
FineField noise(FineGrid);
|
||||||
|
FineField Mn(FineGrid);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
FineField combined(FineGrid);
|
||||||
|
|
||||||
|
// New normalised noise
|
||||||
|
gaussian(RNG,noise);
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
// Initial matrix element
|
||||||
|
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
|
int b =0;
|
||||||
|
#define FILTERb(llo,hhi,oorder) \
|
||||||
|
{ \
|
||||||
|
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||||
|
Cheb(hermop,noise,Mn); \
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||||
|
subspace[b] = Mn; \
|
||||||
|
hermop.Op(Mn,tmp); \
|
||||||
|
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||||
|
b++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
|
||||||
|
|
||||||
|
RealD alpha=-0.8;
|
||||||
|
RealD beta =-0.8;
|
||||||
|
#define FILTER(llo,hhi,oorder) \
|
||||||
|
{ \
|
||||||
|
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||||
|
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
|
||||||
|
Cheb(hermop,noise,Mn); \
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||||
|
subspace[b] = Mn; \
|
||||||
|
hermop.Op(Mn,tmp); \
|
||||||
|
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||||
|
b++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FILTERc(llo,hhi,oorder) \
|
||||||
|
{ \
|
||||||
|
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||||
|
Cheb(hermop,noise,combined); \
|
||||||
|
}
|
||||||
|
|
||||||
|
double node = 0.000;
|
||||||
|
FILTERb(lo,hi,orderfilter);// 0
|
||||||
|
// FILTERc(node,hi,51);// 0
|
||||||
|
noise = Mn;
|
||||||
|
int base = 0;
|
||||||
|
int mult = 100;
|
||||||
|
FILTER(node,hi,base+1*mult);
|
||||||
|
FILTER(node,hi,base+2*mult);
|
||||||
|
FILTER(node,hi,base+3*mult);
|
||||||
|
FILTER(node,hi,base+4*mult);
|
||||||
|
FILTER(node,hi,base+5*mult);
|
||||||
|
FILTER(node,hi,base+6*mult);
|
||||||
|
FILTER(node,hi,base+7*mult);
|
||||||
|
FILTER(node,hi,base+8*mult);
|
||||||
|
FILTER(node,hi,base+9*mult);
|
||||||
|
FILTER(node,hi,base+10*mult);
|
||||||
|
FILTER(node,hi,base+11*mult);
|
||||||
|
FILTER(node,hi,base+12*mult);
|
||||||
|
FILTER(node,hi,base+13*mult);
|
||||||
|
FILTER(node,hi,base+14*mult);
|
||||||
|
FILTER(node,hi,base+15*mult);
|
||||||
|
assert(b==nn);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
|
int nn,
|
||||||
|
double hi,
|
||||||
|
double lo,
|
||||||
|
int orderfilter,
|
||||||
|
int ordermin,
|
||||||
|
int orderstep,
|
||||||
|
double filterlo
|
||||||
|
) {
|
||||||
|
|
||||||
|
RealD scale;
|
||||||
|
|
||||||
|
FineField noise(FineGrid);
|
||||||
|
FineField Mn(FineGrid);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
FineField combined(FineGrid);
|
||||||
|
|
||||||
|
// New normalised noise
|
||||||
|
gaussian(RNG,noise);
|
||||||
|
scale = std::pow(norm2(noise),-0.5);
|
||||||
|
noise=noise*scale;
|
||||||
|
|
||||||
|
// Initial matrix element
|
||||||
|
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
|
int b =0;
|
||||||
|
{
|
||||||
|
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
|
||||||
|
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
|
||||||
|
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
|
||||||
|
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
|
||||||
|
JacobiPoly(hermop,noise,Mn);
|
||||||
|
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||||
|
subspace[b] = Mn;
|
||||||
|
hermop.Op(Mn,tmp);
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||||
|
b++;
|
||||||
|
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||||
|
// subspace[b] = tmp; b++;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FILTER(lambda) \
|
||||||
|
{ \
|
||||||
|
hermop.HermOp(subspace[0],tmp); \
|
||||||
|
tmp = tmp - lambda *subspace[0]; \
|
||||||
|
scale = std::pow(norm2(tmp),-0.5); \
|
||||||
|
tmp=tmp*scale; \
|
||||||
|
subspace[b] = tmp; \
|
||||||
|
hermop.Op(subspace[b],tmp); \
|
||||||
|
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||||
|
b++; \
|
||||||
|
}
|
||||||
|
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||||
|
// subspace[b] = tmp; b++;
|
||||||
|
// }
|
||||||
|
|
||||||
|
FILTER(2.0e-5);
|
||||||
|
FILTER(2.0e-4);
|
||||||
|
FILTER(4.0e-4);
|
||||||
|
FILTER(8.0e-4);
|
||||||
|
FILTER(8.0e-4);
|
||||||
|
|
||||||
|
FILTER(2.0e-3);
|
||||||
|
FILTER(3.0e-3);
|
||||||
|
FILTER(4.0e-3);
|
||||||
|
FILTER(5.0e-3);
|
||||||
|
FILTER(6.0e-3);
|
||||||
|
|
||||||
|
FILTER(2.5e-3);
|
||||||
|
FILTER(3.5e-3);
|
||||||
|
FILTER(4.5e-3);
|
||||||
|
FILTER(5.5e-3);
|
||||||
|
FILTER(6.5e-3);
|
||||||
|
|
||||||
|
// FILTER(6.0e-5);//6
|
||||||
|
// FILTER(7.0e-5);//8
|
||||||
|
// FILTER(8.0e-5);//9
|
||||||
|
// FILTER(9.0e-5);//3
|
||||||
|
|
||||||
|
/*
|
||||||
|
// FILTER(1.0e-4);//10
|
||||||
|
FILTER(2.0e-4);//11
|
||||||
|
// FILTER(3.0e-4);//12
|
||||||
|
// FILTER(4.0e-4);//13
|
||||||
|
FILTER(5.0e-4);//14
|
||||||
|
|
||||||
|
FILTER(6.0e-3);//4
|
||||||
|
FILTER(7.0e-4);//1
|
||||||
|
FILTER(8.0e-4);//7
|
||||||
|
FILTER(9.0e-4);//15
|
||||||
|
FILTER(1.0e-3);//2
|
||||||
|
|
||||||
|
FILTER(2.0e-3);//2
|
||||||
|
FILTER(3.0e-3);//2
|
||||||
|
FILTER(4.0e-3);//2
|
||||||
|
FILTER(5.0e-3);//2
|
||||||
|
FILTER(6.0e-3);//2
|
||||||
|
|
||||||
|
FILTER(7.0e-3);//2
|
||||||
|
FILTER(8.0e-3);//2
|
||||||
|
FILTER(1.0e-2);//2
|
||||||
|
*/
|
||||||
|
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
|
||||||
|
assert(b==nn);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -279,7 +535,7 @@ public:
|
|||||||
CartesianStencil<siteVector,siteVector,int> Stencil;
|
CartesianStencil<siteVector,siteVector,int> Stencil;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
///////////////////////
|
///////////////////////
|
||||||
@ -293,13 +549,13 @@ public:
|
|||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@ -316,25 +572,24 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
int lane=SIMTlane(Nsimd);
|
||||||
for(int point=0;point<geom.npoint;point++){
|
for(int point=0;point<geom.npoint;point++){
|
||||||
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
synchronise();
|
||||||
|
|
||||||
for(int bb=0;bb<nbasis;bb++) {
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res,lane);
|
||||||
});
|
});
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void Mdag (const CoarseVector &in, CoarseVector &out)
|
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||||
@ -362,11 +617,11 @@ public:
|
|||||||
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
auto out_v = out.View();
|
||||||
autoView( in_v , in, AcceleratorRead);
|
auto in_v = in.View();
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
@ -380,21 +635,45 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
int lane=SIMTlane(Nsimd);
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
synchronise();
|
||||||
|
|
||||||
for(int bb=0;bb<nbasis;bb++) {
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res,lane);
|
||||||
});
|
});
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
#if 0
|
||||||
|
accelerator_for(ss,Grid()->oSites(),1,{
|
||||||
|
|
||||||
|
siteVector res = Zero();
|
||||||
|
siteVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
SE=Stencil.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local&&SE->_permute) {
|
||||||
|
permute(nbr,in_v[SE->_offset],ptype);
|
||||||
|
} else if(SE->_is_local) {
|
||||||
|
nbr = in_v[SE->_offset];
|
||||||
|
} else {
|
||||||
|
nbr = Stencil.CommBuf()[SE->_offset];
|
||||||
|
}
|
||||||
|
synchronise();
|
||||||
|
|
||||||
|
res = res + Aview_p[point][ss]*nbr;
|
||||||
|
|
||||||
|
out_v[ss]=res;
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||||
{
|
{
|
||||||
@ -562,10 +841,10 @@ public:
|
|||||||
|
|
||||||
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
||||||
|
|
||||||
autoView( iZProj_v , iZProj, AcceleratorRead) ;
|
auto iZProj_v = iZProj.View() ;
|
||||||
autoView( oZProj_v , oZProj, AcceleratorRead) ;
|
auto oZProj_v = oZProj.View() ;
|
||||||
autoView( A_p , A[p], AcceleratorWrite);
|
auto A_p = A[p].View();
|
||||||
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
auto A_self = A[self_stencil].View();
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||||
|
|
||||||
@ -581,11 +860,11 @@ public:
|
|||||||
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
||||||
|
|
||||||
{
|
{
|
||||||
autoView( tmp_ , tmp, AcceleratorWrite);
|
auto tmp_ = tmp.View();
|
||||||
autoView( evenmask_ , evenmask, AcceleratorRead);
|
auto evenmask_ = evenmask.View();
|
||||||
autoView( oddmask_ , oddmask, AcceleratorRead);
|
auto oddmask_ = oddmask.View();
|
||||||
autoView( Mphie_ , Mphie, AcceleratorRead);
|
auto Mphie_ = Mphie.View();
|
||||||
autoView( Mphio_ , Mphio, AcceleratorRead);
|
auto Mphio_ = Mphio.View();
|
||||||
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
||||||
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
||||||
});
|
});
|
||||||
@ -593,8 +872,8 @@ public:
|
|||||||
|
|
||||||
blockProject(SelfProj,tmp,Subspace.subspace);
|
blockProject(SelfProj,tmp,Subspace.subspace);
|
||||||
|
|
||||||
autoView( SelfProj_ , SelfProj, AcceleratorRead);
|
auto SelfProj_ = SelfProj.View();
|
||||||
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
auto A_self = A[self_stencil].View();
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
||||||
for(int j=0;j<nbasis;j++){
|
for(int j=0;j<nbasis;j++){
|
||||||
@ -608,8 +887,33 @@ public:
|
|||||||
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
||||||
ForceHermitian();
|
ForceHermitian();
|
||||||
}
|
}
|
||||||
|
// AssertHermitian();
|
||||||
|
// ForceDiagonal();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
///////////////////////////
|
||||||
|
// test code worth preserving in if block
|
||||||
|
///////////////////////////
|
||||||
|
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
|
||||||
|
for(int p=0;p<geom.npoint;p++){
|
||||||
|
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
|
||||||
|
std::cout<<GridLogMessage<< A[p] << std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
|
||||||
|
|
||||||
|
phi=Subspace.subspace[0];
|
||||||
|
std::vector<int> bc(FineGrid->_ndimension,0);
|
||||||
|
|
||||||
|
blockPick(Grid(),phi,tmp,bc); // Pick out a block
|
||||||
|
linop.Op(tmp,Mphi); // Apply big dop
|
||||||
|
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
|
||||||
|
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<< iProj <<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
void ForceHermitian(void) {
|
void ForceHermitian(void) {
|
||||||
CoarseMatrix Diff (Grid());
|
CoarseMatrix Diff (Grid());
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++){
|
||||||
@ -629,6 +933,27 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
void AssertHermitian(void) {
|
||||||
|
CoarseMatrix AA (Grid());
|
||||||
|
CoarseMatrix AAc (Grid());
|
||||||
|
CoarseMatrix Diff (Grid());
|
||||||
|
for(int d=0;d<4;d++){
|
||||||
|
|
||||||
|
int dd=d+1;
|
||||||
|
AAc = Cshift(A[2*d+1],dd,1);
|
||||||
|
AA = A[2*d];
|
||||||
|
|
||||||
|
Diff = AA - adj(AAc);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
Diff = A[8] - adj(A[8]);
|
||||||
|
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -36,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class scalar> struct FFTW { };
|
template<class scalar> struct FFTW { };
|
||||||
@ -189,7 +191,7 @@ public:
|
|||||||
typedef typename sobj::scalar_type scalar;
|
typedef typename sobj::scalar_type scalar;
|
||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
auto pgbuf_v = pgbuf.View();
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
@ -230,18 +232,15 @@ public:
|
|||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
for(int p=0;p<processors[dim];p++) {
|
for(int p=0;p<processors[dim];p++) {
|
||||||
{
|
thread_for(idx, sgrid->lSites(),{
|
||||||
autoView(r_v,result,CpuRead);
|
|
||||||
autoView(p_v,pgbuf,CpuWrite);
|
|
||||||
thread_for(idx, sgrid->lSites(),{
|
|
||||||
Coordinate cbuf(Nd);
|
Coordinate cbuf(Nd);
|
||||||
sobj s;
|
sobj s;
|
||||||
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
||||||
peekLocalSite(s,r_v,cbuf);
|
peekLocalSite(s,result,cbuf);
|
||||||
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
||||||
pokeLocalSite(s,p_v,cbuf);
|
// cbuf[dim]+=p*L;
|
||||||
});
|
pokeLocalSite(s,pgbuf,cbuf);
|
||||||
}
|
});
|
||||||
if (p != processors[dim] - 1) {
|
if (p != processors[dim] - 1) {
|
||||||
result = Cshift(result,dim,L);
|
result = Cshift(result,dim,L);
|
||||||
}
|
}
|
||||||
@ -270,19 +269,15 @@ public:
|
|||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
// writing out result
|
// writing out result
|
||||||
{
|
thread_for(idx,sgrid->lSites(),{
|
||||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
|
||||||
autoView(result_v,result,CpuWrite);
|
|
||||||
thread_for(idx,sgrid->lSites(),{
|
|
||||||
Coordinate clbuf(Nd), cgbuf(Nd);
|
Coordinate clbuf(Nd), cgbuf(Nd);
|
||||||
sobj s;
|
sobj s;
|
||||||
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
||||||
cgbuf = clbuf;
|
cgbuf = clbuf;
|
||||||
cgbuf[dim] = clbuf[dim]+L*pc;
|
cgbuf[dim] = clbuf[dim]+L*pc;
|
||||||
peekLocalSite(s,pgbuf_v,cgbuf);
|
peekLocalSite(s,pgbuf,cgbuf);
|
||||||
pokeLocalSite(s,result_v,clbuf);
|
pokeLocalSite(s,result,clbuf);
|
||||||
});
|
});
|
||||||
}
|
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
// destroying plan
|
// destroying plan
|
||||||
|
@ -122,14 +122,12 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
bo = beta * omega;
|
bo = beta * omega;
|
||||||
{
|
auto p_v = p.View();
|
||||||
autoView( p_v , p, AcceleratorWrite);
|
auto r_v = r.View();
|
||||||
autoView( r_v , r, AcceleratorRead);
|
auto v_v = v.View();
|
||||||
autoView( v_v , v, AcceleratorRead);
|
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
||||||
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
||||||
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
});
|
||||||
});
|
|
||||||
}
|
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
@ -144,20 +142,16 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
alpha = rho / Calpha.real();
|
alpha = rho / Calpha.real();
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
{
|
auto h_v = h.View();
|
||||||
autoView( p_v , p, AcceleratorRead);
|
auto psi_v = psi.View();
|
||||||
autoView( r_v , r, AcceleratorRead);
|
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
||||||
autoView( v_v , v, AcceleratorRead);
|
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
||||||
autoView( psi_v,psi, AcceleratorRead);
|
});
|
||||||
autoView( h_v , h, AcceleratorWrite);
|
|
||||||
autoView( s_v , s, AcceleratorWrite);
|
auto s_v = s.View();
|
||||||
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
||||||
});
|
});
|
||||||
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
|
||||||
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
@ -172,19 +166,13 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
omega = Comega.real() / norm2(t);
|
omega = Comega.real() / norm2(t);
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
{
|
auto t_v = t.View();
|
||||||
autoView( psi_v,psi, AcceleratorWrite);
|
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
||||||
autoView( r_v , r, AcceleratorWrite);
|
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
||||||
autoView( h_v , h, AcceleratorRead);
|
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
||||||
autoView( s_v , s, AcceleratorRead);
|
});
|
||||||
autoView( t_v , t, AcceleratorRead);
|
|
||||||
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
|
||||||
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
|
||||||
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
|
|
||||||
cp = norm2(r);
|
cp = norm2(r);
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
@ -140,15 +140,13 @@ public:
|
|||||||
b = cp / c;
|
b = cp / c;
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
{
|
auto psi_v = psi.View();
|
||||||
autoView( psi_v , psi, AcceleratorWrite);
|
auto p_v = p.View();
|
||||||
autoView( p_v , p, AcceleratorWrite);
|
auto r_v = r.View();
|
||||||
autoView( r_v , r, AcceleratorWrite);
|
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
||||||
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
||||||
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
||||||
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
});
|
||||||
});
|
|
||||||
}
|
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
@ -1,241 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_PREC_GCR_NON_HERM_H
|
|
||||||
#define GRID_PREC_GCR_NON_HERM_H
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
//VPGCR Abe and Zhang, 2005.
|
|
||||||
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
|
|
||||||
//Computing and Information Volume 2, Number 2, Pages 147-161
|
|
||||||
//NB. Likely not original reference since they are focussing on a preconditioner variant.
|
|
||||||
// but VPGCR was nicely written up in their paper
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
|
|
||||||
public:
|
|
||||||
|
|
||||||
RealD Tolerance;
|
|
||||||
Integer MaxIterations;
|
|
||||||
int verbose;
|
|
||||||
int mmax;
|
|
||||||
int nstep;
|
|
||||||
int steps;
|
|
||||||
int level;
|
|
||||||
GridStopWatch PrecTimer;
|
|
||||||
GridStopWatch MatTimer;
|
|
||||||
GridStopWatch LinalgTimer;
|
|
||||||
|
|
||||||
LinearFunction<Field> &Preconditioner;
|
|
||||||
LinearOperatorBase<Field> &Linop;
|
|
||||||
|
|
||||||
void Level(int lv) { level=lv; };
|
|
||||||
|
|
||||||
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
|
|
||||||
Tolerance(tol),
|
|
||||||
MaxIterations(maxit),
|
|
||||||
Linop(_Linop),
|
|
||||||
Preconditioner(Prec),
|
|
||||||
mmax(_mmax),
|
|
||||||
nstep(_nstep)
|
|
||||||
{
|
|
||||||
level=1;
|
|
||||||
verbose=1;
|
|
||||||
};
|
|
||||||
|
|
||||||
void operator() (const Field &src, Field &psi){
|
|
||||||
|
|
||||||
psi=Zero();
|
|
||||||
RealD cp, ssq,rsq;
|
|
||||||
ssq=norm2(src);
|
|
||||||
rsq=Tolerance*Tolerance*ssq;
|
|
||||||
|
|
||||||
Field r(src.Grid());
|
|
||||||
|
|
||||||
PrecTimer.Reset();
|
|
||||||
MatTimer.Reset();
|
|
||||||
LinalgTimer.Reset();
|
|
||||||
|
|
||||||
GridStopWatch SolverTimer;
|
|
||||||
SolverTimer.Start();
|
|
||||||
|
|
||||||
steps=0;
|
|
||||||
for(int k=0;k<MaxIterations;k++){
|
|
||||||
|
|
||||||
cp=GCRnStep(src,psi,rsq);
|
|
||||||
|
|
||||||
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
|
|
||||||
|
|
||||||
if(cp<rsq) {
|
|
||||||
|
|
||||||
SolverTimer.Stop();
|
|
||||||
|
|
||||||
Linop.Op(psi,r);
|
|
||||||
axpy(r,-1.0,src,r);
|
|
||||||
RealD tr = norm2(r);
|
|
||||||
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
|
|
||||||
<< " computed residual "<<sqrt(cp/ssq)
|
|
||||||
<< " true residual " <<sqrt(tr/ssq)
|
|
||||||
<< " target " <<Tolerance <<std::endl;
|
|
||||||
|
|
||||||
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
|
|
||||||
// assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
|
|
||||||
|
|
||||||
RealD cp;
|
|
||||||
ComplexD a, b, zAz;
|
|
||||||
RealD zAAz;
|
|
||||||
ComplexD rq;
|
|
||||||
|
|
||||||
GridBase *grid = src.Grid();
|
|
||||||
|
|
||||||
Field r(grid);
|
|
||||||
Field z(grid);
|
|
||||||
Field tmp(grid);
|
|
||||||
Field ttmp(grid);
|
|
||||||
Field Az(grid);
|
|
||||||
|
|
||||||
////////////////////////////////
|
|
||||||
// history for flexible orthog
|
|
||||||
////////////////////////////////
|
|
||||||
std::vector<Field> q(mmax,grid);
|
|
||||||
std::vector<Field> p(mmax,grid);
|
|
||||||
std::vector<RealD> qq(mmax);
|
|
||||||
|
|
||||||
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
|
|
||||||
|
|
||||||
//////////////////////////////////
|
|
||||||
// initial guess x0 is taken as nonzero.
|
|
||||||
// r0=src-A x0 = src
|
|
||||||
//////////////////////////////////
|
|
||||||
MatTimer.Start();
|
|
||||||
Linop.Op(psi,Az);
|
|
||||||
zAz = innerProduct(Az,psi);
|
|
||||||
zAAz= norm2(Az);
|
|
||||||
MatTimer.Stop();
|
|
||||||
|
|
||||||
|
|
||||||
LinalgTimer.Start();
|
|
||||||
r=src-Az;
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
|
|
||||||
|
|
||||||
/////////////////////
|
|
||||||
// p = Prec(r)
|
|
||||||
/////////////////////
|
|
||||||
|
|
||||||
PrecTimer.Start();
|
|
||||||
Preconditioner(r,z);
|
|
||||||
PrecTimer.Stop();
|
|
||||||
|
|
||||||
MatTimer.Start();
|
|
||||||
Linop.Op(z,Az);
|
|
||||||
MatTimer.Stop();
|
|
||||||
|
|
||||||
LinalgTimer.Start();
|
|
||||||
|
|
||||||
zAz = innerProduct(Az,psi);
|
|
||||||
zAAz= norm2(Az);
|
|
||||||
|
|
||||||
//p[0],q[0],qq[0]
|
|
||||||
p[0]= z;
|
|
||||||
q[0]= Az;
|
|
||||||
qq[0]= zAAz;
|
|
||||||
|
|
||||||
cp =norm2(r);
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
|
|
||||||
for(int k=0;k<nstep;k++){
|
|
||||||
|
|
||||||
steps++;
|
|
||||||
|
|
||||||
int kp = k+1;
|
|
||||||
int peri_k = k %mmax;
|
|
||||||
int peri_kp= kp%mmax;
|
|
||||||
|
|
||||||
LinalgTimer.Start();
|
|
||||||
rq= innerProduct(q[peri_k],r); // what if rAr not real?
|
|
||||||
a = rq/qq[peri_k];
|
|
||||||
|
|
||||||
axpy(psi,a,p[peri_k],psi);
|
|
||||||
|
|
||||||
cp = axpy_norm(r,-a,q[peri_k],r);
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
|
|
||||||
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
|
|
||||||
|
|
||||||
if((k==nstep-1)||(cp<rsq)){
|
|
||||||
return cp;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
PrecTimer.Start();
|
|
||||||
Preconditioner(r,z);// solve Az = r
|
|
||||||
PrecTimer.Stop();
|
|
||||||
|
|
||||||
MatTimer.Start();
|
|
||||||
Linop.Op(z,Az);
|
|
||||||
MatTimer.Stop();
|
|
||||||
zAz = innerProduct(Az,psi);
|
|
||||||
zAAz= norm2(Az);
|
|
||||||
|
|
||||||
LinalgTimer.Start();
|
|
||||||
|
|
||||||
q[peri_kp]=Az;
|
|
||||||
p[peri_kp]=z;
|
|
||||||
|
|
||||||
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
|
|
||||||
for(int back=0;back<northog;back++){
|
|
||||||
|
|
||||||
int peri_back=(k-back)%mmax; assert((k-back)>=0);
|
|
||||||
|
|
||||||
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
|
|
||||||
p[peri_kp]=p[peri_kp]+b*p[peri_back];
|
|
||||||
q[peri_kp]=q[peri_kp]+b*q[peri_back];
|
|
||||||
|
|
||||||
}
|
|
||||||
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
}
|
|
||||||
assert(0); // never reached
|
|
||||||
return cp;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
#endif
|
|
@ -6,6 +6,93 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||||
bool MemoryProfiler::debug = false;
|
bool MemoryProfiler::debug = false;
|
||||||
|
|
||||||
|
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
int PointerCache::Ncache = 32;
|
||||||
|
#else
|
||||||
|
int PointerCache::Ncache = 8;
|
||||||
|
#endif
|
||||||
|
int PointerCache::Victim;
|
||||||
|
int PointerCache::VictimSmall;
|
||||||
|
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
|
||||||
|
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
|
||||||
|
|
||||||
|
void PointerCache::Init(void)
|
||||||
|
{
|
||||||
|
char * str;
|
||||||
|
|
||||||
|
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
||||||
|
if ( str ) Ncache = atoi(str);
|
||||||
|
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
|
||||||
|
|
||||||
|
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||||
|
if ( str ) NcacheSmall = atoi(str);
|
||||||
|
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
|
||||||
|
|
||||||
|
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
|
||||||
|
}
|
||||||
|
void *PointerCache::Insert(void *ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
if (bytes < GRID_ALLOC_SMALL_LIMIT )
|
||||||
|
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
|
||||||
|
return Insert(ptr,bytes,Entries,Ncache,Victim);
|
||||||
|
}
|
||||||
|
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
assert(omp_in_parallel()==0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void * ret = NULL;
|
||||||
|
int v = -1;
|
||||||
|
|
||||||
|
for(int e=0;e<ncache;e++) {
|
||||||
|
if ( entries[e].valid==0 ) {
|
||||||
|
v=e;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( v==-1 ) {
|
||||||
|
v=victim;
|
||||||
|
victim = (victim+1)%ncache;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( entries[v].valid ) {
|
||||||
|
ret = entries[v].address;
|
||||||
|
entries[v].valid = 0;
|
||||||
|
entries[v].address = NULL;
|
||||||
|
entries[v].bytes = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
entries[v].address=ptr;
|
||||||
|
entries[v].bytes =bytes;
|
||||||
|
entries[v].valid =1;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *PointerCache::Lookup(size_t bytes)
|
||||||
|
{
|
||||||
|
if (bytes < GRID_ALLOC_SMALL_LIMIT )
|
||||||
|
return Lookup(bytes,EntriesSmall,NcacheSmall);
|
||||||
|
return Lookup(bytes,Entries,Ncache);
|
||||||
|
}
|
||||||
|
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
assert(omp_in_parallel()==0);
|
||||||
|
#endif
|
||||||
|
for(int e=0;e<ncache;e++){
|
||||||
|
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||||
|
entries[e].valid = 0;
|
||||||
|
return entries[e].address;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
|
@ -26,10 +26,129 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#pragma once
|
#ifndef GRID_ALIGNED_ALLOCATOR_H
|
||||||
|
#define GRID_ALIGNED_ALLOCATOR_H
|
||||||
|
|
||||||
|
#ifdef HAVE_MALLOC_MALLOC_H
|
||||||
|
#include <malloc/malloc.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_MALLOC_H
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
|
#include <mm_malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define POINTER_CACHE
|
||||||
|
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||||
|
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
// Move control to configure.ac and Config.h?
|
||||||
|
|
||||||
|
class PointerCache {
|
||||||
|
private:
|
||||||
|
/*Pinning pages is costly*/
|
||||||
|
/*Could maintain separate large and small allocation caches*/
|
||||||
|
/* Could make these configurable, perhaps up to a max size*/
|
||||||
|
static const int NcacheSmallMax=128;
|
||||||
|
static const int NcacheMax=16;
|
||||||
|
static int NcacheSmall;
|
||||||
|
static int Ncache;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
void *address;
|
||||||
|
size_t bytes;
|
||||||
|
int valid;
|
||||||
|
} PointerCacheEntry;
|
||||||
|
|
||||||
|
static PointerCacheEntry Entries[NcacheMax];
|
||||||
|
static int Victim;
|
||||||
|
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
|
||||||
|
static int VictimSmall;
|
||||||
|
|
||||||
|
public:
|
||||||
|
static void Init(void);
|
||||||
|
static void *Insert(void *ptr,size_t bytes) ;
|
||||||
|
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
|
||||||
|
static void *Lookup(size_t bytes) ;
|
||||||
|
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string sizeString(size_t bytes);
|
||||||
|
|
||||||
|
struct MemoryStats
|
||||||
|
{
|
||||||
|
size_t totalAllocated{0}, maxAllocated{0},
|
||||||
|
currentlyAllocated{0}, totalFreed{0};
|
||||||
|
};
|
||||||
|
|
||||||
|
class MemoryProfiler
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static MemoryStats *stats;
|
||||||
|
static bool debug;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
#define profilerCudaMeminfo \
|
||||||
|
{ size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
|
||||||
|
#else
|
||||||
|
#define profilerCudaMeminfo
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
|
||||||
|
#define profilerDebugPrint \
|
||||||
|
if (MemoryProfiler::stats) \
|
||||||
|
{ \
|
||||||
|
auto s = MemoryProfiler::stats; \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
|
||||||
|
<< std::endl; \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
|
||||||
|
<< std::endl; \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
|
||||||
|
<< std::endl; \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
|
||||||
|
<< std::endl; \
|
||||||
|
} \
|
||||||
|
profilerCudaMeminfo;
|
||||||
|
|
||||||
|
#define profilerAllocate(bytes) \
|
||||||
|
if (MemoryProfiler::stats) \
|
||||||
|
{ \
|
||||||
|
auto s = MemoryProfiler::stats; \
|
||||||
|
s->totalAllocated += (bytes); \
|
||||||
|
s->currentlyAllocated += (bytes); \
|
||||||
|
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
|
||||||
|
} \
|
||||||
|
if (MemoryProfiler::debug) \
|
||||||
|
{ \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
|
||||||
|
profilerDebugPrint; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define profilerFree(bytes) \
|
||||||
|
if (MemoryProfiler::stats) \
|
||||||
|
{ \
|
||||||
|
auto s = MemoryProfiler::stats; \
|
||||||
|
s->totalFreed += (bytes); \
|
||||||
|
s->currentlyAllocated -= (bytes); \
|
||||||
|
} \
|
||||||
|
if (MemoryProfiler::debug) \
|
||||||
|
{ \
|
||||||
|
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
|
||||||
|
profilerDebugPrint; \
|
||||||
|
}
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
// A lattice of something, but assume the something is SIMDized.
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<typename _Tp>
|
template<typename _Tp>
|
||||||
class alignedAllocator {
|
class alignedAllocator {
|
||||||
public:
|
public:
|
||||||
@ -53,122 +172,89 @@ public:
|
|||||||
{
|
{
|
||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
profilerAllocate(bytes);
|
profilerAllocate(bytes);
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
|
||||||
|
#ifdef POINTER_CACHE
|
||||||
|
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||||
|
#else
|
||||||
|
pointer ptr = nullptr;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
////////////////////////////////////
|
||||||
|
// Unified (managed) memory
|
||||||
|
////////////////////////////////////
|
||||||
|
if ( ptr == (_Tp *) NULL ) {
|
||||||
|
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
|
||||||
|
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||||
|
if( err != cudaSuccess ) {
|
||||||
|
ptr = (_Tp *) NULL;
|
||||||
|
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert( ptr != (_Tp *)NULL);
|
||||||
|
#else
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// 2MB align; could make option probably doesn't need configurability
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
||||||
|
#else
|
||||||
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
||||||
|
#endif
|
||||||
|
assert( ptr != (_Tp *)NULL);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// First touch optimise in threaded loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
uint64_t *cp = (uint64_t *)ptr;
|
||||||
|
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
|
||||||
|
cp[n]=0;
|
||||||
|
});
|
||||||
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void deallocate(pointer __p, size_type __n)
|
void deallocate(pointer __p, size_type __n) {
|
||||||
{
|
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
|
|
||||||
profilerFree(bytes);
|
profilerFree(bytes);
|
||||||
MemoryManager::CpuFree((void *)__p,bytes);
|
|
||||||
|
#ifdef POINTER_CACHE
|
||||||
|
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
||||||
|
#else
|
||||||
|
pointer __freeme = __p;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
if ( __freeme ) cudaFree((void *)__freeme);
|
||||||
|
#else
|
||||||
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
|
if ( __freeme ) _mm_free((void *)__freeme);
|
||||||
|
#else
|
||||||
|
if ( __freeme ) free((void *)__freeme);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
// FIXME: hack for the copy constructor, eventually it must be avoided
|
||||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||||
|
//void construct(pointer __p, const _Tp& __val) { };
|
||||||
void construct(pointer __p) { };
|
void construct(pointer __p) { };
|
||||||
void destroy(pointer __p) { };
|
void destroy(pointer __p) { };
|
||||||
};
|
};
|
||||||
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
||||||
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Unified virtual memory
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<typename _Tp>
|
|
||||||
class uvmAllocator {
|
|
||||||
public:
|
|
||||||
typedef std::size_t size_type;
|
|
||||||
typedef std::ptrdiff_t difference_type;
|
|
||||||
typedef _Tp* pointer;
|
|
||||||
typedef const _Tp* const_pointer;
|
|
||||||
typedef _Tp& reference;
|
|
||||||
typedef const _Tp& const_reference;
|
|
||||||
typedef _Tp value_type;
|
|
||||||
|
|
||||||
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
|
|
||||||
uvmAllocator() throw() { }
|
|
||||||
uvmAllocator(const uvmAllocator&) throw() { }
|
|
||||||
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
|
|
||||||
~uvmAllocator() throw() { }
|
|
||||||
pointer address(reference __x) const { return &__x; }
|
|
||||||
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
|
||||||
|
|
||||||
pointer allocate(size_type __n, const void* _p= 0)
|
|
||||||
{
|
|
||||||
size_type bytes = __n*sizeof(_Tp);
|
|
||||||
profilerAllocate(bytes);
|
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
|
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void deallocate(pointer __p, size_type __n)
|
|
||||||
{
|
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
|
||||||
profilerFree(bytes);
|
|
||||||
MemoryManager::SharedFree((void *)__p,bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
|
||||||
void construct(pointer __p) { };
|
|
||||||
void destroy(pointer __p) { };
|
|
||||||
};
|
|
||||||
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
|
|
||||||
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Device memory
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<typename _Tp>
|
|
||||||
class devAllocator {
|
|
||||||
public:
|
|
||||||
typedef std::size_t size_type;
|
|
||||||
typedef std::ptrdiff_t difference_type;
|
|
||||||
typedef _Tp* pointer;
|
|
||||||
typedef const _Tp* const_pointer;
|
|
||||||
typedef _Tp& reference;
|
|
||||||
typedef const _Tp& const_reference;
|
|
||||||
typedef _Tp value_type;
|
|
||||||
|
|
||||||
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
|
|
||||||
devAllocator() throw() { }
|
|
||||||
devAllocator(const devAllocator&) throw() { }
|
|
||||||
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
|
|
||||||
~devAllocator() throw() { }
|
|
||||||
pointer address(reference __x) const { return &__x; }
|
|
||||||
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
|
||||||
|
|
||||||
pointer allocate(size_type __n, const void* _p= 0)
|
|
||||||
{
|
|
||||||
size_type bytes = __n*sizeof(_Tp);
|
|
||||||
profilerAllocate(bytes);
|
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
|
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void deallocate(pointer __p, size_type __n)
|
|
||||||
{
|
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
|
||||||
profilerFree(bytes);
|
|
||||||
MemoryManager::AcceleratorFree((void *)__p,bytes);
|
|
||||||
}
|
|
||||||
void construct(pointer __p, const _Tp& __val) { };
|
|
||||||
void construct(pointer __p) { };
|
|
||||||
void destroy(pointer __p) { };
|
|
||||||
};
|
|
||||||
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
|
|
||||||
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//template<class T> using commAllocator = devAllocator<T>;
|
template<class T> using commAllocator = alignedAllocator<T>;
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
|
||||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
|
||||||
|
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@ -1,4 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
#include <Grid/allocator/MemoryStats.h>
|
|
||||||
#include <Grid/allocator/MemoryManager.h>
|
|
||||||
#include <Grid/allocator/AlignedAllocator.h>
|
|
@ -1,254 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
/*Allocation types, saying which pointer cache should be used*/
|
|
||||||
// Pool indices into Entries[] / Ncache[] / Victim[].  Each memory kind has a
// "large" pool at an even index, immediately followed by its "small" pool.
#define Cpu         (0)
#define CpuSmall    (1)
#define Acc         (2)
#define AccSmall    (3)
#define Shared      (4)
#define SharedSmall (5)

// Running byte totals per memory kind, reported by PrintBytes().
uint64_t total_shared = 0;
uint64_t total_device = 0;
uint64_t total_host   = 0;
|
|
||||||
void MemoryManager::PrintBytes(void)
|
|
||||||
{
|
|
||||||
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
|
|
||||||
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
|
|
||||||
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Data tables for the caches of recently freed pointers
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// One row of cached, recently freed pointers per pool (NallocType rows of NallocCacheMax slots).
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
|
||||||
// Per-pool index of the slot Insert() overwrites next when every slot in use is valid.
int MemoryManager::Victim[MemoryManager::NallocType];
|
|
||||||
// Per-pool number of slots actually used. Indexed by the pool macros: the large
// pools (Cpu, Acc, Shared) default to 8, the small pools (CpuSmall, AccSmall,
// SharedSmall) to 32. Init() may override these from the environment.
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Actual allocation and deallocation utils
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
|
||||||
{
|
|
||||||
void *ptr = (void *) Lookup(bytes,Acc);
|
|
||||||
if ( ptr == (void *) NULL ) {
|
|
||||||
ptr = (void *) acceleratorAllocDevice(bytes);
|
|
||||||
total_device+=bytes;
|
|
||||||
}
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
|
|
||||||
{
|
|
||||||
void *__freeme = Insert(ptr,bytes,Acc);
|
|
||||||
if ( __freeme ) {
|
|
||||||
acceleratorFreeDevice(__freeme);
|
|
||||||
total_device-=bytes;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void *MemoryManager::SharedAllocate(size_t bytes)
|
|
||||||
{
|
|
||||||
void *ptr = (void *) Lookup(bytes,Shared);
|
|
||||||
if ( ptr == (void *) NULL ) {
|
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
|
||||||
total_shared+=bytes;
|
|
||||||
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
void MemoryManager::SharedFree (void *ptr,size_t bytes)
|
|
||||||
{
|
|
||||||
void *__freeme = Insert(ptr,bytes,Shared);
|
|
||||||
if ( __freeme ) {
|
|
||||||
acceleratorFreeShared(__freeme);
|
|
||||||
total_shared-=bytes;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef GRID_UVM
|
|
||||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
|
||||||
{
|
|
||||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
|
||||||
if ( ptr == (void *) NULL ) {
|
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
|
||||||
total_host+=bytes;
|
|
||||||
}
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
|
||||||
{
|
|
||||||
NotifyDeletion(_ptr);
|
|
||||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
|
||||||
if ( __freeme ) {
|
|
||||||
acceleratorFreeShared(__freeme);
|
|
||||||
total_host-=bytes;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
|
||||||
{
|
|
||||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
|
||||||
if ( ptr == (void *) NULL ) {
|
|
||||||
ptr = (void *) acceleratorAllocCpu(bytes);
|
|
||||||
total_host+=bytes;
|
|
||||||
}
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
|
||||||
{
|
|
||||||
NotifyDeletion(_ptr);
|
|
||||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
|
||||||
if ( __freeme ) {
|
|
||||||
acceleratorFreeCpu(__freeme);
|
|
||||||
total_host-=bytes;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//////////////////////////////////////////
|
|
||||||
// call only once
|
|
||||||
//////////////////////////////////////////
|
|
||||||
void MemoryManager::Init(void)
|
|
||||||
{
|
|
||||||
|
|
||||||
char * str;
|
|
||||||
int Nc;
|
|
||||||
int NcS;
|
|
||||||
|
|
||||||
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
|
||||||
if ( str ) {
|
|
||||||
Nc = atoi(str);
|
|
||||||
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
|
||||||
Ncache[Cpu]=Nc;
|
|
||||||
Ncache[Acc]=Nc;
|
|
||||||
Ncache[Shared]=Nc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
|
||||||
if ( str ) {
|
|
||||||
Nc = atoi(str);
|
|
||||||
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
|
||||||
Ncache[CpuSmall]=Nc;
|
|
||||||
Ncache[AccSmall]=Nc;
|
|
||||||
Ncache[SharedSmall]=Nc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// Log the memory-manager configuration selected at compile time: the device
// cache size (non-UVM builds only), the pointer-cache depths (when
// ALLOCATION_CACHE is defined) and which accelerator allocation call the
// build uses.
void MemoryManager::InitMessage(void) {

#if !defined(GRID_UVM)
  std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif

  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;

#if defined(ALLOCATION_CACHE)
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif

#if defined(GRID_UVM)
  // Unified memory builds
  std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#  if defined(GRID_CUDA)
  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#  endif
#  if defined(GRID_HIP)
  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#  endif
#  if defined(GRID_SYCL)
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#  endif
#else
  // Separate host / device memory builds
  std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#  if defined(GRID_CUDA)
  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#  endif
#  if defined(GRID_HIP)
  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#  endif
#  if defined(GRID_SYCL)
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#  endif
#endif

}
|
|
||||||
|
|
||||||
// Offer a freed pointer to the cache pool for `type`.
//   ptr   : pointer being freed
//   bytes : its size; sizes below GRID_ALLOC_SMALL_LIMIT go to the "small"
//           pool, which sits one index after the large pool of the same type
//   type  : large-pool index (Cpu, Acc or Shared)
// Returns the pointer the caller must really release, or NULL if there is
// none.  Without ALLOCATION_CACHE nothing is cached and `ptr` itself comes
// straight back.
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  const int pool = type + ( (bytes < GRID_ALLOC_SMALL_LIMIT) ? 1 : 0 );
  return Insert(ptr,bytes,Entries[pool],Ncache[pool],Victim[pool]);
#else
  return ptr;
#endif
}
|
|
||||||
|
|
||||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
|
|
||||||
{
|
|
||||||
assert(ncache>0);
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
assert(omp_in_parallel()==0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void * ret = NULL;
|
|
||||||
int v = -1;
|
|
||||||
|
|
||||||
for(int e=0;e<ncache;e++) {
|
|
||||||
if ( entries[e].valid==0 ) {
|
|
||||||
v=e;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( v==-1 ) {
|
|
||||||
v=victim;
|
|
||||||
victim = (victim+1)%ncache;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( entries[v].valid ) {
|
|
||||||
ret = entries[v].address;
|
|
||||||
entries[v].valid = 0;
|
|
||||||
entries[v].address = NULL;
|
|
||||||
entries[v].bytes = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
entries[v].address=ptr;
|
|
||||||
entries[v].bytes =bytes;
|
|
||||||
entries[v].valid =1;
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look for a cached pointer of exactly `bytes` bytes in the pool for `type`.
// Sizes below GRID_ALLOC_SMALL_LIMIT are looked up in the "small" pool, one
// index after the large pool of the same type.  Returns NULL on a miss, and
// always when ALLOCATION_CACHE is not defined.
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  const int pool = type + ( (bytes < GRID_ALLOC_SMALL_LIMIT) ? 1 : 0 );
  return Lookup(bytes,Entries[pool],Ncache[pool]);
#else
  return NULL;
#endif
}
|
|
||||||
|
|
||||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
|
|
||||||
{
|
|
||||||
assert(ncache>0);
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
assert(omp_in_parallel()==0);
|
|
||||||
#endif
|
|
||||||
for(int e=0;e<ncache;e++){
|
|
||||||
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
|
||||||
entries[e].valid = 0;
|
|
||||||
return entries[e].address;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
@ -1,182 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/MemoryManager.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
#include <list>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
// Move control to configure.ac and Config.h?
|
|
||||||
|
|
||||||
// When defined, MemoryManager::Insert/Lookup keep recently freed pointers in
// per-pool caches; when undefined they pass every pointer straight through.
#define ALLOCATION_CACHE
|
|
||||||
// NOTE(review): presumably the alignment in bytes (2 MiB) requested for
// allocations; it is not referenced in this part of the file - confirm at the use site.
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
|
||||||
// Allocations smaller than this many bytes use the "small" cache pools.
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
|
||||||
|
|
||||||
/*Pinning pages is costly*/
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Advise the LatticeAccelerator class
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
enum ViewAdvise {
  // Regular data
  AdviseDefault       = 0x0,

  // Advise that the data is used infrequently.  This can significantly
  // influence performance of bulk storage.
  AdviseInfrequentUse = 0x1

  // Further values, currently disabled:
  //   AdviseTransient               = 0x2  Data will mostly be read.  On some
  //                                        architectures enables read-only copies
  //                                        of memory to be kept on host and device.
  //   AdviseAcceleratorWriteDiscard = 0x4  Field will be written in entirety on device
};
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// View Access Mode
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
enum ViewMode {
  // Views opened on the accelerator
  AcceleratorRead         = 0x01,
  AcceleratorWrite        = 0x02,
  AcceleratorWriteDiscard = 0x04,

  // Views opened on the host
  CpuRead                 = 0x08,
  CpuWrite                = 0x10,
  CpuWriteDiscard         = 0x10  // same value as CpuWrite for now
};
|
|
||||||
|
|
||||||
class MemoryManager {
|
|
||||||
private:
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
// For caching recently freed allocations
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
typedef struct {
|
|
||||||
void *address;
|
|
||||||
size_t bytes;
|
|
||||||
int valid;
|
|
||||||
} AllocationCacheEntry;
|
|
||||||
|
|
||||||
static const int NallocCacheMax=128;
|
|
||||||
static const int NallocType=6;
|
|
||||||
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
|
|
||||||
static int Victim[NallocType];
|
|
||||||
static int Ncache[NallocType];
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
// Free pool
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
static void *Insert(void *ptr,size_t bytes,int type) ;
|
|
||||||
static void *Lookup(size_t bytes,int type) ;
|
|
||||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
|
||||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
|
||||||
|
|
||||||
static void PrintBytes(void);
|
|
||||||
public:
|
|
||||||
static void Init(void);
|
|
||||||
static void InitMessage(void);
|
|
||||||
static void *AcceleratorAllocate(size_t bytes);
|
|
||||||
static void AcceleratorFree (void *ptr,size_t bytes);
|
|
||||||
static void *SharedAllocate(size_t bytes);
|
|
||||||
static void SharedFree (void *ptr,size_t bytes);
|
|
||||||
static void *CpuAllocate(size_t bytes);
|
|
||||||
static void CpuFree (void *ptr,size_t bytes);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Footprint tracking
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
static uint64_t DeviceBytes;
|
|
||||||
static uint64_t DeviceLRUBytes;
|
|
||||||
static uint64_t DeviceMaxBytes;
|
|
||||||
static uint64_t HostToDeviceBytes;
|
|
||||||
static uint64_t DeviceToHostBytes;
|
|
||||||
static uint64_t HostToDeviceXfer;
|
|
||||||
static uint64_t DeviceToHostXfer;
|
|
||||||
|
|
||||||
private:
|
|
||||||
#ifndef GRID_UVM
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Data tables for ViewCache
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
typedef std::list<uint64_t> LRU_t;
|
|
||||||
typedef typename LRU_t::iterator LRUiterator;
|
|
||||||
typedef struct {
|
|
||||||
int LRU_valid;
|
|
||||||
LRUiterator LRU_entry;
|
|
||||||
uint64_t CpuPtr;
|
|
||||||
uint64_t AccPtr;
|
|
||||||
size_t bytes;
|
|
||||||
uint32_t transient;
|
|
||||||
uint32_t state;
|
|
||||||
uint32_t accLock;
|
|
||||||
uint32_t cpuLock;
|
|
||||||
} AcceleratorViewEntry;
|
|
||||||
|
|
||||||
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
|
|
||||||
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
|
|
||||||
|
|
||||||
static AccViewTable_t AccViewTable;
|
|
||||||
static LRU_t LRU;
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
// Device motion
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
|
||||||
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
|
|
||||||
static void Evict(AcceleratorViewEntry &AccCache);
|
|
||||||
static void Flush(AcceleratorViewEntry &AccCache);
|
|
||||||
static void Clone(AcceleratorViewEntry &AccCache);
|
|
||||||
static void AccDiscard(AcceleratorViewEntry &AccCache);
|
|
||||||
static void CpuDiscard(AcceleratorViewEntry &AccCache);
|
|
||||||
|
|
||||||
// static void LRUupdate(AcceleratorViewEntry &AccCache);
|
|
||||||
static void LRUinsert(AcceleratorViewEntry &AccCache);
|
|
||||||
static void LRUremove(AcceleratorViewEntry &AccCache);
|
|
||||||
|
|
||||||
// manage entries in the table
|
|
||||||
static int EntryPresent(uint64_t CpuPtr);
|
|
||||||
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
|
||||||
static void EntryErase (uint64_t CpuPtr);
|
|
||||||
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
|
|
||||||
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
|
|
||||||
|
|
||||||
static void AcceleratorViewClose(uint64_t AccPtr);
|
|
||||||
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
|
||||||
static void CpuViewClose(uint64_t Ptr);
|
|
||||||
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
|
||||||
#endif
|
|
||||||
static void NotifyDeletion(void * CpuPtr);
|
|
||||||
|
|
||||||
public:
|
|
||||||
static void Print(void);
|
|
||||||
static int isOpen (void* CpuPtr);
|
|
||||||
static void ViewClose(void* CpuPtr,ViewMode mode);
|
|
||||||
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
||||||
|
|
@ -1,468 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
|
|
||||||
|
|
||||||
#ifndef GRID_UVM
|
|
||||||
|
|
||||||
#warning "Using explicit device memory copies"
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
#define dprintf(...)
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
// For caching copies of data on device
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
// Table of view entries, keyed by the host pointer value (see EntryCreate).
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
|
|
||||||
// List of host pointer values whose device copy is unlocked; EvictVictims()
// takes its victims from the back.
MemoryManager::LRU_t MemoryManager::LRU;
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Footprint tracking
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Bytes of device memory currently attached to view entries
// (raised in Clone/CpuDiscard, lowered in Evict/AccDiscard).
uint64_t MemoryManager::DeviceBytes;
|
|
||||||
// Bytes belonging to entries that currently sit in the LRU list.
uint64_t MemoryManager::DeviceLRUBytes;
|
|
||||||
// Threshold (128 MiB) that EvictVictims() compares against when making room.
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
|
|
||||||
// Total bytes copied host -> device by Clone().
uint64_t MemoryManager::HostToDeviceBytes;
|
|
||||||
// Total bytes copied device -> host by Flush().
uint64_t MemoryManager::DeviceToHostBytes;
|
|
||||||
// Number of host -> device copies performed by Clone().
uint64_t MemoryManager::HostToDeviceXfer;
|
|
||||||
// Number of device -> host copies performed by Flush().
uint64_t MemoryManager::DeviceToHostXfer;
|
|
||||||
|
|
||||||
////////////////////////////////////
|
|
||||||
// Priority ordering for unlocked entries
|
|
||||||
// Empty
|
|
||||||
// CpuDirty
|
|
||||||
// Consistent
|
|
||||||
// AccDirty
|
|
||||||
////////////////////////////////////
|
|
||||||
// Values stored in AcceleratorViewEntry::state (Empty .. AccDirty).
// EvictNext is stored in AcceleratorViewEntry::transient instead, where a
// non-zero value makes LRUinsert() queue the entry at the back of the LRU list.
#define Empty (0x0) /*Entry unoccupied */
|
|
||||||
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
|
|
||||||
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
|
|
||||||
#define AccDirty (0x4) /*ACC copy is golden */
|
|
||||||
#define EvictNext (0x8) /*Priority for eviction*/
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
// Mechanics of data table maintenance
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
int MemoryManager::EntryPresent(uint64_t CpuPtr)
|
|
||||||
{
|
|
||||||
if(AccViewTable.empty()) return 0;
|
|
||||||
|
|
||||||
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
|
||||||
{
|
|
||||||
assert(!EntryPresent(CpuPtr));
|
|
||||||
AcceleratorViewEntry AccCache;
|
|
||||||
AccCache.CpuPtr = CpuPtr;
|
|
||||||
AccCache.AccPtr = (uint64_t)NULL;
|
|
||||||
AccCache.bytes = bytes;
|
|
||||||
AccCache.state = CpuDirty;
|
|
||||||
AccCache.LRU_valid=0;
|
|
||||||
AccCache.transient=0;
|
|
||||||
AccCache.accLock=0;
|
|
||||||
AccCache.cpuLock=0;
|
|
||||||
AccViewTable[CpuPtr] = AccCache;
|
|
||||||
}
|
|
||||||
// Return the view-table iterator for a host pointer.  The entry must exist;
// a missing entry trips an assert.
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
  assert(EntryPresent(CpuPtr));

  auto found = AccViewTable.find(CpuPtr);
  assert(found!=AccViewTable.end());
  return found;
}
|
|
||||||
void MemoryManager::EntryErase(uint64_t CpuPtr)
|
|
||||||
{
|
|
||||||
auto AccCache = EntryLookup(CpuPtr);
|
|
||||||
AccViewTable.erase(CpuPtr);
|
|
||||||
}
|
|
||||||
// Queue an entry in the LRU list and add its size to DeviceLRUBytes.
// Entries flagged transient go to the back of the list, all others to the
// front.  The entry must not already be queued.
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==0);

  // list::insert places the value before the given position and returns an
  // iterator to it, which is what gets remembered for LRUremove()
  LRUiterator where = AccCache.transient ? LRU.end() : LRU.begin();
  AccCache.LRU_entry = LRU.insert(where,AccCache.CpuPtr);

  AccCache.LRU_valid = 1;
  DeviceLRUBytes+=AccCache.bytes;
}
|
|
||||||
// Take an entry out of the LRU list and subtract its size from
// DeviceLRUBytes.  The entry must currently be queued.
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==1);

  LRUiterator queued = AccCache.LRU_entry;
  LRU.erase(queued);

  DeviceLRUBytes-=AccCache.bytes;
  AccCache.LRU_valid = 0;
}
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
// Accelerator cache motion & consistency logic
|
|
||||||
/////////////////////////////////////////////////
|
|
||||||
// Drop an entry without copying anything back to the host: release its
// device buffer (if it has one) and erase it from the view table.
// The entry must be unlocked; an entry that owns a device buffer must be in
// the LRU list.  The reference is invalid once this returns.
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const uint64_t hostKey = AccCache.CpuPtr;

  if ( AccCache.AccPtr ) {
    void *deviceBuffer = (void *)AccCache.AccPtr;
    AcceleratorFree(deviceBuffer,AccCache.bytes);
    DeviceBytes -=AccCache.bytes;
    LRUremove(AccCache);
  }

  EntryErase(hostKey);
}
|
|
||||||
|
|
||||||
// Make the host copy current, then drop the entry: an AccDirty entry is
// flushed back to the host first, after which the device buffer is released
// and the table entry erased exactly as AccDiscard() does.
// The entry must be unlocked; an entry that owns a device buffer must be in
// the LRU list.  The reference is invalid once this returns.
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);

  if ( AccCache.state==AccDirty ) {
    Flush(AccCache);
  }

  // From here on the work is the no-flush removal
  AccDiscard(AccCache);
}
|
|
||||||
// Copy an AccDirty entry's device buffer back into its host buffer and mark
// the entry Consistent.  The entry must be unlocked and must have both
// buffers.  Updates the device-to-host transfer counters.
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==AccDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  void *deviceSide = (void *)AccCache.AccPtr;
  void *hostSide   = (void *)AccCache.CpuPtr;
  acceleratorCopyFromDevice(deviceSide,hostSide,AccCache.bytes);

  DeviceToHostXfer++;
  DeviceToHostBytes+=AccCache.bytes;

  AccCache.state=Consistent;
}
|
|
||||||
// Copy a CpuDirty entry's host buffer to the device and mark the entry
// Consistent.  A device buffer is allocated first if the entry has none.
// The entry must be unlocked.  Updates the host-to-device transfer counters.
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==CpuDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needsBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needsBuffer ) {
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }

  void *hostSide   = (void *)AccCache.CpuPtr;
  void *deviceSide = (void *)AccCache.AccPtr;
  acceleratorCopyToDevice(hostSide,deviceSide,AccCache.bytes);

  HostToDeviceXfer++;
  HostToDeviceBytes+=AccCache.bytes;

  AccCache.state=Consistent;
}
|
|
||||||
|
|
||||||
// Make the device copy the authoritative one without transferring any data:
// allocate a device buffer if the entry has none and set the state to
// AccDirty.  The entry must be unlocked.
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needsBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needsBuffer ) {
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }

  AccCache.state=AccDirty;
}
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// View management
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
|
||||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
|
||||||
{
|
|
||||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
|
||||||
AcceleratorViewClose((uint64_t)Ptr);
|
|
||||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
|
||||||
CpuViewClose((uint64_t)Ptr);
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
|
||||||
{
|
|
||||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
|
||||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
|
||||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
|
||||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
|
||||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
|
||||||
{
|
|
||||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
|
||||||
if ( DeviceLRUBytes > 0){
|
|
||||||
assert(LRU.size()>0);
|
|
||||||
uint64_t victim = LRU.back();
|
|
||||||
auto AccCacheIterator = EntryLookup(victim);
|
|
||||||
auto & AccCache = AccCacheIterator->second;
|
|
||||||
Evict(AccCache);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
|
||||||
{
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Find if present, otherwise get or force an empty
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
if ( EntryPresent(CpuPtr)==0 ){
|
|
||||||
EvictVictims(bytes);
|
|
||||||
EntryCreate(CpuPtr,bytes,mode,hint);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
|
||||||
auto & AccCache = AccCacheIterator->second;
|
|
||||||
|
|
||||||
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
|
||||||
|
|
||||||
assert(AccCache.cpuLock==0); // Programming error
|
|
||||||
|
|
||||||
if(AccCache.state!=Empty) {
|
|
||||||
assert(AccCache.CpuPtr == CpuPtr);
|
|
||||||
assert(AccCache.bytes ==bytes);
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* State transitions and actions
|
|
||||||
*
|
|
||||||
* Action State StateNext Flush Clone
|
|
||||||
*
|
|
||||||
* AccRead Empty Consistent - Y
|
|
||||||
* AccWrite Empty AccDirty - Y
|
|
||||||
* AccRead CpuDirty Consistent - Y
|
|
||||||
* AccWrite CpuDirty AccDirty - Y
|
|
||||||
* AccRead Consistent Consistent - -
|
|
||||||
* AccWrite Consistent AccDirty - -
|
|
||||||
* AccRead AccDirty AccDirty - -
|
|
||||||
* AccWrite AccDirty AccDirty - -
|
|
||||||
*/
|
|
||||||
if(AccCache.state==Empty) {
|
|
||||||
assert(AccCache.LRU_valid==0);
|
|
||||||
AccCache.CpuPtr = CpuPtr;
|
|
||||||
AccCache.AccPtr = (uint64_t)NULL;
|
|
||||||
AccCache.bytes = bytes;
|
|
||||||
AccCache.state = CpuDirty; // Cpu starts primary
|
|
||||||
if(mode==AcceleratorWriteDiscard){
|
|
||||||
CpuDiscard(AccCache);
|
|
||||||
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
|
|
||||||
} else if(mode==AcceleratorWrite){
|
|
||||||
Clone(AccCache);
|
|
||||||
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
|
|
||||||
} else {
|
|
||||||
Clone(AccCache);
|
|
||||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
|
||||||
}
|
|
||||||
AccCache.accLock= 1;
|
|
||||||
} else if(AccCache.state==CpuDirty ){
|
|
||||||
if(mode==AcceleratorWriteDiscard) {
|
|
||||||
CpuDiscard(AccCache);
|
|
||||||
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
|
|
||||||
} else if(mode==AcceleratorWrite) {
|
|
||||||
Clone(AccCache);
|
|
||||||
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
|
|
||||||
} else {
|
|
||||||
Clone(AccCache);
|
|
||||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
|
||||||
}
|
|
||||||
AccCache.accLock++;
|
|
||||||
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
|
||||||
} else if(AccCache.state==Consistent) {
|
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
|
||||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
|
||||||
else
|
|
||||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
|
||||||
AccCache.accLock++;
|
|
||||||
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
|
||||||
} else if(AccCache.state==AccDirty) {
|
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
|
||||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
|
||||||
else
|
|
||||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
|
||||||
AccCache.accLock++;
|
|
||||||
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If view is opened on device remove from LRU
|
|
||||||
if(AccCache.LRU_valid==1){
|
|
||||||
// must possibly remove from LRU as now locked on GPU
|
|
||||||
LRUremove(AccCache);
|
|
||||||
}
|
|
||||||
|
|
||||||
int transient =hint;
|
|
||||||
AccCache.transient= transient? EvictNext : 0;
|
|
||||||
|
|
||||||
return AccCache.AccPtr;
|
|
||||||
}
|
|
||||||
////////////////////////////////////
|
|
||||||
// look up & decrement lock count
|
|
||||||
////////////////////////////////////
|
|
||||||
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
|
||||||
{
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
|
||||||
auto & AccCache = AccCacheIterator->second;
|
|
||||||
|
|
||||||
assert(AccCache.cpuLock==0);
|
|
||||||
assert(AccCache.accLock>0);
|
|
||||||
|
|
||||||
AccCache.accLock--;
|
|
||||||
|
|
||||||
// Move to LRU queue if not locked and close on device
|
|
||||||
if(AccCache.accLock==0) {
|
|
||||||
LRUinsert(AccCache);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
|
||||||
{
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
|
||||||
auto & AccCache = AccCacheIterator->second;
|
|
||||||
|
|
||||||
assert(AccCache.cpuLock>0);
|
|
||||||
assert(AccCache.accLock==0);
|
|
||||||
|
|
||||||
AccCache.cpuLock--;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Action State StateNext Flush Clone
|
|
||||||
*
|
|
||||||
* CpuRead Empty CpuDirty - -
|
|
||||||
* CpuWrite Empty CpuDirty - -
|
|
||||||
* CpuRead CpuDirty CpuDirty - -
|
|
||||||
* CpuWrite CpuDirty CpuDirty - -
|
|
||||||
* CpuRead Consistent Consistent - -
|
|
||||||
* CpuWrite Consistent CpuDirty - -
|
|
||||||
* CpuRead AccDirty Consistent Y -
|
|
||||||
* CpuWrite AccDirty CpuDirty Y -
|
|
||||||
*/
|
|
||||||
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
|
|
||||||
{
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Find if present, otherwise get or force an empty
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
if ( EntryPresent(CpuPtr)==0 ){
|
|
||||||
EvictVictims(bytes);
|
|
||||||
EntryCreate(CpuPtr,bytes,mode,transient);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
|
||||||
auto & AccCache = AccCacheIterator->second;
|
|
||||||
|
|
||||||
assert((mode==CpuRead)||(mode==CpuWrite));
|
|
||||||
assert(AccCache.accLock==0); // Programming error
|
|
||||||
|
|
||||||
if(AccCache.state!=Empty) {
|
|
||||||
assert(AccCache.CpuPtr == CpuPtr);
|
|
||||||
assert(AccCache.bytes==bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(AccCache.state==Empty) {
|
|
||||||
AccCache.CpuPtr = CpuPtr;
|
|
||||||
AccCache.AccPtr = (uint64_t)NULL;
|
|
||||||
AccCache.bytes = bytes;
|
|
||||||
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
|
|
||||||
AccCache.accLock= 0;
|
|
||||||
AccCache.cpuLock= 1;
|
|
||||||
} else if(AccCache.state==CpuDirty ){
|
|
||||||
// AccPtr dont care, deferred allocate
|
|
||||||
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
|
|
||||||
AccCache.cpuLock++;
|
|
||||||
} else if(AccCache.state==Consistent) {
|
|
||||||
assert(AccCache.AccPtr != (uint64_t)NULL);
|
|
||||||
if(mode==CpuWrite)
|
|
||||||
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
|
|
||||||
else
|
|
||||||
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
|
|
||||||
AccCache.cpuLock++;
|
|
||||||
} else if(AccCache.state==AccDirty) {
|
|
||||||
assert(AccCache.AccPtr != (uint64_t)NULL);
|
|
||||||
Flush(AccCache);
|
|
||||||
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
|
|
||||||
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
|
|
||||||
AccCache.cpuLock++;
|
|
||||||
} else {
|
|
||||||
assert(0); // should be unreachable
|
|
||||||
}
|
|
||||||
|
|
||||||
AccCache.transient= transient? EvictNext : 0;
|
|
||||||
|
|
||||||
return AccCache.CpuPtr;
|
|
||||||
}
|
|
||||||
void MemoryManager::NotifyDeletion(void *_ptr)
|
|
||||||
{
|
|
||||||
// Look up in ViewCache
|
|
||||||
uint64_t ptr = (uint64_t)_ptr;
|
|
||||||
if(EntryPresent(ptr)) {
|
|
||||||
auto e = EntryLookup(ptr);
|
|
||||||
AccDiscard(e->second);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void MemoryManager::Print(void)
|
|
||||||
{
|
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
|
||||||
std::cout << GridLogDebug << "Memory Manager " << std::endl;
|
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
|
||||||
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
|
|
||||||
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
|
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
|
||||||
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
|
||||||
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
|
|
||||||
auto &AccCache = it->second;
|
|
||||||
|
|
||||||
std::string str;
|
|
||||||
if ( AccCache.state==Empty ) str = std::string("Empty");
|
|
||||||
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
|
|
||||||
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
|
|
||||||
if ( AccCache.state==Consistent)str = std::string("Consistent");
|
|
||||||
|
|
||||||
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
|
||||||
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
|
||||||
<< "\t" << AccCache.cpuLock
|
|
||||||
<< "\t" << AccCache.accLock
|
|
||||||
<< "\t" << AccCache.LRU_valid<<std::endl;
|
|
||||||
}
|
|
||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
|
||||||
|
|
||||||
};
|
|
||||||
int MemoryManager::isOpen (void* _CpuPtr)
|
|
||||||
{
|
|
||||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
|
||||||
if ( EntryPresent(CpuPtr) ){
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
|
||||||
auto & AccCache = AccCacheIterator->second;
|
|
||||||
return AccCache.cpuLock+AccCache.accLock;
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,24 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
|
|
||||||
#ifdef GRID_UVM
|
|
||||||
|
|
||||||
#warning "Grid is assuming unified virtual memory address space"
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// View management is 1:1 address space mapping
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
|
||||||
uint64_t MemoryManager::DeviceBytes;
|
|
||||||
uint64_t MemoryManager::DeviceLRUBytes;
|
|
||||||
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
|
|
||||||
uint64_t MemoryManager::HostToDeviceBytes;
|
|
||||||
uint64_t MemoryManager::DeviceToHostBytes;
|
|
||||||
uint64_t MemoryManager::HostToDeviceXfer;
|
|
||||||
uint64_t MemoryManager::DeviceToHostXfer;
|
|
||||||
|
|
||||||
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
|
||||||
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
|
|
||||||
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
|
|
||||||
void MemoryManager::Print(void){};
|
|
||||||
void MemoryManager::NotifyDeletion(void *ptr){};
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
#endif
|
|
@ -1,67 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <vector>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
|
||||||
bool MemoryProfiler::debug = false;
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
|
||||||
{
|
|
||||||
#ifdef __linux__
|
|
||||||
int fd = open("/proc/self/pagemap", O_RDONLY);
|
|
||||||
assert(fd >= 0);
|
|
||||||
const int page_size = 4096;
|
|
||||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
|
||||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
|
||||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
|
||||||
uint64_t pagedata[npages];
|
|
||||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
|
||||||
assert(ret == offset);
|
|
||||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
|
||||||
assert(ret == sizeof(uint64_t) * npages);
|
|
||||||
int nhugepages = npages / 512;
|
|
||||||
int n4ktotal, nnothuge;
|
|
||||||
n4ktotal = 0;
|
|
||||||
nnothuge = 0;
|
|
||||||
for (int i = 0; i < nhugepages; ++i) {
|
|
||||||
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
|
||||||
for (int j = 0; j < 512; ++j) {
|
|
||||||
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
|
||||||
++n4ktotal;
|
|
||||||
if (pageaddr != baseaddr + j * page_size)
|
|
||||||
++nnothuge;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int rank = CartesianCommunicator::RankWorld();
|
|
||||||
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// Format a byte count with a binary (1024-based) unit suffix, e.g.
// 1024 -> "1 KB", 1536 -> "1.5 KB", 10 -> "10 B".
// Whole values print without a decimal, others with one decimal place.
std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize  = 256;
  constexpr size_t       nSuffix  = 7;
  const char *suffixes[nSuffix]   = {"", "K", "M", "G", "T", "P", "E"};
  char        buf[bufSize];
  size_t      s     = 0;
  double      count = bytes;

  // Stop at the last suffix: s must remain a valid index into suffixes[]
  while (count >= 1024 && s + 1 < nSuffix)
  {
    s++;
    count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
    snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
@ -1,95 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/MemoryStats.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
std::string sizeString(size_t bytes);
|
|
||||||
|
|
||||||
// Running byte counters maintained by the profilerAllocate / profilerFree macros.
struct MemoryStats
{
  size_t totalAllocated     = 0;  // sum of all bytes ever allocated
  size_t maxAllocated       = 0;  // high-water mark of currentlyAllocated
  size_t currentlyAllocated = 0;  // bytes allocated and not yet freed
  size_t totalFreed         = 0;  // sum of all bytes ever freed
};
|
|
||||||
|
|
||||||
class MemoryProfiler
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
static MemoryStats *stats;
|
|
||||||
static bool debug;
|
|
||||||
};
|
|
||||||
|
|
||||||
// memString(bytes): "<raw count> (<human readable size>)".
// The expansion is parenthesised so it composes safely with neighbouring operators.
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")

// profilerDebugPrint: log the current counters when a MemoryStats object is attached.
// Each macro below expands to a single braced statement, so using it as the
// body of a caller's `if` guards the whole expansion. The local is named
// memProfStats_ so it cannot capture an `s` appearing in the macro argument.
#define profilerDebugPrint						\
  {									\
    if (MemoryProfiler::stats)						\
      {									\
	auto memProfStats_ = MemoryProfiler::stats;			\
	std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
	std::cout << GridLogDebug << "[Memory debug] total : " << memString(memProfStats_->totalAllocated) \
		  << std::endl;						\
	std::cout << GridLogDebug << "[Memory debug] max : " << memString(memProfStats_->maxAllocated) \
		  << std::endl;						\
	std::cout << GridLogDebug << "[Memory debug] current: " << memString(memProfStats_->currentlyAllocated) \
		  << std::endl;						\
	std::cout << GridLogDebug << "[Memory debug] freed : " << memString(memProfStats_->totalFreed) \
		  << std::endl;						\
      }									\
  }

// profilerAllocate(bytes): account for an allocation and optionally log it.
#define profilerAllocate(bytes)						\
  {									\
    if (MemoryProfiler::stats)						\
      {									\
	auto memProfStats_ = MemoryProfiler::stats;			\
	memProfStats_->totalAllocated     += (bytes);			\
	memProfStats_->currentlyAllocated += (bytes);			\
	memProfStats_->maxAllocated = std::max(memProfStats_->maxAllocated, memProfStats_->currentlyAllocated); \
      }									\
    if (MemoryProfiler::debug)						\
      {									\
	std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
	profilerDebugPrint;						\
      }									\
  }

// profilerFree(bytes): account for a deallocation and optionally log it.
#define profilerFree(bytes)						\
  {									\
    if (MemoryProfiler::stats)						\
      {									\
	auto memProfStats_ = MemoryProfiler::stats;			\
	memProfStats_->totalFreed         += (bytes);			\
	memProfStats_->currentlyAllocated -= (bytes);			\
      }									\
    if (MemoryProfiler::debug)						\
      {									\
	std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
	profilerDebugPrint;						\
      }									\
  }
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES);
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
@ -81,7 +81,6 @@ public:
|
|||||||
|
|
||||||
bool _isCheckerBoarded;
|
bool _isCheckerBoarded;
|
||||||
int LocallyPeriodic;
|
int LocallyPeriodic;
|
||||||
Coordinate _checker_dim_mask;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -38,7 +38,6 @@ class GridCartesian: public GridBase {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
int dummy;
|
int dummy;
|
||||||
Coordinate _checker_dim_mask;
|
|
||||||
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -105,7 +104,6 @@ public:
|
|||||||
_ldimensions.resize(_ndimension);
|
_ldimensions.resize(_ndimension);
|
||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_simd_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
_checker_dim_mask.resize(_ndimension);;
|
|
||||||
_lstart.resize(_ndimension);
|
_lstart.resize(_ndimension);
|
||||||
_lend.resize(_ndimension);
|
_lend.resize(_ndimension);
|
||||||
|
|
||||||
@ -116,8 +114,6 @@ public:
|
|||||||
|
|
||||||
for (int d = 0; d < _ndimension; d++)
|
for (int d = 0; d < _ndimension; d++)
|
||||||
{
|
{
|
||||||
_checker_dim_mask[d]=0;
|
|
||||||
|
|
||||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||||
_simd_layout[d] = simd_layout[d];
|
_simd_layout[d] = simd_layout[d];
|
||||||
|
@ -35,28 +35,12 @@ static const int CbRed =0;
|
|||||||
static const int CbBlack=1;
|
static const int CbBlack=1;
|
||||||
static const int Even =CbRed;
|
static const int Even =CbRed;
|
||||||
static const int Odd =CbBlack;
|
static const int Odd =CbBlack;
|
||||||
|
|
||||||
// Checkerboard parity (0 or 1) of the site with lexicographic index oindex in
// a grid of extents rdim: the low bit of the sum of its coordinates over the
// dimensions flagged non-zero in chk_dim_msk.
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{
  const int ndim = rdim.size();
  Coordinate site(ndim);

  Lexicographic::CoorFromIndex(site,oindex,rdim);

  int coor_sum = 0;
  for(int mu=0; mu<ndim; mu++){
    if(chk_dim_msk[mu]) {
      coor_sum += site[mu];
    }
  }
  return coor_sum & 0x1;
}
|
|
||||||
|
|
||||||
|
|
||||||
// Specialise this for red black grids storing half the data like a chess board.
|
// Specialise this for red black grids storing half the data like a chess board.
|
||||||
class GridRedBlackCartesian : public GridBase
|
class GridRedBlackCartesian : public GridBase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// Coordinate _checker_dim_mask;
|
Coordinate _checker_dim_mask;
|
||||||
int _checker_dim;
|
int _checker_dim;
|
||||||
std::vector<int> _checker_board;
|
std::vector<int> _checker_board;
|
||||||
|
|
||||||
|
@ -138,6 +138,21 @@ public:
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
|
void SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int xmit_to_rank,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes);
|
||||||
|
|
||||||
|
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes);
|
||||||
|
|
||||||
|
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||||
|
|
||||||
double StencilSendToRecvFrom(void *xmit,
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
void *recv,
|
void *recv,
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
Source file: ./lib/communicator/Communicator_mpi.cc
|
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
|||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// First initialise of comms system
|
// First initialise of comms system
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
void CartesianCommunicator::Init(int *argc, char ***argv)
|
void CartesianCommunicator::Init(int *argc, char ***argv)
|
||||||
{
|
{
|
||||||
|
|
||||||
int flag;
|
int flag;
|
||||||
@ -43,16 +43,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
|
|||||||
|
|
||||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
if ( !flag ) {
|
if ( !flag ) {
|
||||||
|
|
||||||
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
|
|
||||||
nCommThreads=1;
|
|
||||||
// wrong results here too
|
|
||||||
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
|
|
||||||
// other comms schemes are ok
|
|
||||||
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
|
|
||||||
#else
|
|
||||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||||
#endif
|
|
||||||
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
|
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
|
||||||
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
|
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -99,7 +91,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Initialises from communicator_world
|
// Initialises from communicator_world
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
|
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
|
||||||
{
|
{
|
||||||
MPI_Comm optimal_comm;
|
MPI_Comm optimal_comm;
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
@ -118,7 +110,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
|
|||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
// Try to subdivide communicator
|
// Try to subdivide communicator
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
|
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
|
||||||
{
|
{
|
||||||
_ndimension = processors.size(); assert(_ndimension>=1);
|
_ndimension = processors.size(); assert(_ndimension>=1);
|
||||||
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
|
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
|
||||||
@ -135,7 +127,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// split the communicator
|
// split the communicator
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// int Nparent = parent._processors ;
|
// int Nparent = parent._processors ;
|
||||||
int Nparent;
|
int Nparent;
|
||||||
MPI_Comm_size(parent.communicator,&Nparent);
|
MPI_Comm_size(parent.communicator,&Nparent);
|
||||||
|
|
||||||
@ -157,13 +149,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
|
|||||||
}
|
}
|
||||||
|
|
||||||
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
|
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
|
||||||
int crank;
|
int crank;
|
||||||
// Mpi uses the reverse Lexico convention to us; so reversed routines called
|
// Mpi uses the reverse Lexico convention to us; so reversed routines called
|
||||||
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
|
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
|
||||||
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
|
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
|
||||||
|
|
||||||
MPI_Comm comm_split;
|
MPI_Comm comm_split;
|
||||||
if ( Nchild > 1 ) {
|
if ( Nchild > 1 ) {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Split the communicator
|
// Split the communicator
|
||||||
@ -188,11 +180,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
|
|||||||
SetCommunicator(comm_split);
|
SetCommunicator(comm_split);
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
// Free the temp communicator
|
// Free the temp communicator
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
MPI_Comm_free(&comm_split);
|
MPI_Comm_free(&comm_split);
|
||||||
|
|
||||||
if(0){
|
if(0){
|
||||||
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
|
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
|
||||||
for(int d=0;d<processors.size();d++){
|
for(int d=0;d<processors.size();d++){
|
||||||
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
|
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
|
||||||
@ -253,7 +245,7 @@ CartesianCommunicator::~CartesianCommunicator()
|
|||||||
for(int i=0;i<communicator_halo.size();i++){
|
for(int i=0;i<communicator_halo.size();i++){
|
||||||
MPI_Comm_free(&communicator_halo[i]);
|
MPI_Comm_free(&communicator_halo[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
@ -302,28 +294,60 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
std::vector<CommsRequest_t> reqs(0);
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||||
|
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||||
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
|
SendToRecvFromComplete(reqs);
|
||||||
|
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||||
|
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
MPI_Status stat;
|
||||||
|
assert(sender != receiver);
|
||||||
|
int tag = sender;
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
int myrank = _processor;
|
int myrank = _processor;
|
||||||
int ierr;
|
int ierr;
|
||||||
|
|
||||||
// Enforce no UVM in comms, device or host OK
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||||
assert(acceleratorIsCommunicable(xmit));
|
MPI_Request xrq;
|
||||||
assert(acceleratorIsCommunicable(recv));
|
MPI_Request rrq;
|
||||||
|
|
||||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||||
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
|
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
|
||||||
recv,bytes,MPI_CHAR,from, from,
|
assert(ierr==0);
|
||||||
communicator,MPI_STATUS_IGNORE);
|
list.push_back(xrq);
|
||||||
assert(ierr==0);
|
list.push_back(rrq);
|
||||||
|
} else {
|
||||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
recv,bytes,MPI_CHAR,from, from,
|
||||||
|
communicator,MPI_STATUS_IGNORE);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Basic Halo comms primitive
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
int dest,
|
int dest,
|
||||||
void *recv,
|
void *recv,
|
||||||
@ -343,7 +367,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
int from,
|
int from,
|
||||||
int bytes,int dir)
|
int bytes,int dir)
|
||||||
{
|
{
|
||||||
int ncomm =communicator_halo.size();
|
int ncomm =communicator_halo.size();
|
||||||
int commdir=dir%ncomm;
|
int commdir=dir%ncomm;
|
||||||
|
|
||||||
MPI_Request xrq;
|
MPI_Request xrq;
|
||||||
@ -373,13 +397,21 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
||||||
this->StencilSendToRecvFromComplete(list,dir);
|
this->StencilSendToRecvFromComplete(list,dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
|
{
|
||||||
|
SendToRecvFromComplete(waitall);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::StencilBarrier(void)
|
||||||
|
{
|
||||||
|
MPI_Barrier (ShmComm);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
{
|
{
|
||||||
int nreq=list.size();
|
int nreq=list.size();
|
||||||
|
|
||||||
@ -390,13 +422,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
|
|||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.resize(0);
|
list.resize(0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilBarrier(void)
|
|
||||||
{
|
|
||||||
MPI_Barrier (ShmComm);
|
|
||||||
}
|
|
||||||
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
||||||
//{
|
|
||||||
//}
|
|
||||||
void CartesianCommunicator::Barrier(void)
|
void CartesianCommunicator::Barrier(void)
|
||||||
{
|
{
|
||||||
int ierr = MPI_Barrier(communicator);
|
int ierr = MPI_Barrier(communicator);
|
||||||
@ -411,8 +436,8 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
|||||||
communicator);
|
communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
int CartesianCommunicator::RankWorld(void){
|
int CartesianCommunicator::RankWorld(void){
|
||||||
int r;
|
int r;
|
||||||
MPI_Comm_rank(communicator_world,&r);
|
MPI_Comm_rank(communicator_world,&r);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
@ -445,7 +470,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
|
|||||||
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
||||||
// (Turns up on 32^3 x 64 Gparity too)
|
// (Turns up on 32^3 x 64 Gparity too)
|
||||||
MPI_Datatype object;
|
MPI_Datatype object;
|
||||||
int iwords;
|
int iwords;
|
||||||
int ibytes;
|
int ibytes;
|
||||||
iwords = words;
|
iwords = words;
|
||||||
ibytes = bytes;
|
ibytes = bytes;
|
||||||
@ -458,3 +483,5 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,6 +77,15 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
|||||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int xmit_to_rank,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Basic Halo comms primitive -- should never call in single node
|
// Basic Halo comms primitive -- should never call in single node
|
||||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
@ -87,6 +96,20 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||||
{
|
{
|
||||||
bcopy(in,out,bytes*words);
|
bcopy(in,out,bytes*words);
|
||||||
@ -114,6 +137,10 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
|
std::vector<CommsRequest_t> list;
|
||||||
|
// Discard the "dir"
|
||||||
|
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||||
|
SendToRecvFromComplete(list);
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
@ -123,10 +150,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
|
// Discard the "dir"
|
||||||
|
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
|
SendToRecvFromComplete(waitall);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::StencilBarrier(void){};
|
void CartesianCommunicator::StencilBarrier(void){};
|
||||||
|
@ -29,12 +29,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
#include <pwd.h>
|
#include <pwd.h>
|
||||||
|
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_NVCC
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_HIP
|
|
||||||
#include <hip/hip_runtime_api.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
#define header "SharedMemoryMpi: "
|
#define header "SharedMemoryMpi: "
|
||||||
@ -50,12 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
|||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
// Split into groups that can share memory
|
// Split into groups that can share memory
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
#ifndef GRID_MPI3_SHM_NONE
|
|
||||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
|
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
|
||||||
#else
|
|
||||||
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
|
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
|
||||||
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
||||||
|
|
||||||
@ -178,24 +170,17 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
|
|||||||
std::vector<int> primes({2,3,5});
|
std::vector<int> primes({2,3,5});
|
||||||
|
|
||||||
int dim = 0;
|
int dim = 0;
|
||||||
int last_dim = ndimension - 1;
|
|
||||||
int AutoShmSize = 1;
|
int AutoShmSize = 1;
|
||||||
while(AutoShmSize != WorldShmSize) {
|
while(AutoShmSize != WorldShmSize) {
|
||||||
int p;
|
for(int p=0;p<primes.size();p++) {
|
||||||
for(p=0;p<primes.size();p++) {
|
|
||||||
int prime=primes[p];
|
int prime=primes[p];
|
||||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||||
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||||
AutoShmSize*=prime;
|
AutoShmSize*=prime;
|
||||||
ShmDims[dim]*=prime;
|
ShmDims[dim]*=prime;
|
||||||
last_dim = dim;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (p == primes.size() && last_dim == dim) {
|
|
||||||
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
dim=(dim+1) %ndimension;
|
dim=(dim+1) %ndimension;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -428,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
#ifdef GRID_NVCC
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -448,18 +433,27 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
||||||
|
|
||||||
|
#ifdef GRID_IBM_SUMMIT
|
||||||
|
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||||
|
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
|
||||||
|
#else
|
||||||
|
std::cout << "setting device to WorldShmRank"<<std::endl;
|
||||||
|
cudaSetDevice(WorldShmRank);
|
||||||
|
#endif
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
auto err = cudaMalloc(&ShmCommBuf, bytes);
|
||||||
|
if ( err != cudaSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
if (ShmCommBuf == (void *)NULL ) {
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
if ( WorldRank == 0 ){
|
if ( WorldRank == 0 ){
|
||||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
|
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
|
||||||
}
|
}
|
||||||
SharedMemoryZero(ShmCommBuf,bytes);
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
|
|
||||||
@ -467,31 +461,19 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// Loop over ranks/gpu's on our node
|
// Loop over ranks/gpu's on our node
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
for(int r=0;r<WorldShmSize;r++){
|
for(int r=0;r<WorldShmSize;r++){
|
||||||
|
|
||||||
#ifndef GRID_MPI3_SHM_NONE
|
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// If it is me, pass around the IPC access key
|
// If it is me, pass around the IPC access key
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cudaIpcMemHandle_t handle;
|
cudaIpcMemHandle_t handle;
|
||||||
|
|
||||||
if ( r==WorldShmRank ) {
|
if ( r==WorldShmRank ) {
|
||||||
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipIpcMemHandle_t handle;
|
|
||||||
if ( r==WorldShmRank ) {
|
|
||||||
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
|
|
||||||
if ( err != hipSuccess) {
|
|
||||||
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// Share this IPC handle across the Shm Comm
|
// Share this IPC handle across the Shm Comm
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
@ -508,31 +490,17 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// If I am not the source, overwrite thisBuf with remote buffer
|
// If I am not the source, overwrite thisBuf with remote buffer
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * thisBuf = ShmCommBuf;
|
void * thisBuf = ShmCommBuf;
|
||||||
#ifdef GRID_CUDA
|
|
||||||
if ( r!=WorldShmRank ) {
|
if ( r!=WorldShmRank ) {
|
||||||
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
if ( r!=WorldShmRank ) {
|
|
||||||
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
|
|
||||||
if ( err != hipSuccess) {
|
|
||||||
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Save a copy of the device buffers
|
// Save a copy of the device buffers
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
WorldShmCommBufs[r] = thisBuf;
|
WorldShmCommBufs[r] = thisBuf;
|
||||||
#else
|
|
||||||
WorldShmCommBufs[r] = ShmCommBuf;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_ShmAllocBytes=bytes;
|
_ShmAllocBytes=bytes;
|
||||||
@ -709,7 +677,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||||
{
|
{
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_NVCC
|
||||||
cudaMemset(dest,0,bytes);
|
cudaMemset(dest,0,bytes);
|
||||||
#else
|
#else
|
||||||
bzero(dest,bytes);
|
bzero(dest,bytes);
|
||||||
@ -717,7 +685,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
|||||||
}
|
}
|
||||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
||||||
{
|
{
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_NVCC
|
||||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||||
#else
|
#else
|
||||||
bcopy(src,dest,bytes);
|
bcopy(src,dest,bytes);
|
||||||
@ -737,11 +705,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
// Split into groups that can share memory
|
// Split into groups that can share memory
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
#ifndef GRID_MPI3_SHM_NONE
|
|
||||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
||||||
#else
|
|
||||||
MPI_Comm_split(comm, rank, 0, &ShmComm);
|
|
||||||
#endif
|
|
||||||
MPI_Comm_rank(ShmComm ,&ShmRank);
|
MPI_Comm_rank(ShmComm ,&ShmRank);
|
||||||
MPI_Comm_size(ShmComm ,&ShmSize);
|
MPI_Comm_size(ShmComm ,&ShmSize);
|
||||||
ShmCommBufs.resize(ShmSize);
|
ShmCommBufs.resize(ShmSize);
|
||||||
|
@ -52,8 +52,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
template<typename Op, typename T1>
|
||||||
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
|
||||||
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
||||||
|
{
|
||||||
|
return Cshift(closure(expr),dim,shift);
|
||||||
|
}
|
||||||
|
template <class Op, class T1, class T2>
|
||||||
|
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
|
||||||
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
||||||
|
{
|
||||||
|
return Cshift(closure(expr),dim,shift);
|
||||||
|
}
|
||||||
|
template <class Op, class T1, class T2, class T3>
|
||||||
|
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
|
||||||
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||||
|
eval(0, expr.arg2),
|
||||||
|
eval(0, expr.arg3)))>
|
||||||
{
|
{
|
||||||
return Cshift(closure(expr),dim,shift);
|
return Cshift(closure(expr),dim,shift);
|
||||||
}
|
}
|
||||||
|
@ -29,8 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern Vector<std::pair<int,int> > Cshift_table;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -48,16 +46,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int ent = 0;
|
int ent = 0;
|
||||||
|
|
||||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
static Vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||||
|
|
||||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
|
auto rhs_v = rhs.View();
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*stride;
|
int o = n*stride;
|
||||||
int bo = n*e2;
|
int bo = n*e2;
|
||||||
Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
|
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -67,19 +65,14 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
int o = n*stride;
|
int o = n*stride;
|
||||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
|
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{
|
thread_for(i,ent,{
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
buffer[table[i].first]=rhs_v[table[i].second];
|
||||||
auto buffer_p = & buffer[0];
|
});
|
||||||
auto table = &Cshift_table[0];
|
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
|
||||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -102,38 +95,36 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
|
auto rhs_v = rhs.View();
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
thread_for_collapse(2,n,e1,{
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o = n*n1;
|
int o = n*n1;
|
||||||
int offset = b+n*e2;
|
int offset = b+n*e2;
|
||||||
|
|
||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
});
|
}
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
|
||||||
|
|
||||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
// Test_cshift_red_black code.
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
std::cout << " Dense packed buffer WARNING " <<std::endl;
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
thread_for_collapse(2,n,e1,{
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
Coordinate coor;
|
|
||||||
|
|
||||||
int o=n*n1;
|
int o=n*n1;
|
||||||
int oindex = o+b;
|
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
|
||||||
|
|
||||||
int ocb=1<<cb;
|
|
||||||
int offset = b+n*e2;
|
int offset = b+n*e2;
|
||||||
|
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -154,8 +145,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||||
|
|
||||||
int ent =0;
|
int ent =0;
|
||||||
|
|
||||||
if ( cbmask ==0x3 ) {
|
if ( cbmask ==0x3 ) {
|
||||||
@ -164,7 +154,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int bo =n*rhs.Grid()->_slice_block[dimension];
|
int bo =n*rhs.Grid()->_slice_block[dimension];
|
||||||
Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
|
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -175,20 +165,16 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
|
table[ent++]=std::pair<int,int> (so+o+b,bo++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
auto rhs_v = rhs.View();
|
||||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
thread_for(i,ent,{
|
||||||
auto buffer_p = & buffer[0];
|
rhs_v[table[i].first]=buffer[table[i].second];
|
||||||
auto table = &Cshift_table[0];
|
});
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
|
||||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -208,22 +194,21 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
|
|
||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
auto rhs_v = rhs.View();
|
||||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
thread_for_collapse(2,n,e1,{
|
||||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
for(int b=0;b<e2;b++){
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int o = n*_slice_stride;
|
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
||||||
int offset = b+n*_slice_block;
|
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
});
|
}
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
// Test_cshift_red_black code.
|
// Test_cshift_red_black code.
|
||||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||||
assert(0); // This will fail if hit on GPU
|
auto rhs_v = rhs.View();
|
||||||
autoView( rhs_v, rhs, CpuWrite);
|
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
@ -240,7 +225,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// local to node block strided copies
|
// local to node block strided copies
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
@ -255,16 +239,14 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
|
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
|
||||||
|
|
||||||
int ent=0;
|
int ent=0;
|
||||||
|
|
||||||
if(cbmask == 0x3 ){
|
if(cbmask == 0x3 ){
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -273,24 +255,23 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
auto rhs_v = rhs.View();
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
thread_for(i,ent,{
|
||||||
auto table = &Cshift_table[0];
|
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
});
|
||||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||||
{
|
{
|
||||||
|
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
|
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
|
||||||
@ -304,33 +285,29 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
int e2=rhs.Grid()->_slice_block [dimension];
|
int e2=rhs.Grid()->_slice_block [dimension];
|
||||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||||
|
|
||||||
int ent=0;
|
int ent=0;
|
||||||
|
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride;
|
int o =n*stride;
|
||||||
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||||
}}
|
}}
|
||||||
} else {
|
} else {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride;
|
int o =n*stride;
|
||||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
|
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
auto rhs_v = rhs.View();
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
thread_for(i,ent,{
|
||||||
auto table = &Cshift_table[0];
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
accelerator_for(i,ent,1,{
|
});
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
|
@ -1,4 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
Vector<std::pair<int,int> > Cshift_table;
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -26,7 +26,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <Grid/lattice/Lattice_view.h>
|
|
||||||
#include <Grid/lattice/Lattice_base.h>
|
#include <Grid/lattice/Lattice_base.h>
|
||||||
#include <Grid/lattice/Lattice_conformable.h>
|
#include <Grid/lattice/Lattice_conformable.h>
|
||||||
#include <Grid/lattice/Lattice_ET.h>
|
#include <Grid/lattice/Lattice_ET.h>
|
||||||
@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/lattice/Lattice_reduction.h>
|
#include <Grid/lattice/Lattice_reduction.h>
|
||||||
#include <Grid/lattice/Lattice_peekpoke.h>
|
#include <Grid/lattice/Lattice_peekpoke.h>
|
||||||
//#include <Grid/lattice/Lattice_reality.h>
|
//#include <Grid/lattice/Lattice_reality.h>
|
||||||
#include <Grid/lattice/Lattice_real_imag.h>
|
|
||||||
#include <Grid/lattice/Lattice_comparison_utils.h>
|
#include <Grid/lattice/Lattice_comparison_utils.h>
|
||||||
#include <Grid/lattice/Lattice_comparison.h>
|
#include <Grid/lattice/Lattice_comparison.h>
|
||||||
#include <Grid/lattice/Lattice_coordinate.h>
|
#include <Grid/lattice/Lattice_coordinate.h>
|
||||||
|
@ -42,24 +42,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
// Predicated where support
|
// Predicated where support
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
#ifdef GRID_SIMT
|
|
||||||
// drop to scalar in SIMT; cleaner in fact
|
|
||||||
template <class iobj, class vobj, class robj>
|
template <class iobj, class vobj, class robj>
|
||||||
accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
|
||||||
const vobj &iftrue,
|
const robj &iffalse) {
|
||||||
const robj &iffalse)
|
|
||||||
{
|
|
||||||
Integer mask = TensorRemove(predicate);
|
|
||||||
typename std::remove_const<vobj>::type ret= iffalse;
|
|
||||||
if (mask) ret=iftrue;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
template <class iobj, class vobj, class robj>
|
|
||||||
accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
|
||||||
const vobj &iftrue,
|
|
||||||
const robj &iffalse)
|
|
||||||
{
|
|
||||||
typename std::remove_const<vobj>::type ret;
|
typename std::remove_const<vobj>::type ret;
|
||||||
|
|
||||||
typedef typename vobj::scalar_object scalar_object;
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
@ -83,7 +68,6 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
|||||||
merge(ret, falsevals);
|
merge(ret, falsevals);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
//Specialization of getVectorType for lattices
|
//Specialization of getVectorType for lattices
|
||||||
@ -97,62 +81,26 @@ struct getVectorType<Lattice<T> >{
|
|||||||
//-- recursive evaluation of expressions; --
|
//-- recursive evaluation of expressions; --
|
||||||
// handle leaves of syntax tree
|
// handle leaves of syntax tree
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template<class sobj,
|
template<class sobj> accelerator_inline
|
||||||
typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
|
|
||||||
accelerator_inline
|
|
||||||
sobj eval(const uint64_t ss, const sobj &arg)
|
sobj eval(const uint64_t ss, const sobj &arg)
|
||||||
{
|
{
|
||||||
return arg;
|
return arg;
|
||||||
}
|
}
|
||||||
template <class lobj> accelerator_inline
|
|
||||||
auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
|
|
||||||
{
|
|
||||||
return arg(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////
|
|
||||||
//-- recursive evaluation of expressions; --
|
|
||||||
// whole vector return, used only for expression return type inference
|
|
||||||
///////////////////////////////////////////////////
|
|
||||||
template<class sobj> accelerator_inline
|
|
||||||
sobj vecEval(const uint64_t ss, const sobj &arg)
|
|
||||||
{
|
|
||||||
return arg;
|
|
||||||
}
|
|
||||||
template <class lobj> accelerator_inline
|
template <class lobj> accelerator_inline
|
||||||
const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
|
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
|
||||||
{
|
{
|
||||||
return arg[ss];
|
return arg[ss];
|
||||||
}
|
}
|
||||||
|
template <class lobj> accelerator_inline
|
||||||
|
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
||||||
|
{
|
||||||
|
auto view = arg.AcceleratorView(ViewRead);
|
||||||
|
return view[ss];
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// handle nodes in syntax tree- eval one operand
|
// handle nodes in syntax tree- eval one operand
|
||||||
// vecEval needed (but never called as all expressions offloaded) to infer the return type
|
|
||||||
// in SIMT contexts of closure.
|
|
||||||
///////////////////////////////////////////////////
|
|
||||||
template <typename Op, typename T1> accelerator_inline
|
|
||||||
auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
|
||||||
-> decltype(expr.op.func( vecEval(ss, expr.arg1)))
|
|
||||||
{
|
|
||||||
return expr.op.func( vecEval(ss, expr.arg1) );
|
|
||||||
}
|
|
||||||
// vecEval two operands
|
|
||||||
template <typename Op, typename T1, typename T2> accelerator_inline
|
|
||||||
auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
|
||||||
-> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
|
|
||||||
{
|
|
||||||
return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
|
|
||||||
}
|
|
||||||
// vecEval three operands
|
|
||||||
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
|
||||||
auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
|
||||||
-> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
|
|
||||||
{
|
|
||||||
return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
|
|
||||||
}
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////
|
|
||||||
// handle nodes in syntax tree- eval one operand coalesced
|
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <typename Op, typename T1> accelerator_inline
|
template <typename Op, typename T1> accelerator_inline
|
||||||
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
@ -160,41 +108,23 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
|||||||
{
|
{
|
||||||
return expr.op.func( eval(ss, expr.arg1) );
|
return expr.op.func( eval(ss, expr.arg1) );
|
||||||
}
|
}
|
||||||
|
///////////////////////
|
||||||
// eval two operands
|
// eval two operands
|
||||||
|
///////////////////////
|
||||||
template <typename Op, typename T1, typename T2> accelerator_inline
|
template <typename Op, typename T1, typename T2> accelerator_inline
|
||||||
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
|
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
|
||||||
{
|
{
|
||||||
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
|
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
|
||||||
}
|
}
|
||||||
|
///////////////////////
|
||||||
// eval three operands
|
// eval three operands
|
||||||
|
///////////////////////
|
||||||
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
||||||
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
-> decltype(expr.op.func(eval(ss, expr.arg1),
|
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
|
||||||
eval(ss, expr.arg2),
|
|
||||||
eval(ss, expr.arg3)))
|
|
||||||
{
|
{
|
||||||
#ifdef GRID_SIMT
|
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
|
||||||
// Handles Nsimd (vInteger) != Nsimd(ComplexD)
|
|
||||||
typedef decltype(vecEval(ss, expr.arg2)) rvobj;
|
|
||||||
typedef typename std::remove_reference<rvobj>::type vobj;
|
|
||||||
|
|
||||||
const int Nsimd = vobj::vector_type::Nsimd();
|
|
||||||
|
|
||||||
auto vpred = vecEval(ss,expr.arg1);
|
|
||||||
|
|
||||||
ExtractBuffer<Integer> mask(Nsimd);
|
|
||||||
extract<vInteger, Integer>(TensorRemove(vpred), mask);
|
|
||||||
|
|
||||||
int s = acceleratorSIMTlane(Nsimd);
|
|
||||||
return expr.op.func(mask[s],
|
|
||||||
eval(ss, expr.arg2),
|
|
||||||
eval(ss, expr.arg3));
|
|
||||||
#else
|
|
||||||
return expr.op.func(eval(ss, expr.arg1),
|
|
||||||
eval(ss, expr.arg2),
|
|
||||||
eval(ss, expr.arg3));
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -250,12 +180,16 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
|
|||||||
cb = lat.Checkerboard();
|
cb = lat.Checkerboard();
|
||||||
}
|
}
|
||||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||||
inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
|
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Op, typename T1> inline
|
template <typename Op, typename T1> inline
|
||||||
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
|
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
{
|
{
|
||||||
CBFromExpression(cb, expr.arg1); // recurse AST
|
CBFromExpression(cb, expr.arg1); // recurse AST
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Op, typename T1, typename T2> inline
|
template <typename Op, typename T1, typename T2> inline
|
||||||
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
|
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
{
|
{
|
||||||
@ -270,74 +204,13 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
|
|||||||
CBFromExpression(cb, expr.arg3); // recurse AST
|
CBFromExpression(cb, expr.arg3); // recurse AST
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// ViewOpen
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
|
|
||||||
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
|
|
||||||
{
|
|
||||||
lat.ViewOpen(AcceleratorRead);
|
|
||||||
}
|
|
||||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
|
||||||
inline void ExpressionViewOpen(T1 &notlat) {}
|
|
||||||
|
|
||||||
template <typename Op, typename T1> inline
|
|
||||||
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
|
|
||||||
{
|
|
||||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename Op, typename T1, typename T2> inline
|
|
||||||
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
|
|
||||||
{
|
|
||||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
|
||||||
ExpressionViewOpen(expr.arg2); // recurse AST
|
|
||||||
}
|
|
||||||
template <typename Op, typename T1, typename T2, typename T3>
|
|
||||||
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
|
||||||
{
|
|
||||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
|
||||||
ExpressionViewOpen(expr.arg2); // recurse AST
|
|
||||||
ExpressionViewOpen(expr.arg3); // recurse AST
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// ViewClose
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
|
|
||||||
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
|
|
||||||
{
|
|
||||||
lat.ViewClose();
|
|
||||||
}
|
|
||||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
|
||||||
inline void ExpressionViewClose(T1 &notlat) {}
|
|
||||||
|
|
||||||
template <typename Op, typename T1> inline
|
|
||||||
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
|
|
||||||
{
|
|
||||||
ExpressionViewClose(expr.arg1); // recurse AST
|
|
||||||
}
|
|
||||||
template <typename Op, typename T1, typename T2> inline
|
|
||||||
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
|
|
||||||
{
|
|
||||||
ExpressionViewClose(expr.arg1); // recurse AST
|
|
||||||
ExpressionViewClose(expr.arg2); // recurse AST
|
|
||||||
}
|
|
||||||
template <typename Op, typename T1, typename T2, typename T3>
|
|
||||||
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
|
||||||
{
|
|
||||||
ExpressionViewClose(expr.arg1); // recurse AST
|
|
||||||
ExpressionViewClose(expr.arg2); // recurse AST
|
|
||||||
ExpressionViewClose(expr.arg3); // recurse AST
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// Unary operators and funcs
|
// Unary operators and funcs
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
#define GridUnopClass(name, ret) \
|
#define GridUnopClass(name, ret) \
|
||||||
|
template <class arg> \
|
||||||
struct name { \
|
struct name { \
|
||||||
template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
|
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
|
||||||
};
|
};
|
||||||
|
|
||||||
GridUnopClass(UnarySub, -a);
|
GridUnopClass(UnarySub, -a);
|
||||||
@ -348,6 +221,8 @@ GridUnopClass(UnaryTrace, trace(a));
|
|||||||
GridUnopClass(UnaryTranspose, transpose(a));
|
GridUnopClass(UnaryTranspose, transpose(a));
|
||||||
GridUnopClass(UnaryTa, Ta(a));
|
GridUnopClass(UnaryTa, Ta(a));
|
||||||
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
||||||
|
GridUnopClass(UnaryReal, real(a));
|
||||||
|
GridUnopClass(UnaryImag, imag(a));
|
||||||
GridUnopClass(UnaryToReal, toReal(a));
|
GridUnopClass(UnaryToReal, toReal(a));
|
||||||
GridUnopClass(UnaryToComplex, toComplex(a));
|
GridUnopClass(UnaryToComplex, toComplex(a));
|
||||||
GridUnopClass(UnaryTimesI, timesI(a));
|
GridUnopClass(UnaryTimesI, timesI(a));
|
||||||
@ -366,10 +241,10 @@ GridUnopClass(UnaryExp, exp(a));
|
|||||||
// Binary operators
|
// Binary operators
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
#define GridBinOpClass(name, combination) \
|
#define GridBinOpClass(name, combination) \
|
||||||
|
template <class left, class right> \
|
||||||
struct name { \
|
struct name { \
|
||||||
template <class _left, class _right> \
|
|
||||||
static auto accelerator_inline \
|
static auto accelerator_inline \
|
||||||
func(const _left &lhs, const _right &rhs) \
|
func(const left &lhs, const right &rhs) \
|
||||||
-> decltype(combination) const \
|
-> decltype(combination) const \
|
||||||
{ \
|
{ \
|
||||||
return combination; \
|
return combination; \
|
||||||
@ -389,10 +264,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
|
|||||||
// Trinary conditional op
|
// Trinary conditional op
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
#define GridTrinOpClass(name, combination) \
|
#define GridTrinOpClass(name, combination) \
|
||||||
|
template <class predicate, class left, class right> \
|
||||||
struct name { \
|
struct name { \
|
||||||
template <class _predicate,class _left, class _right> \
|
|
||||||
static auto accelerator_inline \
|
static auto accelerator_inline \
|
||||||
func(const _predicate &pred, const _left &lhs, const _right &rhs) \
|
func(const predicate &pred, const left &lhs, const right &rhs) \
|
||||||
-> decltype(combination) const \
|
-> decltype(combination) const \
|
||||||
{ \
|
{ \
|
||||||
return combination; \
|
return combination; \
|
||||||
@ -400,17 +275,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
|
|||||||
};
|
};
|
||||||
|
|
||||||
GridTrinOpClass(TrinaryWhere,
|
GridTrinOpClass(TrinaryWhere,
|
||||||
(predicatedWhere<
|
(predicatedWhere<predicate,
|
||||||
typename std::remove_reference<_predicate>::type,
|
typename std::remove_reference<left>::type,
|
||||||
typename std::remove_reference<_left>::type,
|
typename std::remove_reference<right>::type>(pred, lhs,rhs)));
|
||||||
typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
|
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// Operator syntactical glue
|
// Operator syntactical glue
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
#define GRID_UNOP(name) name
|
|
||||||
#define GRID_BINOP(name) name
|
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
|
||||||
#define GRID_TRINOP(name) name
|
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
|
||||||
|
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
|
||||||
|
|
||||||
#define GRID_DEF_UNOP(op, name) \
|
#define GRID_DEF_UNOP(op, name) \
|
||||||
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
|
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
|
||||||
@ -462,6 +337,8 @@ GRID_DEF_UNOP(trace, UnaryTrace);
|
|||||||
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
||||||
GRID_DEF_UNOP(Ta, UnaryTa);
|
GRID_DEF_UNOP(Ta, UnaryTa);
|
||||||
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
||||||
|
GRID_DEF_UNOP(real, UnaryReal);
|
||||||
|
GRID_DEF_UNOP(imag, UnaryImag);
|
||||||
GRID_DEF_UNOP(toReal, UnaryToReal);
|
GRID_DEF_UNOP(toReal, UnaryToReal);
|
||||||
GRID_DEF_UNOP(toComplex, UnaryToComplex);
|
GRID_DEF_UNOP(toComplex, UnaryToComplex);
|
||||||
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
||||||
@ -494,36 +371,29 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
|
|||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
template <class Op, class T1>
|
template <class Op, class T1>
|
||||||
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
|
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class Op, class T1, class T2>
|
template <class Op, class T1, class T2>
|
||||||
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
|
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class Op, class T1, class T2, class T3>
|
template <class Op, class T1, class T2, class T3>
|
||||||
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||||
vecEval(0, expr.arg2),
|
eval(0, expr.arg2),
|
||||||
vecEval(0, expr.arg3)))>
|
eval(0, expr.arg3)))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
|
Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||||
vecEval(0, expr.arg2),
|
eval(0, expr.arg2),
|
||||||
vecEval(0, expr.arg3)))> ret(expr);
|
eval(0, expr.arg3)))> ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#define EXPRESSION_CLOSURE(function) \
|
|
||||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
|
|
||||||
auto function(Expression &expr) -> decltype(function(closure(expr))) \
|
|
||||||
{ \
|
|
||||||
return function(closure(expr)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#undef GRID_UNOP
|
#undef GRID_UNOP
|
||||||
#undef GRID_BINOP
|
#undef GRID_BINOP
|
||||||
|
@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
@ -56,13 +56,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
auto tmp =ret_v(ss);
|
|
||||||
mac(&tmp,&lhs_t,&rhs_t);
|
mac(&tmp,&lhs_t,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
mult(&tmp,&lhs_v(ss),&rhs);
|
mult(&tmp,&lhs_v(ss),&rhs);
|
||||||
@ -121,10 +121,10 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
auto tmp =ret_v(ss);
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs);
|
mac(&tmp,&lhs_t,&rhs);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -179,10 +179,10 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
auto tmp =ret_v(ss);
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
mac(&tmp,&lhs,&rhs_t);
|
mac(&tmp,&lhs,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
|
|||||||
ret.Checkerboard() = x.Checkerboard();
|
ret.Checkerboard() = x.Checkerboard();
|
||||||
conformable(ret,x);
|
conformable(ret,x);
|
||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( x_v , x, AcceleratorRead);
|
auto x_v = x.AcceleratorView(ViewRead);
|
||||||
autoView( y_v , y, AcceleratorRead);
|
auto y_v = y.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
auto tmp = a*x_v(ss)+y_v(ss);
|
auto tmp = a*x_v(ss)+y_v(ss);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
|||||||
ret.Checkerboard() = x.Checkerboard();
|
ret.Checkerboard() = x.Checkerboard();
|
||||||
conformable(ret,x);
|
conformable(ret,x);
|
||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
autoView( x_v , x, AcceleratorRead);
|
auto x_v = x.AcceleratorView(ViewRead);
|
||||||
autoView( y_v , y, AcceleratorRead);
|
auto y_v = y.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
|
@ -29,7 +29,6 @@ See the full license in the file "LICENSE" in the top level distribution
|
|||||||
directory
|
directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#define STREAMING_STORES
|
#define STREAMING_STORES
|
||||||
@ -38,6 +37,180 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
extern int GridCshiftPermuteMap[4][16];
|
extern int GridCshiftPermuteMap[4][16];
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////
|
||||||
|
// Base class which can be used by traits to pick up behaviour
|
||||||
|
///////////////////////////////////////////////////////////////////
|
||||||
|
class LatticeBase {};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Conformable checks; same instance of Grid required
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
||||||
|
{
|
||||||
|
assert(lhs == rhs);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Advise the LatticeAccelerator class
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
enum LatticeAcceleratorAdvise {
|
||||||
|
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
|
||||||
|
// significantly influence performance of bulk storage.
|
||||||
|
AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures
|
||||||
|
// enables read-only copies of memory to be kept on
|
||||||
|
// host and device.
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// View Access Mode
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
enum ViewMode {
|
||||||
|
ViewRead = 0x1,
|
||||||
|
ViewWrite = 0x2,
|
||||||
|
ViewReadWrite = 0x3
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Minimal base class containing only data valid to access from accelerator
|
||||||
|
// _odata will be a managed pointer in CUDA
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Force access to lattice through a view object.
|
||||||
|
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
|
||||||
|
// strict, since host code could in principle directly access data through the lattice object
|
||||||
|
// Need to decide programming model.
|
||||||
|
#define LATTICE_VIEW_STRICT
|
||||||
|
template<class vobj> class LatticeAccelerator : public LatticeBase
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
GridBase *_grid;
|
||||||
|
int checkerboard;
|
||||||
|
vobj *_odata; // A managed pointer
|
||||||
|
uint64_t _odata_size;
|
||||||
|
public:
|
||||||
|
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
|
||||||
|
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
|
||||||
|
accelerator_inline int Checkerboard(void) const { return checkerboard; };
|
||||||
|
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
|
||||||
|
accelerator_inline void Conformable(GridBase * &grid) const
|
||||||
|
{
|
||||||
|
if (grid) conformable(grid, _grid);
|
||||||
|
else grid = _grid;
|
||||||
|
};
|
||||||
|
|
||||||
|
accelerator_inline void Advise(int advise) {
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
#ifndef __CUDA_ARCH__ // only on host
|
||||||
|
if (advise & AdviseInfrequentUse) {
|
||||||
|
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
|
||||||
|
}
|
||||||
|
if (advise & AdviseReadMostly) {
|
||||||
|
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
#ifndef __CUDA_ARCH__ // only on host
|
||||||
|
int target;
|
||||||
|
cudaGetDevice(&target);
|
||||||
|
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
#ifndef __CUDA_ARCH__ // only on host
|
||||||
|
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// A View class which provides accessor to the data.
|
||||||
|
// This will be safe to call from accelerator_for and is trivially copy constructible
|
||||||
|
// The copy constructor for this will need to be used by device lambda functions
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class vobj>
|
||||||
|
class LatticeView : public LatticeAccelerator<vobj>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
|
||||||
|
|
||||||
|
// Rvalue
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
|
||||||
|
#else
|
||||||
|
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||||
|
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
||||||
|
|
||||||
|
accelerator_inline uint64_t begin(void) const { return 0;};
|
||||||
|
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
||||||
|
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
||||||
|
|
||||||
|
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lattice expression types used by ET to assemble the AST
|
||||||
|
//
|
||||||
|
// Need to be able to detect code paths according to whether a type is a lattice object or not
|
||||||
|
// so introduce some trait type things
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
class LatticeExpressionBase {};
|
||||||
|
|
||||||
|
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
||||||
|
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
||||||
|
|
||||||
|
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
||||||
|
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
||||||
|
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
||||||
|
|
||||||
|
template <typename Op, typename _T1>
|
||||||
|
class LatticeUnaryExpression : public LatticeExpressionBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename ViewMap<_T1>::Type T1;
|
||||||
|
Op op;
|
||||||
|
T1 arg1;
|
||||||
|
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Op, typename _T1, typename _T2>
|
||||||
|
class LatticeBinaryExpression : public LatticeExpressionBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename ViewMap<_T1>::Type T1;
|
||||||
|
typedef typename ViewMap<_T2>::Type T2;
|
||||||
|
Op op;
|
||||||
|
T1 arg1;
|
||||||
|
T2 arg2;
|
||||||
|
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Op, typename _T1, typename _T2, typename _T3>
|
||||||
|
class LatticeTrinaryExpression : public LatticeExpressionBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename ViewMap<_T1>::Type T1;
|
||||||
|
typedef typename ViewMap<_T2>::Type T2;
|
||||||
|
typedef typename ViewMap<_T3>::Type T3;
|
||||||
|
Op op;
|
||||||
|
T1 arg1;
|
||||||
|
T2 arg2;
|
||||||
|
T3 arg3;
|
||||||
|
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
|
||||||
|
};
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// The real lattice class, with normal copy and assignment semantics.
|
// The real lattice class, with normal copy and assignment semantics.
|
||||||
// This contains extra (host resident) grid pointer data that may be accessed by host code
|
// This contains extra (host resident) grid pointer data that may be accessed by host code
|
||||||
@ -73,33 +246,38 @@ private:
|
|||||||
dealloc();
|
dealloc();
|
||||||
|
|
||||||
this->_odata_size = size;
|
this->_odata_size = size;
|
||||||
if ( size )
|
if ( size )
|
||||||
this->_odata = alloc.allocate(this->_odata_size);
|
this->_odata = alloc.allocate(this->_odata_size);
|
||||||
else
|
else
|
||||||
this->_odata = nullptr;
|
this->_odata = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
|
||||||
void SetViewMode(ViewMode mode) {
|
|
||||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
|
||||||
accessor.ViewClose();
|
|
||||||
}
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
// Return a view object that may be dereferenced in site loops.
|
// Return a view object that may be dereferenced in site loops.
|
||||||
// The view is trivially copy constructible and may be copied to an accelerator device
|
// The view is trivially copy constructible and may be copied to an accelerator device
|
||||||
// in device lambdas
|
// in device lambdas
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
|
||||||
LatticeView<vobj> View (ViewMode mode) const
|
{ // and HostView for thread_for
|
||||||
{
|
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
||||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
|
||||||
return accessor;
|
return accessor;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const
|
||||||
|
{
|
||||||
|
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
||||||
|
accessor.AcceleratorPrefetch(mode);
|
||||||
|
return accessor;
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeView<vobj> HostView(int mode = ViewReadWrite) const
|
||||||
|
{
|
||||||
|
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
||||||
|
accessor.HostPrefetch(mode);
|
||||||
|
return accessor;
|
||||||
|
}
|
||||||
|
|
||||||
~Lattice() {
|
~Lattice() {
|
||||||
if ( this->_odata_size ) {
|
if ( this->_odata_size ) {
|
||||||
dealloc();
|
dealloc();
|
||||||
@ -119,16 +297,12 @@ public:
|
|||||||
CBFromExpression(cb,expr);
|
CBFromExpression(cb,expr);
|
||||||
assert( (cb==Odd) || (cb==Even));
|
assert( (cb==Odd) || (cb==Even));
|
||||||
this->checkerboard=cb;
|
this->checkerboard=cb;
|
||||||
|
|
||||||
auto exprCopy = expr;
|
auto me = AcceleratorView(ViewWrite);
|
||||||
ExpressionViewOpen(exprCopy);
|
accelerator_for(ss,me.size(),1,{
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto tmp = eval(ss,expr);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
vstream(me[ss],tmp);
|
||||||
auto tmp = eval(ss,exprCopy);
|
|
||||||
coalescedWrite(me[ss],tmp);
|
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
|
||||||
ExpressionViewClose(exprCopy);
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
|
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
|
||||||
@ -143,15 +317,11 @@ public:
|
|||||||
assert( (cb==Odd) || (cb==Even));
|
assert( (cb==Odd) || (cb==Even));
|
||||||
this->checkerboard=cb;
|
this->checkerboard=cb;
|
||||||
|
|
||||||
auto exprCopy = expr;
|
auto me = AcceleratorView(ViewWrite);
|
||||||
ExpressionViewOpen(exprCopy);
|
accelerator_for(ss,me.size(),1,{
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto tmp = eval(ss,expr);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
vstream(me[ss],tmp);
|
||||||
auto tmp = eval(ss,exprCopy);
|
|
||||||
coalescedWrite(me[ss],tmp);
|
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
|
||||||
ExpressionViewClose(exprCopy);
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
|
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
|
||||||
@ -165,15 +335,11 @@ public:
|
|||||||
CBFromExpression(cb,expr);
|
CBFromExpression(cb,expr);
|
||||||
assert( (cb==Odd) || (cb==Even));
|
assert( (cb==Odd) || (cb==Even));
|
||||||
this->checkerboard=cb;
|
this->checkerboard=cb;
|
||||||
auto exprCopy = expr;
|
auto me = AcceleratorView(ViewWrite);
|
||||||
ExpressionViewOpen(exprCopy);
|
accelerator_for(ss,me.size(),1,{
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto tmp = eval(ss,expr);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
vstream(me[ss],tmp);
|
||||||
auto tmp = eval(ss,exprCopy);
|
|
||||||
coalescedWrite(me[ss],tmp);
|
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
|
||||||
ExpressionViewClose(exprCopy);
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
//GridFromExpression is tricky to do
|
//GridFromExpression is tricky to do
|
||||||
@ -224,11 +390,10 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||||
auto me = View(CpuWrite);
|
auto me = View();
|
||||||
thread_for(ss,me.size(),{
|
thread_for(ss,me.size(),{
|
||||||
me[ss]= r;
|
me[ss] = r;
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -238,12 +403,11 @@ public:
|
|||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// user defined constructor
|
// user defined constructor
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
|
Lattice(GridBase *grid) {
|
||||||
this->_grid = grid;
|
this->_grid = grid;
|
||||||
resize(this->_grid->oSites());
|
resize(this->_grid->oSites());
|
||||||
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
|
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
|
||||||
this->checkerboard=0;
|
this->checkerboard=0;
|
||||||
SetViewMode(mode);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// virtual ~Lattice(void) = default;
|
// virtual ~Lattice(void) = default;
|
||||||
@ -281,12 +445,11 @@ public:
|
|||||||
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
||||||
conformable(*this,r);
|
conformable(*this,r);
|
||||||
this->checkerboard = r.Checkerboard();
|
this->checkerboard = r.Checkerboard();
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto me = AcceleratorView(ViewWrite);
|
||||||
auto him= r.View(AcceleratorRead);
|
auto him= r.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(me[ss],him(ss));
|
coalescedWrite(me[ss],him(ss));
|
||||||
});
|
});
|
||||||
me.ViewClose(); him.ViewClose();
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -296,12 +459,11 @@ public:
|
|||||||
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
||||||
this->checkerboard = r.Checkerboard();
|
this->checkerboard = r.Checkerboard();
|
||||||
conformable(*this,r);
|
conformable(*this,r);
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto me = AcceleratorView(ViewWrite);
|
||||||
auto him= r.View(AcceleratorRead);
|
auto him= r.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(me[ss],him(ss));
|
coalescedWrite(me[ss],him(ss));
|
||||||
});
|
});
|
||||||
me.ViewClose(); him.ViewClose();
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
@ -51,39 +51,34 @@ template<class VField, class Matrix>
|
|||||||
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||||
{
|
{
|
||||||
typedef decltype(basis[0]) Field;
|
typedef decltype(basis[0]) Field;
|
||||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
typedef decltype(basis[0].View()) View;
|
||||||
|
auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
|
||||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
Vector<View> basis_v(basis.size(),tmp_v);
|
||||||
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
|
||||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
|
#ifndef GRID_NVCC
|
||||||
int max_threads = thread_max();
|
|
||||||
Vector < vobj > Bt(Nm * max_threads);
|
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
vobj* B = &Bt[Nm * thread_num()];
|
std::vector < vobj > B(Nm); // Thread private
|
||||||
thread_for_in_region(ss, grid->oSites(),{
|
thread_for_in_region(ss, grid->oSites(),{
|
||||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||||
|
|
||||||
for(int j=j0; j<j1; ++j){
|
for(int j=j0; j<j1; ++j){
|
||||||
for(int k=k0; k<k1; ++k){
|
for(int k=k0; k<k1; ++k){
|
||||||
B[j] +=Qt(j,k) * basis_v[k][ss];
|
B[j] +=Qt(j,k) * basis_v[k][ss];
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for(int j=j0; j<j1; ++j){
|
}
|
||||||
basis_v[j][ss] = B[j];
|
for(int j=j0; j<j1; ++j){
|
||||||
}
|
basis_v[j][ss] = B[j];
|
||||||
});
|
}
|
||||||
}
|
});
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
View *basis_vp = &basis_v[0];
|
|
||||||
|
|
||||||
int nrot = j1-j0;
|
int nrot = j1-j0;
|
||||||
if (!nrot) // edge case not handled gracefully by Cuda
|
if (!nrot) // edge case not handled gracefully by Cuda
|
||||||
return;
|
return;
|
||||||
@ -95,13 +90,13 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
auto Bp=&Bt[0];
|
auto Bp=&Bt[0];
|
||||||
|
|
||||||
// GPU readable copy of matrix
|
// GPU readable copy of matrix
|
||||||
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
Vector<double> Qt_jv(Nm*Nm);
|
||||||
Coeff_t *Qt_p = & Qt_jv[0];
|
double *Qt_p = & Qt_jv[0];
|
||||||
thread_for(i,Nm*Nm,{
|
thread_for(i,Nm*Nm,{
|
||||||
int j = i/Nm;
|
int j = i/Nm;
|
||||||
int k = i%Nm;
|
int k = i%Nm;
|
||||||
Qt_p[i]=Qt(j,k);
|
Qt_p[i]=Qt(j,k);
|
||||||
});
|
});
|
||||||
|
|
||||||
// Block the loop to keep storage footprint down
|
// Block the loop to keep storage footprint down
|
||||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||||
@ -138,30 +133,26 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract a single rotated vector
|
// Extract a single rotated vector
|
||||||
template<class Field>
|
template<class Field>
|
||||||
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
||||||
{
|
{
|
||||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
typedef decltype(basis[0].AcceleratorView()) View;
|
||||||
typedef typename Field::vector_object vobj;
|
typedef typename Field::vector_object vobj;
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
result.Checkerboard() = basis[0].Checkerboard();
|
||||||
|
auto result_v=result.AcceleratorView(ViewWrite);
|
||||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
Vector<View> basis_v(basis.size(),result_v);
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
basis_v.push_back(basis[k].View(AcceleratorRead));
|
basis_v[k] = basis[k].AcceleratorView(ViewRead);
|
||||||
}
|
}
|
||||||
vobj zz=Zero();
|
vobj zz=Zero();
|
||||||
Vector<double> Qt_jv(Nm);
|
Vector<double> Qt_jv(Nm);
|
||||||
double * Qt_j = & Qt_jv[0];
|
double * Qt_j = & Qt_jv[0];
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||||
|
|
||||||
autoView(result_v,result,AcceleratorWrite);
|
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
auto B=coalescedRead(zz);
|
auto B=coalescedRead(zz);
|
||||||
for(int k=k0; k<k1; ++k){
|
for(int k=k0; k<k1; ++k){
|
||||||
@ -169,7 +160,6 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
}
|
}
|
||||||
coalescedWrite(result_v[ss], B);
|
coalescedWrite(result_v[ss], B);
|
||||||
});
|
});
|
||||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Field>
|
template<class Field>
|
||||||
|
@ -42,6 +42,34 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
typedef iScalar<vInteger> vPredicate ;
|
typedef iScalar<vInteger> vPredicate ;
|
||||||
|
|
||||||
|
/*
|
||||||
|
template <class iobj, class vobj, class robj> accelerator_inline
|
||||||
|
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
|
||||||
|
{
|
||||||
|
typename std::remove_const<vobj>::type ret;
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
const int Nsimd = vobj::vector_type::Nsimd();
|
||||||
|
|
||||||
|
ExtractBuffer<Integer> mask(Nsimd);
|
||||||
|
ExtractBuffer<scalar_object> truevals(Nsimd);
|
||||||
|
ExtractBuffer<scalar_object> falsevals(Nsimd);
|
||||||
|
|
||||||
|
extract(iftrue, truevals);
|
||||||
|
extract(iffalse, falsevals);
|
||||||
|
extract<vInteger, Integer>(TensorRemove(predicate), mask);
|
||||||
|
|
||||||
|
for (int s = 0; s < Nsimd; s++) {
|
||||||
|
if (mask[s]) falsevals[s] = truevals[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
merge(ret, falsevals);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
*/
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// compare lattice to lattice
|
// compare lattice to lattice
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -50,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(rhs.Grid());
|
Lattice<vPredicate> ret(rhs.Grid());
|
||||||
autoView( lhs_v, lhs, CpuRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView( rhs_v, rhs, CpuRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( ret_v, ret, CpuWrite);
|
auto ret_v = ret.View();
|
||||||
thread_for( ss, rhs_v.size(), {
|
thread_for( ss, rhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
||||||
});
|
});
|
||||||
@ -65,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(lhs.Grid());
|
Lattice<vPredicate> ret(lhs.Grid());
|
||||||
autoView( lhs_v, lhs, CpuRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView( ret_v, ret, CpuWrite);
|
auto ret_v = ret.View();
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs_v[ss],rhs);
|
ret_v[ss]=op(lhs_v[ss],rhs);
|
||||||
});
|
});
|
||||||
@ -79,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(rhs.Grid());
|
Lattice<vPredicate> ret(rhs.Grid());
|
||||||
autoView( rhs_v, rhs, CpuRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( ret_v, ret, CpuWrite);
|
auto ret_v = ret.View();
|
||||||
thread_for( ss, rhs_v.size(), {
|
thread_for( ss, rhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs,rhs_v[ss]);
|
ret_v[ss]=op(lhs,rhs_v[ss]);
|
||||||
});
|
});
|
||||||
|
@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.Grid();
|
||||||
int Nsimd = grid->iSites();
|
int Nsimd = grid->iSites();
|
||||||
|
|
||||||
autoView(l_v, l, CpuWrite);
|
auto l_v = l.View();
|
||||||
thread_for( o, grid->oSites(), {
|
thread_for( o, grid->oSites(), {
|
||||||
vector_type vI;
|
vector_type vI;
|
||||||
Coordinate gcoor;
|
Coordinate gcoor;
|
||||||
@ -51,5 +51,23 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// LatticeCoordinate();
|
||||||
|
// FIXME for debug; deprecate this; made obsolete by
|
||||||
|
template<class vobj> void lex_sites(Lattice<vobj> &l){
|
||||||
|
auto l_v = l.View();
|
||||||
|
Real *v_ptr = (Real *)&l_v[0];
|
||||||
|
size_t o_len = l.Grid()->oSites();
|
||||||
|
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
||||||
|
size_t vec_len = vRealF::Nsimd();
|
||||||
|
|
||||||
|
for(int i=0;i<o_len;i++){
|
||||||
|
for(int j=0;j<v_len;j++){
|
||||||
|
for(int vv=0;vv<vec_len;vv+=2){
|
||||||
|
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
|
||||||
|
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -43,8 +43,8 @@ template<class vobj>
|
|||||||
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
||||||
{
|
{
|
||||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -56,9 +56,9 @@ template<class vobj>
|
|||||||
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
||||||
{
|
{
|
||||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
|
|||||||
typedef decltype(coalescedRead(ll())) sll;
|
typedef decltype(coalescedRead(ll())) sll;
|
||||||
typedef decltype(coalescedRead(rr())) srr;
|
typedef decltype(coalescedRead(rr())) srr;
|
||||||
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
accelerator_for(ss,rhs_v.size(),1,{
|
accelerator_for(ss,rhs_v.size(),1,{
|
||||||
// FIXME had issues with scalar version of outer
|
// FIXME had issues with scalar version of outer
|
||||||
// Use vector [] operator and don't read coalesce this loop
|
// Use vector [] operator and don't read coalesce this loop
|
||||||
|
@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int block =FullGrid->_slice_block [Orthog];
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
autoView( X_v , X, CpuRead);
|
auto X_v = X.View();
|
||||||
autoView( Y_v , Y, CpuRead);
|
auto Y_v = Y.View();
|
||||||
autoView( R_v , R, CpuWrite);
|
auto R_v = R.View();
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> s_x(Nblock);
|
std::vector<vobj> s_x(Nblock);
|
||||||
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
autoView( X_v , X, CpuRead);
|
auto X_v = X.View();
|
||||||
autoView( R_v , R, CpuWrite);
|
auto R_v = R.View();
|
||||||
|
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
autoView( lhs_v , lhs, CpuRead);
|
auto lhs_v = lhs.View();
|
||||||
autoView( rhs_v , rhs, CpuRead);
|
auto rhs_v = rhs.View();
|
||||||
thread_region {
|
thread_region {
|
||||||
std::vector<vobj> Left(Nblock);
|
std::vector<vobj> Left(Nblock);
|
||||||
std::vector<vobj> Right(Nblock);
|
std::vector<vobj> Right(Nblock);
|
||||||
|
@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
|
|||||||
{
|
{
|
||||||
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for( ss, lhs_v.size(), 1, {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
|||||||
{
|
{
|
||||||
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for( ss, lhs_v.size(), 1, {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
|||||||
template<int Index,class vobj>
|
template<int Index,class vobj>
|
||||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
||||||
{
|
{
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for( ss, lhs_v.size(), 1, {
|
thread_for( ss, lhs_v.size(), {
|
||||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<int Index,class vobj>
|
template<int Index,class vobj>
|
||||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
||||||
{
|
{
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
auto rhs_v = rhs.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for( ss, lhs_v.size(), 1, {
|
thread_for( ss, lhs_v.size(), {
|
||||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
|
|||||||
|
|
||||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||||
ExtractBuffer<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
autoView( l_v , l, CpuWrite);
|
auto l_v = l.View();
|
||||||
if ( rank == grid->ThisRank() ) {
|
if ( rank == grid->ThisRank() ) {
|
||||||
extract(l_v[odx],buf);
|
extract(l_v[odx],buf);
|
||||||
buf[idx] = s;
|
buf[idx] = s;
|
||||||
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
|||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
ExtractBuffer<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
autoView( l_v , l, CpuWrite);
|
auto l_v = l.View();
|
||||||
extract(l_v[odx],buf);
|
extract(l_v[odx],buf);
|
||||||
|
|
||||||
s = buf[idx];
|
s = buf[idx];
|
||||||
@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
|||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
// Peek a scalar object from the SIMD array
|
// Peek a scalar object from the SIMD array
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
// Must be CPU read view
|
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
||||||
{
|
|
||||||
GridBase *grid = l.getGrid();
|
GridBase *grid = l.Grid();
|
||||||
assert(l.mode==CpuRead);
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
@ -173,7 +173,8 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
idx= grid->iIndex(site);
|
idx= grid->iIndex(site);
|
||||||
odx= grid->oIndex(site);
|
odx= grid->oIndex(site);
|
||||||
|
|
||||||
scalar_type * vp = (scalar_type *)&l[odx];
|
auto l_v = l.View();
|
||||||
|
scalar_type * vp = (scalar_type *)&l_v[odx];
|
||||||
scalar_type * pt = (scalar_type *)&s;
|
scalar_type * pt = (scalar_type *)&s;
|
||||||
|
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
@ -182,27 +183,18 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
template<class vobj,class sobj>
|
|
||||||
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
|
|
||||||
{
|
|
||||||
autoView(lv,l,CpuRead);
|
|
||||||
peekLocalSite(s,lv,site);
|
|
||||||
return;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Must be CPU write view
|
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
|
||||||
{
|
|
||||||
GridBase *grid=l.getGrid();
|
GridBase *grid=l.Grid();
|
||||||
assert(l.mode==CpuWrite);
|
|
||||||
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
@ -210,19 +202,13 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
|||||||
idx= grid->iIndex(site);
|
idx= grid->iIndex(site);
|
||||||
odx= grid->oIndex(site);
|
odx= grid->oIndex(site);
|
||||||
|
|
||||||
scalar_type * vp = (scalar_type *)&l[odx];
|
auto l_v = l.View();
|
||||||
|
scalar_type * vp = (scalar_type *)&l_v[odx];
|
||||||
scalar_type * pt = (scalar_type *)&s;
|
scalar_type * pt = (scalar_type *)&s;
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
vp[idx+w*Nsimd] = pt[w];
|
vp[idx+w*Nsimd] = pt[w];
|
||||||
}
|
}
|
||||||
return;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class vobj,class sobj>
|
|
||||||
inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
|
|
||||||
{
|
|
||||||
autoView(lv,l,CpuWrite);
|
|
||||||
pokeLocalSite(s,lv,site);
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,79 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/lattice/Lattice_reality.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: neo <cossu@post.kek.jp>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_LATTICE_REAL_IMAG_H
|
|
||||||
#define GRID_LATTICE_REAL_IMAG_H
|
|
||||||
|
|
||||||
|
|
||||||
// FIXME .. this is the sector of the code
|
|
||||||
// I am most worried about the directions
|
|
||||||
// The choice of burying complex in the SIMD
|
|
||||||
// is making the use of "real" and "imag" very cumbersome
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
|
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
|
||||||
|
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
|
||||||
|
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
|
||||||
accelerator_for( ss, lhs_v.size(), 1, {
|
|
||||||
ret_v[ss] =real(lhs_v[ss]);
|
|
||||||
});
|
|
||||||
return ret;
|
|
||||||
};
|
|
||||||
template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
|
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
|
||||||
|
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
|
||||||
|
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
|
||||||
accelerator_for( ss, lhs_v.size(), 1, {
|
|
||||||
ret_v[ss] =imag(lhs_v[ss]);
|
|
||||||
});
|
|
||||||
return ret;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
|
||||||
auto real(const Expression &expr) -> decltype(real(closure(expr)))
|
|
||||||
{
|
|
||||||
return real(closure(expr));
|
|
||||||
}
|
|
||||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
|
||||||
auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
|
|
||||||
{
|
|
||||||
return imag(closure(expr));
|
|
||||||
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
||||||
#endif
|
|
@ -40,11 +40,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
|
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
|
||||||
|
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
|
auto lhs_v = lhs.View();
|
||||||
|
auto ret_v = ret.View();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -53,11 +51,9 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
|
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
|
||||||
|
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
|
auto lhs_v = lhs.View();
|
||||||
|
auto ret_v = ret.View();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
|||||||
#include <Grid/Grid_Eigen_Dense.h>
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
|
||||||
|
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#ifdef GRID_NVCC
|
||||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -39,7 +39,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
// const int Nsimd = vobj::Nsimd();
|
const int Nsimd = vobj::Nsimd();
|
||||||
const int nthread = GridThread::GetThreads();
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
Vector<sobj> sumarray(nthread);
|
Vector<sobj> sumarray(nthread);
|
||||||
@ -62,73 +62,24 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
for(int i=0;i<nthread;i++){
|
for(int i=0;i<nthread;i++){
|
||||||
ssum = ssum+sumarray[i];
|
ssum = ssum+sumarray[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return ssum;
|
return ssum;
|
||||||
}
|
}
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_objectD sobj;
|
|
||||||
|
|
||||||
const int nthread = GridThread::GetThreads();
|
|
||||||
|
|
||||||
Vector<sobj> sumarray(nthread);
|
|
||||||
for(int i=0;i<nthread;i++){
|
|
||||||
sumarray[i]=Zero();
|
|
||||||
}
|
|
||||||
|
|
||||||
thread_for(thr,nthread, {
|
|
||||||
int nwork, mywork, myoff;
|
|
||||||
nwork = osites;
|
|
||||||
GridThread::GetWork(nwork,thr,mywork,myoff);
|
|
||||||
vobj vvsum=Zero();
|
|
||||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
|
||||||
vvsum = vvsum + arg[ss];
|
|
||||||
}
|
|
||||||
sumarray[thr]=Reduce(vvsum);
|
|
||||||
});
|
|
||||||
|
|
||||||
sobj ssum=Zero(); // sum across threads
|
|
||||||
for(int i=0;i<nthread;i++){
|
|
||||||
ssum = ssum+sumarray[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef typename vobj::scalar_object ssobj;
|
|
||||||
ssobj ret = ssum;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#ifdef GRID_NVCC
|
||||||
return sum_gpu(arg,osites);
|
return sum_gpu(arg,osites);
|
||||||
#else
|
#else
|
||||||
return sum_cpu(arg,osites);
|
return sum_cpu(arg,osites);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
|
||||||
{
|
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
|
||||||
return sumD_gpu(arg,osites);
|
|
||||||
#else
|
|
||||||
return sumD_cpu(arg,osites);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
auto arg_v = arg.View();
|
||||||
autoView( arg_v, arg, AcceleratorRead);
|
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum_gpu(&arg_v[0],osites);
|
auto ssum= sum(&arg_v[0],osites);
|
||||||
#else
|
|
||||||
autoView(arg_v, arg, CpuRead);
|
|
||||||
Integer osites = arg.Grid()->oSites();
|
|
||||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
|
||||||
#endif
|
|
||||||
arg.Grid()->GlobalSum(ssum);
|
arg.Grid()->GlobalSum(ssum);
|
||||||
return ssum;
|
return ssum;
|
||||||
}
|
}
|
||||||
@ -150,30 +101,43 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
ComplexD nrm;
|
ComplexD nrm;
|
||||||
|
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
|
// Might make all code paths go this way.
|
||||||
|
auto left_v = left.AcceleratorView(ViewRead);
|
||||||
|
auto right_v=right.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
#ifdef GRID_NVCC
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
// GPU - SIMT lane compliance...
|
||||||
|
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
{
|
|
||||||
autoView( left_v , left, AcceleratorRead);
|
|
||||||
autoView( right_v,right, AcceleratorRead);
|
|
||||||
|
|
||||||
// GPU - SIMT lane compliance...
|
accelerator_for( ss, sites, nsimd,{
|
||||||
accelerator_for( ss, sites, 1,{
|
auto x_l = left_v(ss);
|
||||||
auto x_l = left_v[ss];
|
auto y_l = right_v(ss);
|
||||||
auto y_l = right_v[ss];
|
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||||
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
})
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is in single precision and fails some tests
|
// This is in single precision and fails some tests
|
||||||
auto anrm = sum(inner_tmp_v,sites);
|
// Need a sumD that sums in double
|
||||||
nrm = anrm;
|
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
||||||
|
#else
|
||||||
|
// CPU
|
||||||
|
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto x_l = left_v[ss];
|
||||||
|
auto y_l = right_v[ss];
|
||||||
|
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
||||||
|
})
|
||||||
|
nrm = TensorRemove(sum(inner_tmp_v,sites));
|
||||||
|
#endif
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -211,24 +175,40 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
|
|
||||||
GridBase *grid = x.Grid();
|
GridBase *grid = x.Grid();
|
||||||
|
|
||||||
|
auto x_v=x.AcceleratorView(ViewRead);
|
||||||
|
auto y_v=y.AcceleratorView(ViewRead);
|
||||||
|
auto z_v=z.AcceleratorView(ViewWrite);
|
||||||
|
|
||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
// GPU
|
// GPU
|
||||||
autoView( x_v, x, AcceleratorRead);
|
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||||
autoView( y_v, y, AcceleratorRead);
|
|
||||||
autoView( z_v, z, AcceleratorWrite);
|
|
||||||
|
|
||||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
accelerator_for( ss, sites, 1,{
|
accelerator_for( ss, sites, nsimd,{
|
||||||
auto tmp = a*x_v[ss]+b*y_v[ss];
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
|
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||||
|
coalescedWrite(z_v[ss],tmp);
|
||||||
|
});
|
||||||
|
|
||||||
|
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
|
||||||
|
#else
|
||||||
|
// CPU
|
||||||
|
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
||||||
z_v[ss]=tmp;
|
z_v[ss]=tmp;
|
||||||
});
|
});
|
||||||
|
// Already promoted to double
|
||||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
|
#endif
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
@ -244,29 +224,47 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
|||||||
|
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
|
auto left_v=left.AcceleratorView(ViewRead);
|
||||||
|
auto right_v=right.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
// GPU
|
// GPU
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
Vector<norm_t> norm_tmp(sites);
|
Vector<norm_t> norm_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
auto norm_tmp_v = &norm_tmp[0];
|
auto norm_tmp_v = &norm_tmp[0];
|
||||||
{
|
|
||||||
autoView(left_v,left, AcceleratorRead);
|
|
||||||
autoView(right_v,right,AcceleratorRead);
|
|
||||||
accelerator_for( ss, sites, 1,{
|
|
||||||
auto left_tmp = left_v[ss];
|
|
||||||
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
|
|
||||||
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto left_tmp = left_v(ss);
|
||||||
|
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
|
||||||
|
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
|
||||||
|
});
|
||||||
|
|
||||||
|
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
||||||
|
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
|
||||||
|
#else
|
||||||
|
// CPU
|
||||||
|
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
|
||||||
|
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
Vector<norm_t> norm_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
auto norm_tmp_v = &norm_tmp[0];
|
||||||
|
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto left_tmp = left_v(ss);
|
||||||
|
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
|
||||||
|
norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
|
||||||
|
});
|
||||||
|
// Already promoted to double
|
||||||
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
|
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
|
||||||
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
|
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
|
||||||
|
#endif
|
||||||
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
||||||
ip = tmp[0];
|
ip = tmp[0];
|
||||||
nrm = real(tmp[1]);
|
nrm = real(tmp[1]);
|
||||||
@ -337,7 +335,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
|
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
// Parallel over orthog direction
|
// Parallel over orthog direction
|
||||||
autoView( Data_v, Data, CpuRead);
|
auto Data_v=Data.View();
|
||||||
thread_for( r,rd, {
|
thread_for( r,rd, {
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
@ -415,8 +413,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
int e2= grid->_slice_block [orthogdim];
|
int e2= grid->_slice_block [orthogdim];
|
||||||
int stride=grid->_slice_stride[orthogdim];
|
int stride=grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
autoView( lhv, lhs, CpuRead);
|
auto lhv=lhs.View();
|
||||||
autoView( rhv, rhs, CpuRead);
|
auto rhv=rhs.View();
|
||||||
thread_for( r,rd,{
|
thread_for( r,rd,{
|
||||||
|
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
@ -523,12 +521,14 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
|
|
||||||
tensor_reduced at; at=av;
|
tensor_reduced at; at=av;
|
||||||
|
|
||||||
autoView( Rv, R, CpuWrite);
|
auto Rv=R.View();
|
||||||
autoView( Xv, X, CpuRead);
|
auto Xv=X.View();
|
||||||
autoView( Yv, Y, CpuRead);
|
auto Yv=Y.View();
|
||||||
thread_for2d( n, e1, b,e2, {
|
thread_for_collapse(2, n, e1, {
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
Rv[ss] = at*Xv[ss]+Yv[ss];
|
Rv[ss] = at*Xv[ss]+Yv[ss];
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -581,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
autoView( X_v, X, CpuRead);
|
auto X_v=X.View();
|
||||||
autoView( Y_v, Y, CpuRead);
|
auto Y_v=Y.View();
|
||||||
autoView( R_v, R, CpuWrite);
|
auto R_v=R.View();
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
Vector<vobj> s_x(Nblock);
|
Vector<vobj> s_x(Nblock);
|
||||||
@ -628,14 +628,13 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
// int nl=1;
|
// int nl=1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
// thread_for2d_in_region
|
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
int stride=FullGrid->_slice_stride[Orthog];
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
int block =FullGrid->_slice_block [Orthog];
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
autoView( R_v, R, CpuWrite);
|
auto R_v = R.View();
|
||||||
autoView( X_v, X, CpuRead);
|
auto X_v = X.View();
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> s_x(Nblock);
|
std::vector<vobj> s_x(Nblock);
|
||||||
@ -693,8 +692,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
|
|
||||||
autoView( lhs_v, lhs, CpuRead);
|
auto lhs_v=lhs.View();
|
||||||
autoView( rhs_v, rhs, CpuRead);
|
auto rhs_v=rhs.View();
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> Left(Nblock);
|
std::vector<vobj> Left(Nblock);
|
||||||
|
@ -1,14 +1,7 @@
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
extern hipDeviceProp_t *gpu_props;
|
|
||||||
#define WARP_SIZE 64
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
extern cudaDeviceProp *gpu_props;
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
#endif
|
extern cudaDeviceProp *gpu_props;
|
||||||
|
|
||||||
__device__ unsigned int retirementCount = 0;
|
__device__ unsigned int retirementCount = 0;
|
||||||
|
|
||||||
template <class Iterator>
|
template <class Iterator>
|
||||||
@ -26,12 +19,7 @@ template <class Iterator>
|
|||||||
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
||||||
|
|
||||||
int device;
|
int device;
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cudaGetDevice(&device);
|
cudaGetDevice(&device);
|
||||||
#endif
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipGetDevice(&device);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
Iterator warpSize = gpu_props[device].warpSize;
|
Iterator warpSize = gpu_props[device].warpSize;
|
||||||
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
|
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
|
||||||
@ -65,7 +53,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
|
|
||||||
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
||||||
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
||||||
acceleratorSynchronise();
|
__syncwarp();
|
||||||
|
|
||||||
const Iterator VEC = WARP_SIZE;
|
const Iterator VEC = WARP_SIZE;
|
||||||
const Iterator vid = tid & (VEC-1);
|
const Iterator vid = tid & (VEC-1);
|
||||||
@ -79,9 +67,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
beta += temp;
|
beta += temp;
|
||||||
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
__syncwarp();
|
||||||
}
|
}
|
||||||
acceleratorSynchroniseAll();
|
__syncthreads();
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
beta = Zero();
|
beta = Zero();
|
||||||
@ -91,7 +79,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
}
|
}
|
||||||
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
acceleratorSynchroniseAll();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -159,7 +147,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
|||||||
sobj *smem = (sobj *)shmem_pointer;
|
sobj *smem = (sobj *)shmem_pointer;
|
||||||
|
|
||||||
// wait until all outstanding memory instructions in this thread are finished
|
// wait until all outstanding memory instructions in this thread are finished
|
||||||
acceleratorFence();
|
__threadfence();
|
||||||
|
|
||||||
if (tid==0) {
|
if (tid==0) {
|
||||||
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
|
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
|
||||||
@ -168,8 +156,8 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// each thread must read the correct value of amLast
|
// each thread must read the correct value of amLast
|
||||||
acceleratorSynchroniseAll();
|
__syncthreads();
|
||||||
|
|
||||||
if (amLast) {
|
if (amLast) {
|
||||||
// reduce buffer[0], ..., buffer[gridDim.x-1]
|
// reduce buffer[0], ..., buffer[gridDim.x-1]
|
||||||
Iterator i = tid;
|
Iterator i = tid;
|
||||||
@ -211,7 +199,13 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|||||||
sobj *buffer_v = &buffer[0];
|
sobj *buffer_v = &buffer[0];
|
||||||
|
|
||||||
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
|
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
|
||||||
accelerator_barrier();
|
cudaDeviceSynchronize();
|
||||||
|
|
||||||
|
cudaError err = cudaGetLastError();
|
||||||
|
if ( cudaSuccess != err ) {
|
||||||
|
printf("Cuda error %s\n",cudaGetErrorString( err ));
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
auto result = buffer_v[0];
|
auto result = buffer_v[0];
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -375,7 +375,7 @@ public:
|
|||||||
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
||||||
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
||||||
|
|
||||||
autoView(l_v, l, CpuWrite);
|
auto l_v = l.View();
|
||||||
thread_for( ss, osites, {
|
thread_for( ss, osites, {
|
||||||
ExtractBuffer<scalar_object> buf(Nsimd);
|
ExtractBuffer<scalar_object> buf(Nsimd);
|
||||||
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
||||||
@ -461,8 +461,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
// Obtain one reseeded generator per thread
|
// Obtain one reseeded generator per thread
|
||||||
int Nthread = 32; // Hardwire a good level or parallelism
|
int Nthread = GridThread::GetThreads();
|
||||||
std::vector<RngEngine> seeders(Nthread);
|
std::vector<RngEngine> seeders(Nthread);
|
||||||
for(int t=0;t<Nthread;t++){
|
for(int t=0;t<Nthread;t++){
|
||||||
seeders[t] = Reseed(master_engine);
|
seeders[t] = Reseed(master_engine);
|
||||||
|
@ -42,8 +42,8 @@ template<class vobj>
|
|||||||
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
||||||
autoView(ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
autoView(lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -58,8 +58,8 @@ template<int Index,class vobj>
|
|||||||
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||||
autoView( ret_v , ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -47,12 +47,11 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// remove and insert a half checkerboard
|
// remove and insert a half checkerboard
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
||||||
{
|
|
||||||
half.Checkerboard() = cb;
|
half.Checkerboard() = cb;
|
||||||
|
|
||||||
autoView( half_v, half, CpuWrite);
|
auto half_v = half.View();
|
||||||
autoView( full_v, full, CpuRead);
|
auto full_v = full.View();
|
||||||
thread_for(ss, full.Grid()->oSites(),{
|
thread_for(ss, full.Grid()->oSites(),{
|
||||||
int cbos;
|
int cbos;
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -65,11 +64,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
|
||||||
{
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
autoView( half_v , half, CpuRead);
|
auto half_v = half.View();
|
||||||
autoView( full_v , full, CpuWrite);
|
auto full_v = full.View();
|
||||||
thread_for(ss,full.Grid()->oSites(),{
|
thread_for(ss,full.Grid()->oSites(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -97,15 +96,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
|
|||||||
out = in;
|
out = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef __CUDA_ARCH__
|
||||||
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
||||||
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
|
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
|
||||||
}
|
}
|
||||||
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
|
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
|
||||||
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
|
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
|
||||||
}
|
}
|
||||||
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
|
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
|
||||||
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
|
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -152,11 +151,12 @@ accelerator_inline void convertType(T & out, const T & in) {
|
|||||||
|
|
||||||
template<typename T1,typename T2>
|
template<typename T1,typename T2>
|
||||||
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
||||||
autoView( out_v , out,AcceleratorWrite);
|
auto out_v = out.AcceleratorView(ViewWrite);
|
||||||
autoView( in_v , in ,AcceleratorRead);
|
auto in_v = in.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
|
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
|
||||||
convertType(out_v[ss],in_v(ss));
|
convertType(out_v[ss],in_v(ss));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -164,20 +164,19 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
|
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
|
||||||
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
|
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
|
||||||
{
|
{
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
|
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
|
||||||
Lattice<iScalar<t_inner>> ret(lhs.Grid());
|
Lattice<iScalar<t_inner>> ret(lhs.Grid());
|
||||||
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
|
|
||||||
{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
autoView(ret_v, ret,AcceleratorWrite);
|
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
|
||||||
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
|
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
}
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,13 +194,14 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
Lattice<iScalar<CComplex>> ip(coarse);
|
Lattice<iScalar<CComplex>> ip(coarse);
|
||||||
Lattice<vobj> fineDataRed = fineData;
|
Lattice<vobj> fineDataRed = fineData;
|
||||||
|
|
||||||
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
// auto fineData_ = fineData.View();
|
||||||
autoView( ip_ , ip, AcceleratorWrite);
|
auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
|
||||||
|
auto ip_ = ip.AcceleratorView(ViewReadWrite);
|
||||||
for(int v=0;v<nbasis;v++) {
|
for(int v=0;v<nbasis;v++) {
|
||||||
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
|
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
|
||||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||||
convertType(coarseData_[sc](v),ip_[sc]);
|
convertType(coarseData_[sc](v),ip_[sc]);
|
||||||
});
|
});
|
||||||
|
|
||||||
// improve numerical stability of projection
|
// improve numerical stability of projection
|
||||||
// |fine> = |fine> - <basis|fine> |basis>
|
// |fine> = |fine> - <basis|fine> |basis>
|
||||||
@ -210,6 +210,68 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj,class CComplex,int nbasis>
|
||||||
|
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
|
const Lattice<vobj> &fineData,
|
||||||
|
const std::vector<Lattice<vobj> > &Basis)
|
||||||
|
{
|
||||||
|
typedef iVector<CComplex,nbasis > coarseSiteData;
|
||||||
|
coarseSiteData elide;
|
||||||
|
typedef decltype(coalescedRead(elide)) ScalarComplex;
|
||||||
|
GridBase * fine = fineData.Grid();
|
||||||
|
GridBase * coarse= coarseData.Grid();
|
||||||
|
int _ndimension = coarse->_ndimension;
|
||||||
|
|
||||||
|
// checks
|
||||||
|
assert( nbasis == Basis.size() );
|
||||||
|
subdivides(coarse,fine);
|
||||||
|
for(int i=0;i<nbasis;i++){
|
||||||
|
conformable(Basis[i],fineData);
|
||||||
|
}
|
||||||
|
|
||||||
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
|
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
|
||||||
|
}
|
||||||
|
int blockVol = fine->oSites()/coarse->oSites();
|
||||||
|
|
||||||
|
coarseData=Zero();
|
||||||
|
|
||||||
|
auto fineData_ = fineData.View();
|
||||||
|
auto coarseData_ = coarseData.View();
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
|
||||||
|
// Otherwise do fine inner product per site, and make the update atomic
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
|
||||||
|
|
||||||
|
auto sc=sci/nbasis;
|
||||||
|
auto i=sci%nbasis;
|
||||||
|
auto Basis_ = Basis[i].View();
|
||||||
|
|
||||||
|
Coordinate coor_c(_ndimension);
|
||||||
|
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
||||||
|
|
||||||
|
int sf;
|
||||||
|
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
|
||||||
|
|
||||||
|
for(int sb=0;sb<blockVol;sb++){
|
||||||
|
|
||||||
|
Coordinate coor_b(_ndimension);
|
||||||
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
|
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
|
||||||
|
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
|
||||||
|
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
||||||
|
|
||||||
|
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
|
||||||
|
}
|
||||||
|
coalescedWrite(coarseData_[sc](i),reduce);
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
template<class vobj,class vobj2,class CComplex>
|
template<class vobj,class vobj2,class CComplex>
|
||||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||||
@ -236,12 +298,10 @@ template<class vobj,class vobj2,class CComplex>
|
|||||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||||
}
|
}
|
||||||
|
|
||||||
autoView( fineZ_ , fineZ, AcceleratorWrite);
|
auto fineZ_ = fineZ.AcceleratorView(ViewWrite);
|
||||||
autoView( fineX_ , fineX, AcceleratorRead);
|
auto fineX_ = fineX.AcceleratorView(ViewRead);
|
||||||
autoView( fineY_ , fineY, AcceleratorRead);
|
auto fineY_ = fineY.AcceleratorView(ViewRead);
|
||||||
autoView( coarseA_, coarseA, AcceleratorRead);
|
auto coarseA_= coarseA.AcceleratorView(ViewRead);
|
||||||
Coordinate fine_rdimensions = fine->_rdimensions;
|
|
||||||
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
|
||||||
|
|
||||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||||
|
|
||||||
@ -249,12 +309,12 @@ template<class vobj,class vobj2,class CComplex>
|
|||||||
Coordinate coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
Coordinate coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
|
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
// z = A x + y
|
// z = A x + y
|
||||||
#ifdef GRID_SIMT
|
#ifdef __CUDA_ARCH__
|
||||||
typename vobj2::tensor_reduced::scalar_object cA;
|
typename vobj2::tensor_reduced::scalar_object cA;
|
||||||
typename vobj::scalar_object cAx;
|
typename vobj::scalar_object cAx;
|
||||||
#else
|
#else
|
||||||
@ -284,16 +344,15 @@ template<class vobj,class CComplex>
|
|||||||
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
||||||
Lattice<dotp> coarse_inner(coarse);
|
Lattice<dotp> coarse_inner(coarse);
|
||||||
|
|
||||||
|
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
|
||||||
|
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
|
||||||
|
|
||||||
// Precision promotion
|
// Precision promotion
|
||||||
fine_inner = localInnerProductD<vobj>(fineX,fineY);
|
fine_inner = localInnerProductD(fineX,fineY);
|
||||||
blockSum(coarse_inner,fine_inner);
|
blockSum(coarse_inner,fine_inner);
|
||||||
{
|
accelerator_for(ss, coarse->oSites(), 1, {
|
||||||
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
|
|
||||||
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
|
|
||||||
accelerator_for(ss, coarse->oSites(), 1, {
|
|
||||||
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
|
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
|
||||||
});
|
});
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -311,15 +370,14 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
Lattice<dotp> coarse_inner(coarse);
|
Lattice<dotp> coarse_inner(coarse);
|
||||||
|
|
||||||
// Precision promotion?
|
// Precision promotion?
|
||||||
|
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
|
||||||
|
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
|
||||||
|
|
||||||
fine_inner = localInnerProduct(fineX,fineY);
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
blockSum(coarse_inner,fine_inner);
|
blockSum(coarse_inner,fine_inner);
|
||||||
{
|
accelerator_for(ss, coarse->oSites(), 1, {
|
||||||
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
|
CoarseInner_[ss] = coarse_inner_[ss];
|
||||||
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
|
});
|
||||||
accelerator_for(ss, coarse->oSites(), 1, {
|
|
||||||
CoarseInner_[ss] = coarse_inner_[ss];
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
@ -350,19 +408,14 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
}
|
}
|
||||||
int blockVol = fine->oSites()/coarse->oSites();
|
int blockVol = fine->oSites()/coarse->oSites();
|
||||||
|
|
||||||
// Turn this around to loop threaded over sc and interior loop
|
auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
|
||||||
// over sf would thread better
|
auto fineData_ = fineData.AcceleratorView(ViewRead);
|
||||||
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
|
||||||
autoView( fineData_ , fineData, AcceleratorRead);
|
|
||||||
|
|
||||||
Coordinate fine_rdimensions = fine->_rdimensions;
|
|
||||||
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
|
||||||
|
|
||||||
accelerator_for(sc,coarse->oSites(),1,{
|
accelerator_for(sc,coarse->oSites(),1,{
|
||||||
|
|
||||||
// One thread per sub block
|
// One thread per sub block
|
||||||
Coordinate coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
|
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
||||||
coarseData_[sc]=Zero();
|
coarseData_[sc]=Zero();
|
||||||
|
|
||||||
for(int sb=0;sb<blockVol;sb++){
|
for(int sb=0;sb<blockVol;sb++){
|
||||||
@ -372,7 +425,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
Coordinate coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
||||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
||||||
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
|
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
||||||
|
|
||||||
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
||||||
}
|
}
|
||||||
@ -457,8 +510,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
}
|
}
|
||||||
autoView( fineData_ , fineData, AcceleratorWrite);
|
auto fineData_ = fineData.View();
|
||||||
autoView( coarseData_ , coarseData, AcceleratorRead);
|
auto coarseData_ = coarseData.View();
|
||||||
|
|
||||||
// Loop with a cache friendly loop ordering
|
// Loop with a cache friendly loop ordering
|
||||||
accelerator_for(sf,fine->oSites(),1,{
|
accelerator_for(sf,fine->oSites(),1,{
|
||||||
@ -471,7 +524,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
/* auto basis_ = Basis[i], );*/
|
auto basis_ = Basis[i].View();
|
||||||
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
||||||
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
||||||
}
|
}
|
||||||
@ -490,14 +543,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
fineData=Zero();
|
fineData=Zero();
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
||||||
|
auto ip_ = ip.AcceleratorView(ViewRead);
|
||||||
//Lattice<CComplex> cip(coarse);
|
|
||||||
//autoView( cip_ , cip, AcceleratorWrite);
|
|
||||||
//autoView( ip_ , ip, AcceleratorRead);
|
|
||||||
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
|
||||||
// coalescedWrite(cip_[sc], ip_(sc)());
|
|
||||||
// });
|
|
||||||
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
|
|
||||||
blockZAXPY(fineData,ip,Basis[i],fineData);
|
blockZAXPY(fineData,ip,Basis[i],fineData);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -525,17 +571,15 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
|
|||||||
assert(ig->lSites() == og->lSites());
|
assert(ig->lSites() == og->lSites());
|
||||||
}
|
}
|
||||||
|
|
||||||
autoView(in_v,in,CpuRead);
|
|
||||||
autoView(out_v,out,CpuWrite);
|
|
||||||
thread_for(idx, ig->lSites(),{
|
thread_for(idx, ig->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
ssobj ss;
|
ssobj ss;
|
||||||
|
|
||||||
Coordinate lcoor(ni);
|
Coordinate lcoor(ni);
|
||||||
ig->LocalIndexToLocalCoor(idx,lcoor);
|
ig->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
peekLocalSite(s,in_v,lcoor);
|
peekLocalSite(s,in,lcoor);
|
||||||
ss=s;
|
ss=s;
|
||||||
pokeLocalSite(ss,out_v,lcoor);
|
pokeLocalSite(ss,out,lcoor);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -570,9 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
Coordinate rdt = Tg->_rdimensions;
|
Coordinate rdt = Tg->_rdimensions;
|
||||||
Coordinate ist = Tg->_istride;
|
Coordinate ist = Tg->_istride;
|
||||||
Coordinate ost = Tg->_ostride;
|
Coordinate ost = Tg->_ostride;
|
||||||
|
auto t_v = To.AcceleratorView(ViewWrite);
|
||||||
autoView( t_v , To, AcceleratorWrite);
|
auto f_v = From.AcceleratorView(ViewRead);
|
||||||
autoView( f_v , From, AcceleratorRead);
|
|
||||||
accelerator_for(idx,Fg->lSites(),1,{
|
accelerator_for(idx,Fg->lSites(),1,{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate Fcoor(nd);
|
Coordinate Fcoor(nd);
|
||||||
@ -595,6 +638,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
|
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
|
||||||
}
|
}
|
||||||
|
// peekLocalSite(s,From,Fcoor);
|
||||||
|
// pokeLocalSite(s,To ,Tcoor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -625,8 +670,6 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
autoView(lowDimv,lowDim,CpuRead);
|
|
||||||
autoView(higherDimv,higherDim,CpuWrite);
|
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -639,8 +682,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
hcoor[d]=lcoor[ddl++];
|
hcoor[d]=lcoor[ddl++];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
peekLocalSite(s,lowDimv,lcoor);
|
peekLocalSite(s,lowDim,lcoor);
|
||||||
pokeLocalSite(s,higherDimv,hcoor);
|
pokeLocalSite(s,higherDim,hcoor);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -668,8 +711,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
autoView(lowDimv,lowDim,CpuWrite);
|
|
||||||
autoView(higherDimv,higherDim,CpuRead);
|
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -682,8 +723,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
hcoor[d]=lcoor[ddl++];
|
hcoor[d]=lcoor[ddl++];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
peekLocalSite(s,higherDimv,hcoor);
|
peekLocalSite(s,higherDim,hcoor);
|
||||||
pokeLocalSite(s,lowDimv,lcoor);
|
pokeLocalSite(s,lowDim,lcoor);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -711,8 +752,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
autoView(lowDimv,lowDim,CpuRead);
|
|
||||||
autoView(higherDimv,higherDim,CpuWrite);
|
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -721,8 +760,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
if( lcoor[orthog] == slice_lo ) {
|
if( lcoor[orthog] == slice_lo ) {
|
||||||
hcoor=lcoor;
|
hcoor=lcoor;
|
||||||
hcoor[orthog] = slice_hi;
|
hcoor[orthog] = slice_hi;
|
||||||
peekLocalSite(s,lowDimv,lcoor);
|
peekLocalSite(s,lowDim,lcoor);
|
||||||
pokeLocalSite(s,higherDimv,hcoor);
|
pokeLocalSite(s,higherDim,hcoor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -750,8 +789,6 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
autoView(lowDimv,lowDim,CpuWrite);
|
|
||||||
autoView(higherDimv,higherDim,CpuRead);
|
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -760,8 +797,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
|
|||||||
if( lcoor[orthog] == slice_lo ) {
|
if( lcoor[orthog] == slice_lo ) {
|
||||||
hcoor=lcoor;
|
hcoor=lcoor;
|
||||||
hcoor[orthog] = slice_hi;
|
hcoor[orthog] = slice_hi;
|
||||||
peekLocalSite(s,higherDimv,hcoor);
|
peekLocalSite(s,higherDim,hcoor);
|
||||||
pokeLocalSite(s,lowDimv,lcoor);
|
pokeLocalSite(s,lowDim,lcoor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -825,7 +862,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
}
|
}
|
||||||
|
|
||||||
//loop over outer index
|
//loop over outer index
|
||||||
autoView( in_v , in, CpuRead);
|
auto in_v = in.View();
|
||||||
thread_for(in_oidx,in_grid->oSites(),{
|
thread_for(in_oidx,in_grid->oSites(),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
||||||
@ -918,7 +955,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
icoor[lane].resize(ndim);
|
icoor[lane].resize(ndim);
|
||||||
grid->iCoorFromIindex(icoor[lane],lane);
|
grid->iCoorFromIindex(icoor[lane],lane);
|
||||||
}
|
}
|
||||||
autoView( out_v , out, CpuWrite);
|
auto out_v = out.View();
|
||||||
thread_for(oidx, grid->oSites(),{
|
thread_for(oidx, grid->oSites(),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
ExtractPointerArray<sobj> ptrs(nsimd);
|
ExtractPointerArray<sobj> ptrs(nsimd);
|
||||||
@ -1021,7 +1058,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
|||||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
||||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||||
|
|
||||||
autoView( out_v , out, CpuWrite);
|
auto out_v = out.View();
|
||||||
thread_for(out_oidx,out_grid->oSites(),{
|
thread_for(out_oidx,out_grid->oSites(),{
|
||||||
Coordinate out_ocoor(ndim);
|
Coordinate out_ocoor(ndim);
|
||||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||||
|
@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -58,8 +58,8 @@ template<int Index,class vobj>
|
|||||||
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||||
autoView( ret_v, ret, AcceleratorWrite);
|
auto ret_v = ret.View();
|
||||||
autoView( lhs_v, lhs, AcceleratorRead);
|
auto lhs_v = lhs.View();
|
||||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
autoView( rhs, rhs_i, AcceleratorRead);
|
auto rhs = rhs_i.View();
|
||||||
autoView( ret, ret_i, AcceleratorWrite);
|
auto ret = ret_i.View();
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),1,{
|
accelerator_for(ss,rhs.size(),1,{
|
||||||
ret[ss]=pow(rhs[ss],y);
|
ret[ss]=pow(rhs[ss],y);
|
||||||
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
|||||||
}
|
}
|
||||||
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
autoView( rhs , rhs_i, AcceleratorRead);
|
auto rhs = rhs_i.View();
|
||||||
autoView( ret , ret_i, AcceleratorWrite);
|
auto ret = ret_i.View();
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
||||||
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
autoView( ret , ret_i, AcceleratorWrite);
|
auto ret = ret_i.View();
|
||||||
autoView( rhs , rhs_i, AcceleratorRead);
|
auto rhs = rhs_i.View();
|
||||||
ret.Checkerboard() = rhs_i.Checkerboard();
|
ret.Checkerboard() = rhs_i.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],div(rhs(ss),y));
|
coalescedWrite(ret[ss],div(rhs(ss),y));
|
||||||
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
autoView( rhs , rhs_i, AcceleratorRead);
|
auto rhs = rhs_i.View();
|
||||||
autoView( ret , ret_i, AcceleratorWrite);
|
auto ret = ret_i.View();
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
||||||
|
@ -1,168 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
///////////////////////////////////////////////////////////////////
|
|
||||||
// Base class which can be used by traits to pick up behaviour
|
|
||||||
///////////////////////////////////////////////////////////////////
|
|
||||||
class LatticeBase {};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Conformable checks; same instance of Grid required
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
|
||||||
{
|
|
||||||
assert(lhs == rhs);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Minimal base class containing only data valid to access from accelerator
|
|
||||||
// _odata will be a managed pointer in CUDA
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Force access to lattice through a view object.
|
|
||||||
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
|
|
||||||
// strict since host could could in principle direct access through the lattice object
|
|
||||||
// Need to decide programming model.
|
|
||||||
#define LATTICE_VIEW_STRICT
|
|
||||||
template<class vobj> class LatticeAccelerator : public LatticeBase
|
|
||||||
{
|
|
||||||
protected:
|
|
||||||
//public:
|
|
||||||
GridBase *_grid;
|
|
||||||
int checkerboard;
|
|
||||||
vobj *_odata; // A managed pointer
|
|
||||||
uint64_t _odata_size;
|
|
||||||
ViewAdvise advise;
|
|
||||||
public:
|
|
||||||
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
|
|
||||||
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
|
|
||||||
accelerator_inline int Checkerboard(void) const { return checkerboard; };
|
|
||||||
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
|
|
||||||
accelerator_inline ViewAdvise Advise(void) const { return advise; };
|
|
||||||
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
|
|
||||||
accelerator_inline void Conformable(GridBase * &grid) const
|
|
||||||
{
|
|
||||||
if (grid) conformable(grid, _grid);
|
|
||||||
else grid = _grid;
|
|
||||||
};
|
|
||||||
// Host only
|
|
||||||
GridBase * getGrid(void) const { return _grid; };
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// A View class which provides accessor to the data.
|
|
||||||
// This will be safe to call from accelerator_for and is trivially copy constructible
|
|
||||||
// The copy constructor for this will need to be used by device lambda functions
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<class vobj>
|
|
||||||
class LatticeView : public LatticeAccelerator<vobj>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
// Rvalue
|
|
||||||
ViewMode mode;
|
|
||||||
void * cpu_ptr;
|
|
||||||
#ifdef GRID_SIMT
|
|
||||||
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
|
|
||||||
return coalescedRead(this->_odata[i]);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
|
||||||
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
|
||||||
|
|
||||||
accelerator_inline uint64_t begin(void) const { return 0;};
|
|
||||||
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
|
||||||
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
|
||||||
|
|
||||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
|
|
||||||
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
|
|
||||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
|
|
||||||
{
|
|
||||||
this->ViewOpen(mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Host functions
|
|
||||||
void ViewOpen(ViewMode mode)
|
|
||||||
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
|
|
||||||
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
|
|
||||||
this->cpu_ptr = (void *)this->_odata;
|
|
||||||
this->mode = mode;
|
|
||||||
this->_odata =(vobj *)
|
|
||||||
MemoryManager::ViewOpen(this->cpu_ptr,
|
|
||||||
this->_odata_size*sizeof(vobj),
|
|
||||||
mode,
|
|
||||||
this->advise);
|
|
||||||
}
|
|
||||||
void ViewClose(void)
|
|
||||||
{ // Inform the manager
|
|
||||||
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
|
|
||||||
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
// Little autoscope assister
|
|
||||||
template<class View>
|
|
||||||
class ViewCloser
|
|
||||||
{
|
|
||||||
View v; // Take a copy of view and call view close when I go out of scope automatically
|
|
||||||
public:
|
|
||||||
ViewCloser(View &_v) : v(_v) {};
|
|
||||||
~ViewCloser() { v.ViewClose(); }
|
|
||||||
};
|
|
||||||
|
|
||||||
#define autoView(l_v,l,mode) \
|
|
||||||
auto l_v = l.View(mode); \
|
|
||||||
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Lattice expression types used by ET to assemble the AST
|
|
||||||
//
|
|
||||||
// Need to be able to detect code paths according to the whether a lattice object or not
|
|
||||||
// so introduce some trait type things
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
class LatticeExpressionBase {};
|
|
||||||
|
|
||||||
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
|
||||||
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
|
||||||
|
|
||||||
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
|
||||||
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
|
||||||
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
|
||||||
|
|
||||||
template <typename Op, typename _T1>
|
|
||||||
class LatticeUnaryExpression : public LatticeExpressionBase
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename ViewMap<_T1>::Type T1;
|
|
||||||
Op op;
|
|
||||||
T1 arg1;
|
|
||||||
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename Op, typename _T1, typename _T2>
|
|
||||||
class LatticeBinaryExpression : public LatticeExpressionBase
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename ViewMap<_T1>::Type T1;
|
|
||||||
typedef typename ViewMap<_T2>::Type T2;
|
|
||||||
Op op;
|
|
||||||
T1 arg1;
|
|
||||||
T2 arg2;
|
|
||||||
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename Op, typename _T1, typename _T2, typename _T3>
|
|
||||||
class LatticeTrinaryExpression : public LatticeExpressionBase
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename ViewMap<_T1>::Type T1;
|
|
||||||
typedef typename ViewMap<_T2>::Type T2;
|
|
||||||
typedef typename ViewMap<_T3>::Type T3;
|
|
||||||
Op op;
|
|
||||||
T1 arg1;
|
|
||||||
T2 arg2;
|
|
||||||
T3 arg3;
|
|
||||||
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
|
|
||||||
};
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -130,8 +130,6 @@ public:
|
|||||||
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
||||||
|
|
||||||
if ( log.active ) {
|
if ( log.active ) {
|
||||||
std::ios_base::fmtflags f(stream.flags());
|
|
||||||
|
|
||||||
stream << log.background()<< std::left;
|
stream << log.background()<< std::left;
|
||||||
if (log.topWidth > 0)
|
if (log.topWidth > 0)
|
||||||
{
|
{
|
||||||
@ -154,8 +152,6 @@ public:
|
|||||||
<< now << log.background() << " : " ;
|
<< now << log.background() << " : " ;
|
||||||
}
|
}
|
||||||
stream << log.colour();
|
stream << log.colour();
|
||||||
stream.flags(f);
|
|
||||||
|
|
||||||
return stream;
|
return stream;
|
||||||
} else {
|
} else {
|
||||||
return devnull;
|
return devnull;
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
||||||
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
|
|
||||||
|
@ -79,13 +79,6 @@ inline void removeWhitespace(std::string &key)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class BinaryIO {
|
class BinaryIO {
|
||||||
public:
|
public:
|
||||||
struct IoPerf
|
|
||||||
{
|
|
||||||
uint64_t size{0},time{0};
|
|
||||||
double mbytesPerSecond{0.};
|
|
||||||
};
|
|
||||||
|
|
||||||
static IoPerf lastPerf;
|
|
||||||
static int latticeWriteMaxRetry;
|
static int latticeWriteMaxRetry;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
@ -509,15 +502,12 @@ class BinaryIO {
|
|||||||
timer.Stop();
|
timer.Stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
|
|
||||||
lastPerf.time = timer.useconds();
|
|
||||||
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
|
|
||||||
std::cout<<GridLogMessage<<"IOobject: ";
|
std::cout<<GridLogMessage<<"IOobject: ";
|
||||||
if ( control & BINARYIO_READ) std::cout << " read ";
|
if ( control & BINARYIO_READ) std::cout << " read ";
|
||||||
else std::cout << " write ";
|
else std::cout << " write ";
|
||||||
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
|
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
|
||||||
std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
|
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||||
<< lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
|
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
|
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
@ -673,15 +663,10 @@ class BinaryIO {
|
|||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
timer.Start();
|
timer.Start();
|
||||||
thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
|
thread_for(lidx,lsites,{
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
|
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
|
||||||
Coordinate lcoor;
|
parallel_rng.SetState(tmp,lidx);
|
||||||
grid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
int o_idx=grid->oIndex(lcoor);
|
|
||||||
int i_idx=grid->iIndex(lcoor);
|
|
||||||
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
|
|
||||||
parallel_rng.SetState(tmp,gidx);
|
|
||||||
});
|
});
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
|
||||||
@ -738,12 +723,7 @@ class BinaryIO {
|
|||||||
std::vector<RNGstate> iodata(lsites);
|
std::vector<RNGstate> iodata(lsites);
|
||||||
thread_for(lidx,lsites,{
|
thread_for(lidx,lsites,{
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
Coordinate lcoor;
|
parallel_rng.GetState(tmp,lidx);
|
||||||
grid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
int o_idx=grid->oIndex(lcoor);
|
|
||||||
int i_idx=grid->iIndex(lcoor);
|
|
||||||
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
|
|
||||||
parallel_rng.GetState(tmp,gidx);
|
|
||||||
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
|
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
|
||||||
});
|
});
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef __x86_64__
|
#ifdef __x86_64__
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_NVCC
|
||||||
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
||||||
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
||||||
#else
|
#else
|
||||||
@ -112,6 +112,7 @@ class PerformanceCounter {
|
|||||||
private:
|
private:
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
public:
|
||||||
uint32_t type;
|
uint32_t type;
|
||||||
uint64_t config;
|
uint64_t config;
|
||||||
const char *name;
|
const char *name;
|
||||||
|
@ -12773,7 +12773,7 @@ namespace pugi
|
|||||||
#undef PUGI__THROW_ERROR
|
#undef PUGI__THROW_ERROR
|
||||||
#undef PUGI__CHECK_ERROR
|
#undef PUGI__CHECK_ERROR
|
||||||
|
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_NVCC
|
||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ static constexpr int Ym = 5;
|
|||||||
static constexpr int Zm = 6;
|
static constexpr int Zm = 6;
|
||||||
static constexpr int Tm = 7;
|
static constexpr int Tm = 7;
|
||||||
|
|
||||||
static constexpr int Nc=Config_Nc;
|
static constexpr int Nc=3;
|
||||||
static constexpr int Ns=4;
|
static constexpr int Ns=4;
|
||||||
static constexpr int Nd=4;
|
static constexpr int Nd=4;
|
||||||
static constexpr int Nhs=2; // half spinor
|
static constexpr int Nhs=2; // half spinor
|
||||||
|
@ -114,22 +114,19 @@ public:
|
|||||||
U = adj(Cshift(U, mu, -1));
|
U = adj(Cshift(U, mu, -1));
|
||||||
PokeIndex<LorentzIndex>(Uadj, U, mu);
|
PokeIndex<LorentzIndex>(Uadj, U, mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
autoView(Umu_v,Umu,CpuRead);
|
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||||
autoView(Uadj_v,Uadj,CpuRead);
|
|
||||||
autoView(Uds_v,Uds,CpuWrite);
|
|
||||||
thread_for( lidx, GaugeGrid->lSites(), {
|
|
||||||
Coordinate lcoor;
|
Coordinate lcoor;
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
|
||||||
peekLocalSite(ScalarUmu, Umu_v, lcoor);
|
peekLocalSite(ScalarUmu, Umu, lcoor);
|
||||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||||
|
|
||||||
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
|
peekLocalSite(ScalarUmu, Uadj, lcoor);
|
||||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
||||||
|
|
||||||
pokeLocalSite(ScalarUds, Uds_v, lcoor);
|
pokeLocalSite(ScalarUds, Uds, lcoor);
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
||||||
|
@ -57,7 +57,6 @@ NAMESPACE_CHECK(WilsonClover);
|
|||||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||||
NAMESPACE_CHECK(Wilson5D);
|
NAMESPACE_CHECK(Wilson5D);
|
||||||
|
|
||||||
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
|
|
||||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
|
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||||
NAMESPACE_CHECK(Staggered);
|
NAMESPACE_CHECK(Staggered);
|
||||||
@ -283,15 +282,11 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
|
|||||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
||||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
||||||
|
|
||||||
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
|
|
||||||
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
|
|
||||||
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
|
|
||||||
|
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
||||||
|
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
||||||
|
@ -96,11 +96,11 @@ public:
|
|||||||
int sl = St._simd_layout[direction];
|
int sl = St._simd_layout[direction];
|
||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef __CUDA_ARCH__
|
||||||
_Spinor tmp;
|
_Spinor tmp;
|
||||||
|
|
||||||
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
||||||
int s = acceleratorSIMTlane(Nsimd);
|
int s = SIMTlane(Nsimd);
|
||||||
St.iCoorFromIindex(icoor,s);
|
St.iCoorFromIindex(icoor,s);
|
||||||
|
|
||||||
int mmu = mu % Nd;
|
int mmu = mu % Nd;
|
||||||
@ -232,17 +232,15 @@ public:
|
|||||||
if ( Params.twists[mu] ) {
|
if ( Params.twists[mu] ) {
|
||||||
Uconj = where(coor==neglink,-Uconj,Uconj);
|
Uconj = where(coor==neglink,-Uconj,Uconj);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
auto U_v = U.View();
|
||||||
autoView( U_v , U, CpuRead);
|
auto Uds_v = Uds.View();
|
||||||
autoView( Uconj_v , Uconj, CpuRead);
|
auto Uconj_v = Uconj.View();
|
||||||
autoView( Uds_v , Uds, CpuWrite);
|
auto Utmp_v= Utmp.View();
|
||||||
autoView( Utmp_v, Utmp, CpuWrite);
|
thread_foreach(ss,U_v,{
|
||||||
thread_foreach(ss,U_v,{
|
Uds_v[ss](0)(mu) = U_v[ss]();
|
||||||
Uds_v[ss](0)(mu) = U_v[ss]();
|
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
||||||
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
});
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
||||||
Uconj = adj(Cshift(Uconj,mu,-1));
|
Uconj = adj(Cshift(Uconj,mu,-1));
|
||||||
@ -252,25 +250,19 @@ public:
|
|||||||
Utmp = where(coor==0,Uconj,Utmp);
|
Utmp = where(coor==0,Uconj,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
thread_foreach(ss,Utmp_v,{
|
||||||
autoView( Uds_v , Uds, CpuWrite);
|
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
||||||
autoView( Utmp_v, Utmp, CpuWrite);
|
});
|
||||||
thread_foreach(ss,Utmp_v,{
|
|
||||||
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Utmp = Uconj;
|
Utmp = Uconj;
|
||||||
if ( Params.twists[mu] ) {
|
if ( Params.twists[mu] ) {
|
||||||
Utmp = where(coor==0,U,Utmp);
|
Utmp = where(coor==0,U,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
thread_foreach(ss,Utmp_v,{
|
||||||
autoView( Uds_v , Uds, CpuWrite);
|
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
||||||
autoView( Utmp_v, Utmp, CpuWrite);
|
});
|
||||||
thread_foreach(ss,Utmp_v,{
|
|
||||||
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -280,14 +272,11 @@ public:
|
|||||||
GaugeLinkField link(mat.Grid());
|
GaugeLinkField link(mat.Grid());
|
||||||
// use lorentz for flavour as hack.
|
// use lorentz for flavour as hack.
|
||||||
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
||||||
|
auto link_v = link.View();
|
||||||
{
|
auto tmp_v = tmp.View();
|
||||||
autoView( link_v , link, CpuWrite);
|
thread_foreach(ss,tmp_v,{
|
||||||
autoView( tmp_v , tmp, CpuRead);
|
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
||||||
thread_foreach(ss,tmp_v,{
|
});
|
||||||
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
PokeIndex<LorentzIndex>(mat, link, mu);
|
PokeIndex<LorentzIndex>(mat, link, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -317,18 +306,16 @@ public:
|
|||||||
|
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
{
|
auto tmp_v = tmp.View();
|
||||||
autoView( tmp_v , tmp, CpuWrite);
|
auto Atilde_v = Atilde.View();
|
||||||
autoView( Atilde_v , Atilde, CpuRead);
|
auto Btilde_v = Btilde.View();
|
||||||
autoView( Btilde_v , Btilde, CpuRead);
|
thread_for(ss,tmp.Grid()->oSites(),{
|
||||||
thread_for(ss,tmp.Grid()->oSites(),{
|
for (int s = 0; s < Ls; s++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
int sF = s + Ls * ss;
|
||||||
int sF = s + Ls * ss;
|
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
|
||||||
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
|
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
||||||
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
}
|
||||||
}
|
});
|
||||||
});
|
|
||||||
}
|
|
||||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -61,8 +61,8 @@ public:
|
|||||||
double DhopCalls;
|
double DhopCalls;
|
||||||
double DhopCommTime;
|
double DhopCommTime;
|
||||||
double DhopComputeTime;
|
double DhopComputeTime;
|
||||||
double DhopComputeTime2;
|
double DhopComputeTime2;
|
||||||
double DhopFaceTime;
|
double DhopFaceTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
@ -208,7 +208,7 @@ public:
|
|||||||
LebesgueOrder LebesgueEvenOdd;
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
|
@ -1,194 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi, Peter Boyle
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
|
|
||||||
#define GRID_QCD_NAIVE_STAG_FERMION_H
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
class NaiveStaggeredFermionStatic {
|
|
||||||
public:
|
|
||||||
static const std::vector<int> directions;
|
|
||||||
static const std::vector<int> displacements;
|
|
||||||
static const int npoint = 8;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
|
|
||||||
public:
|
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
|
||||||
typedef StaggeredKernels<Impl> Kernels;
|
|
||||||
|
|
||||||
FermionField _tmp;
|
|
||||||
FermionField &tmp(void) { return _tmp; }
|
|
||||||
|
|
||||||
////////////////////////////////////////
|
|
||||||
// Performance monitoring
|
|
||||||
////////////////////////////////////////
|
|
||||||
void Report(void);
|
|
||||||
void ZeroCounters(void);
|
|
||||||
double DhopTotalTime;
|
|
||||||
double DhopCalls;
|
|
||||||
double DhopCommTime;
|
|
||||||
double DhopComputeTime;
|
|
||||||
double DhopComputeTime2;
|
|
||||||
double DhopFaceTime;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Implement the abstract base
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
GridBase *GaugeGrid(void) { return _grid; }
|
|
||||||
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
|
|
||||||
GridBase *FermionGrid(void) { return _grid; }
|
|
||||||
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
|
||||||
// override multiply; cut number routines if pass dagger argument
|
|
||||||
// and also make interface more uniformly consistent
|
|
||||||
//////////////////////////////////////////////////////////////////
|
|
||||||
void M(const FermionField &in, FermionField &out);
|
|
||||||
void Mdag(const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////
|
|
||||||
// half checkerboard operations
|
|
||||||
/////////////////////////////////////////////////////////
|
|
||||||
void Meooe(const FermionField &in, FermionField &out);
|
|
||||||
void MeooeDag(const FermionField &in, FermionField &out);
|
|
||||||
void Mooee(const FermionField &in, FermionField &out);
|
|
||||||
void MooeeDag(const FermionField &in, FermionField &out);
|
|
||||||
void MooeeInv(const FermionField &in, FermionField &out);
|
|
||||||
void MooeeInvDag(const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
////////////////////////
|
|
||||||
// Derivative interface
|
|
||||||
////////////////////////
|
|
||||||
// Interface calls an internal routine
|
|
||||||
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
|
||||||
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
|
||||||
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// non-hermitian hopping term; half cb or both
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
void Dhop (const FermionField &in, FermionField &out, int dag);
|
|
||||||
void DhopOE(const FermionField &in, FermionField &out, int dag);
|
|
||||||
void DhopEO(const FermionField &in, FermionField &out, int dag);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Multigrid assistance; force term uses too
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
|
|
||||||
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
|
|
||||||
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Extra methods added by derived
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
void DerivInternal(StencilImpl &st,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
GaugeField &mat,
|
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag);
|
|
||||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag);
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag);
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Grid own interface Constructor
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
RealD _c1, RealD _u0,
|
|
||||||
const ImplParams &p = ImplParams());
|
|
||||||
NaiveStaggeredFermion(GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
RealD _c1, RealD _u0,
|
|
||||||
const ImplParams &p = ImplParams());
|
|
||||||
|
|
||||||
// DoubleStore impl dependent
|
|
||||||
void ImportGauge (const GaugeField &_U );
|
|
||||||
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
|
||||||
void CopyGaugeCheckerboards(void);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Data members require to support the functionality
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
// protected:
|
|
||||||
public:
|
|
||||||
// any other parameters of action ???
|
|
||||||
virtual int isTrivialEE(void) { return 1; };
|
|
||||||
virtual RealD Mass(void) { return mass; }
|
|
||||||
RealD mass;
|
|
||||||
RealD u0;
|
|
||||||
RealD c1;
|
|
||||||
|
|
||||||
GridBase *_grid;
|
|
||||||
GridBase *_cbgrid;
|
|
||||||
|
|
||||||
// Defines the stencils for even and odd
|
|
||||||
StencilImpl Stencil;
|
|
||||||
StencilImpl StencilEven;
|
|
||||||
StencilImpl StencilOdd;
|
|
||||||
|
|
||||||
// Copy of the gauge field , with even and odd subsets
|
|
||||||
DoubledGaugeField Umu;
|
|
||||||
DoubledGaugeField UmuEven;
|
|
||||||
DoubledGaugeField UmuOdd;
|
|
||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Conserved current utilities
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
|
||||||
PropagatorField &q_in_2,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
PropagatorField &src,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu);
|
|
||||||
void SeqConservedCurrent(PropagatorField &q_in,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
PropagatorField &srct,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu,
|
|
||||||
unsigned int tmin,
|
|
||||||
unsigned int tmax,
|
|
||||||
ComplexField &lattice_cmplx);
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
|
|
||||||
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
||||||
#endif
|
|
@ -47,37 +47,23 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef FermionOperator<Impl> Base;
|
typedef FermionOperator<Impl> Base;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
|
||||||
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
|
||||||
|
|
||||||
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
|
|
||||||
protected:
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic Nc kernels
|
// Generic Nc kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik>
|
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
static accelerator_inline
|
|
||||||
void DhopSiteGeneric(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
|
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
template<int Naik> static accelerator_inline
|
|
||||||
void DhopSiteGenericInt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
|
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
template<int Naik> static accelerator_inline
|
|
||||||
void DhopSiteGenericExt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
@ -85,21 +71,15 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Nc=3 specific kernels
|
// Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
template<int Naik> static accelerator_inline
|
|
||||||
void DhopSiteHand(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
|
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
template<int Naik> static accelerator_inline
|
|
||||||
void DhopSiteHandInt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
|
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
template<int Naik> static accelerator_inline
|
|
||||||
void DhopSiteHandExt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
@ -107,11 +87,27 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Asm Nc=3 specific kernels
|
// Asm Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void DhopSiteAsm(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Generic interface; fan out to right routine
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
|
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -113,7 +113,20 @@ public:
|
|||||||
|
|
||||||
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
||||||
{
|
{
|
||||||
assert(0);
|
GridBase *GaugeGrid = U_ds.Grid();
|
||||||
|
thread_for(lidx, GaugeGrid->lSites(),{
|
||||||
|
|
||||||
|
SiteScalarGaugeLink ScalarU;
|
||||||
|
SiteDoubledGaugeField ScalarUds;
|
||||||
|
|
||||||
|
Coordinate lcoor;
|
||||||
|
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
peekLocalSite(ScalarUds, U_ds, lcoor);
|
||||||
|
|
||||||
|
peekLocalSite(ScalarU, U, lcoor);
|
||||||
|
ScalarUds(mu) = ScalarU();
|
||||||
|
|
||||||
|
});
|
||||||
}
|
}
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,
|
inline void DoubleStore(GridBase *GaugeGrid,
|
||||||
DoubledGaugeField &UUUds, // for Naik term
|
DoubledGaugeField &UUUds, // for Naik term
|
||||||
|
@ -257,16 +257,15 @@ private:
|
|||||||
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
||||||
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
||||||
|
|
||||||
public:
|
|
||||||
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
||||||
// using the DeGrand-Rossi basis for the gamma matrices
|
// using the DeGrand-Rossi basis for the gamma matrices
|
||||||
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
||||||
{
|
{
|
||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
autoView(T_v,T,AcceleratorWrite);
|
auto T_v = T.View();
|
||||||
autoView(F_v,F,AcceleratorRead);
|
auto F_v = F.View();
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
||||||
@ -282,9 +281,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView(T_v, T,AcceleratorWrite);
|
auto T_v = T.View();
|
||||||
autoView(F_v, F,AcceleratorRead);
|
auto F_v = F.View();
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -F_v[i]()();
|
T_v[i]()(0, 1) = -F_v[i]()();
|
||||||
T_v[i]()(1, 0) = F_v[i]()();
|
T_v[i]()(1, 0) = F_v[i]()();
|
||||||
@ -300,9 +299,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView(T_v,T,AcceleratorWrite);
|
auto T_v = T.View();
|
||||||
autoView(F_v,F,AcceleratorRead);
|
auto F_v = F.View();
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
||||||
@ -318,9 +317,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView( T_v , T, AcceleratorWrite);
|
auto T_v = T.View();
|
||||||
autoView( F_v , F, AcceleratorRead);
|
auto F_v = F.View();
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
||||||
@ -336,9 +335,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView( T_v ,T,AcceleratorWrite);
|
auto T_v = T.View();
|
||||||
autoView( F_v ,F,AcceleratorRead);
|
auto F_v = F.View();
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -(F_v[i]()());
|
T_v[i]()(0, 1) = -(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = (F_v[i]()());
|
T_v[i]()(1, 0) = (F_v[i]()());
|
||||||
@ -355,9 +354,9 @@ private:
|
|||||||
|
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView( T_v , T,AcceleratorWrite);
|
auto T_v = T.View();
|
||||||
autoView( F_v , F,AcceleratorRead);
|
auto F_v = F.View();
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
||||||
|
@ -50,14 +50,14 @@ public:
|
|||||||
double, nu);
|
double, nu);
|
||||||
|
|
||||||
WilsonAnisotropyCoefficients():
|
WilsonAnisotropyCoefficients():
|
||||||
isAnisotropic(false),
|
isAnisotropic(false),
|
||||||
t_direction(Nd-1),
|
t_direction(Nd-1),
|
||||||
xi_0(1.0),
|
xi_0(1.0),
|
||||||
nu(1.0){}
|
nu(1.0){}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
|
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
@ -74,20 +74,6 @@ public:
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
void Report(void);
|
|
||||||
void ZeroCounters(void);
|
|
||||||
double DhopCalls;
|
|
||||||
double DhopCommTime;
|
|
||||||
double DhopComputeTime;
|
|
||||||
double DhopComputeTime2;
|
|
||||||
double DhopFaceTime;
|
|
||||||
double DhopTotalTime;
|
|
||||||
|
|
||||||
double DerivCalls;
|
|
||||||
double DerivCommTime;
|
|
||||||
double DerivComputeTime;
|
|
||||||
double DerivDhopComputeTime;
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
// override multiply; cut number routines if pass dagger argument
|
// override multiply; cut number routines if pass dagger argument
|
||||||
// and also make interface more uniformly consistent
|
// and also make interface more uniformly consistent
|
||||||
@ -152,7 +138,7 @@ public:
|
|||||||
// Constructor
|
// Constructor
|
||||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
const ImplParams &p = ImplParams(),
|
const ImplParams &p = ImplParams(),
|
||||||
const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
|
const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
|
||||||
|
|
||||||
// DoubleStore impl dependent
|
// DoubleStore impl dependent
|
||||||
@ -184,9 +170,9 @@ public:
|
|||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
LebesgueOrder Lebesgue;
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
WilsonAnisotropyCoefficients anisotropyCoeff;
|
WilsonAnisotropyCoefficients anisotropyCoeff;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -200,7 +186,7 @@ public:
|
|||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
PropagatorField &phys_src,
|
PropagatorField &phys_src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
unsigned int tmax,
|
unsigned int tmax,
|
||||||
ComplexField &lattice_cmplx);
|
ComplexField &lattice_cmplx);
|
||||||
@ -210,3 +196,5 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
|
|||||||
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
|
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ public:
|
|||||||
LebesgueOrder LebesgueEvenOdd;
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -106,10 +106,10 @@ public:
|
|||||||
const _SpinorField & phi,
|
const _SpinorField & phi,
|
||||||
int mu)
|
int mu)
|
||||||
{
|
{
|
||||||
autoView( out_v, out, AcceleratorWrite);
|
auto out_v= out.View();
|
||||||
autoView( phi_v, phi, AcceleratorRead);
|
auto phi_v= phi.View();
|
||||||
autoView( Umu_v, Umu, AcceleratorRead);
|
auto Umu_v= Umu.View();
|
||||||
accelerator_for(sss,out.Grid()->oSites(),1,{
|
thread_for(sss,out.Grid()->oSites(),{
|
||||||
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -191,19 +191,18 @@ public:
|
|||||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
{
|
auto tmp_v = tmp.View();
|
||||||
autoView( tmp_v , tmp, AcceleratorWrite);
|
auto Btilde_v = Btilde.View();
|
||||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
auto Atilde_v = Atilde.View();
|
||||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
thread_for(sss,tmp.Grid()->oSites(),{
|
||||||
accelerator_for(sss,tmp.Grid()->oSites(),1,{
|
int sU=sss;
|
||||||
int sU=sss;
|
for(int s=0;s<Ls;s++){
|
||||||
for(int s=0;s<Ls;s++){
|
int sF = s+Ls*sU;
|
||||||
int sF = s+Ls*sU;
|
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
||||||
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
}
|
||||||
}
|
});
|
||||||
});
|
|
||||||
}
|
|
||||||
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
|||||||
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_NVCC
|
||||||
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
|
|||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)
|
unsigned int mu)
|
||||||
{
|
{
|
||||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
#ifndef GRID_NVCC
|
||||||
Gamma::Algebra Gmu [] = {
|
Gamma::Algebra Gmu [] = {
|
||||||
Gamma::Algebra::GammaX,
|
Gamma::Algebra::GammaX,
|
||||||
Gamma::Algebra::GammaY,
|
Gamma::Algebra::GammaY,
|
||||||
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
|
|
||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
PropagatorField zz (UGrid); zz=0.0;
|
LatticeInteger zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
for (int s=0;s<Ls;s++) {
|
for (int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
#ifndef GRID_NVCC
|
||||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// GENERAL CAYLEY CASE
|
// GENERAL CAYLEY CASE
|
||||||
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
|
|
||||||
PropagatorField zz (UGrid); zz=0.0;
|
LatticeInteger zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
|
@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
|
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
autoView(psi , psi_i,AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi , phi_i,AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi , chi_i,AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
autoView(psi , psi_i,AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi , phi_i,AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi , chi_i,AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
|||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
|
|
||||||
autoView(psi , psi_i,AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i,AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
|||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
autoView(psi , psi_i,AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i,AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
auto plee = & lee [0];
|
auto plee = & lee [0];
|
||||||
auto pdee = & dee [0];
|
auto pdee = & dee [0];
|
||||||
|
@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
autoView(psi, psi_i,CpuRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi, phi_i,CpuRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi, chi_i,CpuWrite);
|
auto chi = chi_i.View();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
const int nsimd= Simd::Nsimd();
|
const int nsimd= Simd::Nsimd();
|
||||||
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
autoView(psi,psi_i,CpuRead);
|
auto psi=psi_i.View();
|
||||||
autoView(phi,phi_i,CpuRead);
|
auto phi=phi_i.View();
|
||||||
autoView(chi,chi_i,CpuWrite);
|
auto chi=chi_i.View();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
int nsimd= Simd::Nsimd();
|
int nsimd= Simd::Nsimd();
|
||||||
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
|
|||||||
Vector<iSinglet<Simd> > &Matm)
|
Vector<iSinglet<Simd> > &Matm)
|
||||||
{
|
{
|
||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
autoView(psi , psi_i,CpuRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i,CpuWrite);
|
auto chi = chi_i.View();
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
{
|
{
|
||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
|||||||
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
{
|
{
|
||||||
autoView(psi , psi_i,CpuRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i,CpuWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
SiteHalfSpinor BcastM;
|
SiteHalfSpinor BcastM;
|
||||||
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
{
|
{
|
||||||
autoView(psi , psi_i,CpuRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i,CpuWrite);
|
auto chi = chi_i.View();
|
||||||
// pointers
|
// pointers
|
||||||
// MASK_REGS;
|
// MASK_REGS;
|
||||||
#define Chi_00 %zmm0
|
#define Chi_00 %zmm0
|
||||||
|
@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
autoView( phi , phi_i, AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView( psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView( chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
auto pupper = &upper[0];
|
auto pupper = &upper[0];
|
||||||
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
|||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
autoView( psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView( phi , phi_i, AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView( chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
auto pupper = &upper[0];
|
auto pupper = &upper[0];
|
||||||
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
autoView( psi, psi_i, AcceleratorRead);
|
auto psi=psi_i.View();
|
||||||
autoView( chi, chi_i, AcceleratorWrite);
|
auto chi=chi_i.View();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
auto plee = & this->lee[0];
|
||||||
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
autoView( psi, psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView( chi, chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
auto plee = & this->lee[0];
|
||||||
|
@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
autoView( Umu_v , Umu, CpuRead);
|
auto Umu_v = Umu.View();
|
||||||
autoView( UUUmu_v , UUUmu, CpuRead);
|
auto UUUmu_v = UUUmu.View();
|
||||||
autoView( in_v , in, CpuRead);
|
auto in_v = in.View();
|
||||||
autoView( out_v , out, CpuWrite);
|
auto out_v = out.View();
|
||||||
thread_for( ss,Umu.Grid()->oSites(),{
|
thread_for( ss,Umu.Grid()->oSites(),{
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
@ -281,9 +281,11 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
|
#endif
|
||||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -292,7 +294,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
|
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
@ -301,42 +305,99 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.Prepare();
|
st.Prepare();
|
||||||
st.HaloGather(in,compressor);
|
st.HaloGather(in,compressor);
|
||||||
DhopFaceTime+=usecond();
|
|
||||||
|
|
||||||
DhopCommTime -=usecond();
|
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
|
||||||
st.CommunicateBegin(requests);
|
|
||||||
|
|
||||||
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
double ctime=0;
|
||||||
|
double ptime=0;
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Remove explicit thread mapping introduced for OPA reasons.
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
DhopComputeTime-=usecond();
|
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||||
{
|
{
|
||||||
int interior=1;
|
int tid = omp_get_thread_num();
|
||||||
int exterior=0;
|
int nthreads = omp_get_num_threads();
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
double start = usecond();
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = U.Grid()->oSites(); // 4d vol
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
auto U_v = U.View();
|
||||||
|
auto UUU_v = UUU.View();
|
||||||
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
|
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ptime = usecond() - start;
|
||||||
|
} else {
|
||||||
|
double start = usecond();
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
ctime = usecond() - start;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
DhopCommTime += ctime;
|
||||||
|
DhopComputeTime+=ptime;
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
st.CollateThreads();
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
|
||||||
DhopCommTime +=usecond();
|
|
||||||
|
|
||||||
DhopComputeTime2-=usecond();
|
DhopComputeTime2-=usecond();
|
||||||
{
|
|
||||||
int interior=0;
|
auto U_v = U.View();
|
||||||
int exterior=1;
|
auto UUU_v = UUU.View();
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
thread_for( ss,sz,{
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
thread_for( ss,sz,{
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
|
||||||
|
});
|
||||||
}
|
}
|
||||||
DhopComputeTime2+=usecond();
|
DhopComputeTime2+=usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -347,6 +408,8 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//double t1=usecond();
|
//double t1=usecond();
|
||||||
DhopTotalTime -= usecond();
|
DhopTotalTime -= usecond();
|
||||||
DhopCommTime -= usecond();
|
DhopCommTime -= usecond();
|
||||||
@ -355,13 +418,28 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
|
|
||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
{
|
auto U_v = U.View();
|
||||||
int interior=1;
|
auto UUU_v = UUU.View();
|
||||||
int exterior=1;
|
auto in_v = in.View();
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
auto out_v = out.View();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
thread_for( ss,U.Grid()->oSites(),{
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
thread_for( ss,U.Grid()->oSites(),{
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
|
//double t2=usecond();
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
/*CHANGE END*/
|
/*CHANGE END*/
|
||||||
|
@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
// Call the single hop
|
// Call the single hop
|
||||||
////////////////////////
|
////////////////////////
|
||||||
autoView( U_v , U, CpuRead);
|
auto U_v = U.View();
|
||||||
autoView( UUU_v , UUU, CpuRead);
|
auto UUU_v = UUU.View();
|
||||||
autoView( B_v , B, CpuWrite);
|
auto B_v = B.View();
|
||||||
autoView( Btilde_v , Btilde, CpuWrite);
|
auto Btilde_v = Btilde.View();
|
||||||
thread_for(sss,B.Grid()->oSites(),{
|
thread_for(sss,B.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||||
});
|
});
|
||||||
@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
autoView( Umu_v , Umu, CpuRead);
|
auto Umu_v = Umu.View();
|
||||||
autoView( UUUmu_v , UUUmu, CpuRead);
|
auto UUUmu_v = UUUmu.View();
|
||||||
autoView( in_v , in, CpuRead);
|
auto in_v = in.View();
|
||||||
autoView( out_v , out, CpuWrite);
|
auto out_v = out.View();
|
||||||
thread_for( sss, in.Grid()->oSites(),{
|
thread_for( sss, in.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||||
});
|
});
|
||||||
@ -403,9 +403,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
|||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
|
#endif
|
||||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -415,6 +417,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
int len = U.Grid()->oSites();
|
int len = U.Grid()->oSites();
|
||||||
|
|
||||||
@ -423,30 +426,60 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
DhopFaceTime -= usecond();
|
DhopFaceTime -= usecond();
|
||||||
st.Prepare();
|
st.Prepare();
|
||||||
st.HaloGather(in,compressor);
|
st.HaloGather(in,compressor);
|
||||||
|
st.CommsMergeSHM(compressor);
|
||||||
DhopFaceTime += usecond();
|
DhopFaceTime += usecond();
|
||||||
|
|
||||||
DhopCommTime -=usecond();
|
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
|
||||||
st.CommunicateBegin(requests);
|
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.CommsMergeSHM(compressor);
|
|
||||||
DhopFaceTime+= usecond();
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Removed explicit thread comms
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
int interior=1;
|
int tid = omp_get_thread_num();
|
||||||
int exterior=0;
|
int nthreads = omp_get_num_threads();
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = len;
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
auto U_v = U.View();
|
||||||
|
auto UUU_v = UUU.View();
|
||||||
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
|
||||||
DhopCommTime +=usecond();
|
|
||||||
|
|
||||||
// First to enter, last to leave timing
|
// First to enter, last to leave timing
|
||||||
DhopFaceTime -= usecond();
|
DhopFaceTime -= usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
@ -454,11 +487,28 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
|
|
||||||
DhopComputeTime2 -= usecond();
|
DhopComputeTime2 -= usecond();
|
||||||
{
|
{
|
||||||
int interior=0;
|
auto U_v = U.View();
|
||||||
int exterior=1;
|
auto UUU_v = UUU.View();
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
thread_for(ss,sz,{
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
thread_for(ss,sz,{
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
DhopComputeTime2 += usecond();
|
DhopComputeTime2 += usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -478,11 +528,19 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
|
|||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
DhopCommTime += usecond();
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
auto U_v = U.View();
|
||||||
|
auto UUU_v = UUU.View();
|
||||||
|
auto in_v = in.View();
|
||||||
|
auto out_v = out.View();
|
||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
{
|
if (dag == DaggerYes) {
|
||||||
int interior=1;
|
thread_for(sss, in.Grid()->oSites(),{
|
||||||
int exterior=1;
|
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
});
|
||||||
|
} else {
|
||||||
|
thread_for(sss, in.Grid()->oSites(),{
|
||||||
|
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
|
@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi , phi_i, AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi , phi_i, AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi , phi_i, AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
|||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(phi , phi_i, AcceleratorRead);
|
auto phi = phi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & this->dee [0];
|
||||||
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & this->dee [0];
|
||||||
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
autoView(psi , psi_i, AcceleratorRead);
|
auto psi = psi_i.View();
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
auto chi = chi_i.View();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
|
@ -1,499 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi, Peter Boyle
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
/////////////////////////////////
|
|
||||||
// Constructor and gauge import
|
|
||||||
/////////////////////////////////
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
|
||||||
RealD _mass,
|
|
||||||
RealD _c1, RealD _u0,
|
|
||||||
const ImplParams &p)
|
|
||||||
: Kernels(p),
|
|
||||||
_grid(&Fgrid),
|
|
||||||
_cbgrid(&Hgrid),
|
|
||||||
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
|
|
||||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
|
||||||
mass(_mass),
|
|
||||||
Lebesgue(_grid),
|
|
||||||
LebesgueEvenOdd(_cbgrid),
|
|
||||||
Umu(&Fgrid),
|
|
||||||
UmuEven(&Hgrid),
|
|
||||||
UmuOdd(&Hgrid),
|
|
||||||
_tmp(&Hgrid)
|
|
||||||
{
|
|
||||||
int vol4;
|
|
||||||
int LLs=1;
|
|
||||||
c1=_c1;
|
|
||||||
u0=_u0;
|
|
||||||
vol4= _grid->oSites();
|
|
||||||
Stencil.BuildSurfaceList(LLs,vol4);
|
|
||||||
vol4= _cbgrid->oSites();
|
|
||||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
|
||||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
RealD _c1, RealD _u0,
|
|
||||||
const ImplParams &p)
|
|
||||||
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
|
|
||||||
{
|
|
||||||
ImportGauge(_U);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
// Momentum space propagator should be
|
|
||||||
// https://arxiv.org/pdf/hep-lat/9712010.pdf
|
|
||||||
//
|
|
||||||
// mom space action.
|
|
||||||
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
|
|
||||||
//
|
|
||||||
// must track through staggered flavour/spin reduction in literature to
|
|
||||||
// turn to free propagator for the one component chi field, a la page 4/5
|
|
||||||
// of above link to implement Fourier based solver.
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
|
||||||
{
|
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
|
||||||
}
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
|
|
||||||
{
|
|
||||||
GaugeLinkField U(GaugeGrid());
|
|
||||||
DoubledGaugeField _UUU(GaugeGrid());
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Double Store should take two fields for Naik and one hop separately.
|
|
||||||
// Discard teh Naik as Naive
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Apply scale factors to get the right fermion Kinetic term
|
|
||||||
// Could pass coeffs into the double store to save work.
|
|
||||||
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
|
|
||||||
U = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
|
|
||||||
|
|
||||||
U = PeekIndex<LorentzIndex>(Umu, mu+4);
|
|
||||||
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
CopyGaugeCheckerboards();
|
|
||||||
}
|
|
||||||
|
|
||||||
/////////////////////////////
|
|
||||||
// Implement the interface
|
|
||||||
/////////////////////////////
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
Dhop(in, out, DaggerNo);
|
|
||||||
axpy(out, mass, in, out);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
Dhop(in, out, DaggerYes);
|
|
||||||
axpy(out, mass, in, out);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
|
||||||
if (in.Checkerboard() == Odd) {
|
|
||||||
DhopEO(in, out, DaggerNo);
|
|
||||||
} else {
|
|
||||||
DhopOE(in, out, DaggerNo);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
|
||||||
if (in.Checkerboard() == Odd) {
|
|
||||||
DhopEO(in, out, DaggerYes);
|
|
||||||
} else {
|
|
||||||
DhopOE(in, out, DaggerYes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
typename FermionField::scalar_type scal(mass);
|
|
||||||
out = scal * in;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
Mooee(in, out);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
out = (1.0 / (mass)) * in;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
|
||||||
{
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
MooeeInv(in, out);
|
|
||||||
}
|
|
||||||
|
|
||||||
///////////////////////////////////
|
|
||||||
// Internal
|
|
||||||
///////////////////////////////////
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|
||||||
GaugeField & mat,
|
|
||||||
const FermionField &A, const FermionField &B, int dag)
|
|
||||||
{
|
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
|
||||||
|
|
||||||
Compressor compressor;
|
|
||||||
|
|
||||||
FermionField Btilde(B.Grid());
|
|
||||||
FermionField Atilde(B.Grid());
|
|
||||||
Atilde = A;
|
|
||||||
|
|
||||||
st.HaloExchange(B, compressor);
|
|
||||||
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
|
|
||||||
////////////////////////
|
|
||||||
// Call the single hop
|
|
||||||
////////////////////////
|
|
||||||
autoView( U_v , U, CpuRead);
|
|
||||||
autoView( B_v , B, CpuWrite);
|
|
||||||
autoView( Btilde_v , Btilde, CpuWrite);
|
|
||||||
thread_for(sss,B.Grid()->oSites(),{
|
|
||||||
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
|
||||||
});
|
|
||||||
|
|
||||||
assert(0);// need to figure out the force interface with a blasted three link term.
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
|
||||||
|
|
||||||
conformable(U.Grid(), _grid);
|
|
||||||
conformable(U.Grid(), V.Grid());
|
|
||||||
conformable(U.Grid(), mat.Grid());
|
|
||||||
|
|
||||||
mat.Checkerboard() = U.Checkerboard();
|
|
||||||
|
|
||||||
DerivInternal(Stencil, Umu, mat, U, V, dag);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
|
||||||
|
|
||||||
conformable(U.Grid(), _cbgrid);
|
|
||||||
conformable(U.Grid(), V.Grid());
|
|
||||||
conformable(U.Grid(), mat.Grid());
|
|
||||||
|
|
||||||
assert(V.Checkerboard() == Even);
|
|
||||||
assert(U.Checkerboard() == Odd);
|
|
||||||
mat.Checkerboard() = Odd;
|
|
||||||
|
|
||||||
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
|
||||||
|
|
||||||
conformable(U.Grid(), _cbgrid);
|
|
||||||
conformable(U.Grid(), V.Grid());
|
|
||||||
conformable(U.Grid(), mat.Grid());
|
|
||||||
|
|
||||||
assert(V.Checkerboard() == Odd);
|
|
||||||
assert(U.Checkerboard() == Even);
|
|
||||||
mat.Checkerboard() = Even;
|
|
||||||
|
|
||||||
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
|
||||||
{
|
|
||||||
DhopCalls+=2;
|
|
||||||
conformable(in.Grid(), _grid); // verifies full grid
|
|
||||||
conformable(in.Grid(), out.Grid());
|
|
||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
|
||||||
|
|
||||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
|
||||||
{
|
|
||||||
DhopCalls+=1;
|
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
|
||||||
|
|
||||||
assert(in.Checkerboard() == Even);
|
|
||||||
out.Checkerboard() = Odd;
|
|
||||||
|
|
||||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
|
|
||||||
{
|
|
||||||
DhopCalls+=1;
|
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
|
||||||
|
|
||||||
assert(in.Checkerboard() == Odd);
|
|
||||||
out.Checkerboard() = Even;
|
|
||||||
|
|
||||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
|
|
||||||
{
|
|
||||||
DhopDir(in, out, dir, disp);
|
|
||||||
}
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
|
|
||||||
{
|
|
||||||
assert(0); // Not implemented yet
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
|
|
||||||
{
|
|
||||||
|
|
||||||
Compressor compressor;
|
|
||||||
Stencil.HaloExchange(in, compressor);
|
|
||||||
autoView( Umu_v , Umu, CpuRead);
|
|
||||||
autoView( in_v , in, CpuRead);
|
|
||||||
autoView( out_v , out, CpuWrite);
|
|
||||||
// thread_for( sss, in.Grid()->oSites(),{
|
|
||||||
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
|
||||||
// });
|
|
||||||
assert(0);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in,
|
|
||||||
FermionField &out, int dag)
|
|
||||||
{
|
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
|
||||||
else
|
|
||||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
|
||||||
}
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in,
|
|
||||||
FermionField &out, int dag)
|
|
||||||
{
|
|
||||||
Compressor compressor;
|
|
||||||
int len = U.Grid()->oSites();
|
|
||||||
|
|
||||||
DhopTotalTime -= usecond();
|
|
||||||
|
|
||||||
DhopFaceTime -= usecond();
|
|
||||||
st.Prepare();
|
|
||||||
st.HaloGather(in,compressor);
|
|
||||||
DhopFaceTime += usecond();
|
|
||||||
|
|
||||||
DhopCommTime -=usecond();
|
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
|
||||||
st.CommunicateBegin(requests);
|
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.CommsMergeSHM(compressor);
|
|
||||||
DhopFaceTime+= usecond();
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Removed explicit thread comms
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
DhopComputeTime -= usecond();
|
|
||||||
{
|
|
||||||
int interior=1;
|
|
||||||
int exterior=0;
|
|
||||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
|
||||||
}
|
|
||||||
DhopComputeTime += usecond();
|
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
|
||||||
DhopCommTime +=usecond();
|
|
||||||
|
|
||||||
// First to enter, last to leave timing
|
|
||||||
DhopFaceTime -= usecond();
|
|
||||||
st.CommsMerge(compressor);
|
|
||||||
DhopFaceTime -= usecond();
|
|
||||||
|
|
||||||
DhopComputeTime2 -= usecond();
|
|
||||||
{
|
|
||||||
int interior=0;
|
|
||||||
int exterior=1;
|
|
||||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
|
||||||
}
|
|
||||||
DhopComputeTime2 += usecond();
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in,
|
|
||||||
FermionField &out, int dag)
|
|
||||||
{
|
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
|
||||||
|
|
||||||
DhopTotalTime -= usecond();
|
|
||||||
|
|
||||||
DhopCommTime -= usecond();
|
|
||||||
Compressor compressor;
|
|
||||||
st.HaloExchange(in, compressor);
|
|
||||||
DhopCommTime += usecond();
|
|
||||||
|
|
||||||
DhopComputeTime -= usecond();
|
|
||||||
{
|
|
||||||
int interior=1;
|
|
||||||
int exterior=1;
|
|
||||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
|
||||||
}
|
|
||||||
DhopComputeTime += usecond();
|
|
||||||
DhopTotalTime += usecond();
|
|
||||||
};
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
|
||||||
// Reporting
|
|
||||||
////////////////////////////////////////////////////////////////
|
|
||||||
template<class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::Report(void)
|
|
||||||
{
|
|
||||||
Coordinate latt = _grid->GlobalDimensions();
|
|
||||||
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
|
||||||
RealD NP = _grid->_Nprocessors;
|
|
||||||
RealD NN = _grid->NodeCount();
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
|
|
||||||
<< DhopCalls << std::endl;
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
|
|
||||||
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
|
|
||||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
|
|
||||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
|
||||||
|
|
||||||
// Average the compute time
|
|
||||||
_grid->GlobalSum(DhopComputeTime);
|
|
||||||
DhopComputeTime/=NP;
|
|
||||||
|
|
||||||
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
|
||||||
|
|
||||||
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
|
||||||
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
|
||||||
}
|
|
||||||
template<class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
|
|
||||||
{
|
|
||||||
DhopCalls = 0;
|
|
||||||
DhopTotalTime = 0;
|
|
||||||
DhopCommTime = 0;
|
|
||||||
DhopComputeTime = 0;
|
|
||||||
DhopFaceTime = 0;
|
|
||||||
|
|
||||||
Stencil.ZeroCounters();
|
|
||||||
StencilEven.ZeroCounters();
|
|
||||||
StencilOdd.ZeroCounters();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Conserved current - not yet implemented.
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|
||||||
PropagatorField &q_in_2,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
PropagatorField &src,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
PropagatorField &src,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu,
|
|
||||||
unsigned int tmin,
|
|
||||||
unsigned int tmax,
|
|
||||||
ComplexField &lattice_cmplx)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -680,13 +680,12 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
|
|||||||
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
||||||
gauge3 =(uint64_t)&UU[sU]( T );
|
gauge3 =(uint64_t)&UU[sU]( T );
|
||||||
|
|
||||||
|
|
||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
#include <Grid/simd/Intel512single.h>
|
#include <Grid/simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -703,10 +702,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
|
|
||||||
// int sF=s+LLs*sU;
|
int sF=s+LLs*sU;
|
||||||
{
|
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
@ -738,10 +736,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView
|
|||||||
}
|
}
|
||||||
|
|
||||||
#include <Grid/simd/Intel512double.h>
|
#include <Grid/simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
|
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -758,9 +756,8 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=s+LLs*sU;
|
int sF=s+LLs*sU;
|
||||||
{
|
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
@ -824,10 +821,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
|
|||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
|
|
||||||
#include <Grid/simd/Intel512single.h>
|
#include <Grid/simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
|
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -844,9 +841,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=s+LLs*sU;
|
|
||||||
{
|
int sF=s+LLs*sU;
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHIa(addr0,addr1);
|
||||||
@ -893,10 +890,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#include <Grid/simd/Intel512double.h>
|
#include <Grid/simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
|
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -913,9 +910,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=s+LLs*sU;
|
|
||||||
{
|
int sF=s+LLs*sU;
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHIa(addr0,addr1);
|
||||||
|
@ -146,10 +146,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik> accelerator_inline
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -182,9 +181,8 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=s+LLs*sU;
|
int sF=s+LLs*sU;
|
||||||
{
|
|
||||||
|
|
||||||
skew = 0;
|
skew = 0;
|
||||||
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||||
@ -195,7 +193,6 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||||
if (Naik) {
|
|
||||||
skew = 8;
|
skew = 8;
|
||||||
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||||
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||||
@ -205,7 +202,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||||
}
|
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
result()()(0) = - even_0 - odd_0;
|
result()()(0) = - even_0 - odd_0;
|
||||||
result()()(1) = - even_1 - odd_1;
|
result()()(1) = - even_1 - odd_1;
|
||||||
@ -221,10 +218,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik> accelerator_inline
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -257,9 +253,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=s+LLs*sU;
|
int sF=s+LLs*sU;
|
||||||
{
|
|
||||||
|
|
||||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||||
@ -273,7 +268,6 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||||
if (Naik) {
|
|
||||||
skew = 8;
|
skew = 8;
|
||||||
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||||
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||||
@ -283,7 +277,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||||
}
|
|
||||||
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
result()()(0) = - even_0 - odd_0;
|
result()()(0) = - even_0 - odd_0;
|
||||||
@ -300,10 +294,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik> accelerator_inline
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -336,9 +329,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=s+LLs*sU;
|
int sF=s+LLs*sU;
|
||||||
{
|
|
||||||
|
|
||||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||||
@ -352,7 +344,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||||
if (Naik) {
|
|
||||||
skew = 8;
|
skew = 8;
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||||
@ -362,7 +353,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||||
}
|
|
||||||
// Add sum of all exterior connected stencil legs
|
// Add sum of all exterior connected stencil legs
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
@ -379,7 +370,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
@ -395,7 +385,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
SiteSpinor *buf, int LLs, int sU, \
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||||
*/
|
|
||||||
#undef LOAD_CHI
|
#undef LOAD_CHI
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
if (SE->_is_local ) { \
|
if (SE->_is_local ) { \
|
||||||
if (SE->_permute) { \
|
if (SE->_permute) { \
|
||||||
chi_p = &chi; \
|
chi_p = &chi; \
|
||||||
permute(chi, in[SE->_offset], ptype); \
|
permute(chi, in[SE->_offset], ptype); \
|
||||||
} else { \
|
} else { \
|
||||||
chi_p = &in[SE->_offset]; \
|
chi_p = &in[SE->_offset]; \
|
||||||
} \
|
} \
|
||||||
} else { \
|
} else { \
|
||||||
chi_p = &buf[SE->_offset]; \
|
chi_p = &buf[SE->_offset]; \
|
||||||
@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
if (SE->_is_local ) { \
|
if (SE->_is_local ) { \
|
||||||
if (SE->_permute) { \
|
if (SE->_permute) { \
|
||||||
chi_p = &chi; \
|
chi_p = &chi; \
|
||||||
permute(chi, in[SE->_offset], ptype); \
|
permute(chi, in[SE->_offset], ptype); \
|
||||||
} else { \
|
} else { \
|
||||||
chi_p = &in[SE->_offset]; \
|
chi_p = &in[SE->_offset]; \
|
||||||
} \
|
} \
|
||||||
} else if ( st.same_node[Dir] ) { \
|
} else if ( st.same_node[Dir] ) { \
|
||||||
chi_p = &buf[SE->_offset]; \
|
chi_p = &buf[SE->_offset]; \
|
||||||
} \
|
} \
|
||||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
multLink(Uchi, U[sU], *chi_p, Dir); \
|
multLink(Uchi, U[sU], *chi_p, Dir); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
||||||
@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
chi_p = &buf[SE->_offset]; \
|
chi_p = &buf[SE->_offset]; \
|
||||||
multLink(Uchi, U[sU], *chi_p, Dir); \
|
multLink(Uchi, U[sU], *chi_p, Dir); \
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -78,12 +78,10 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
|||||||
// Int, Ext, Int+Ext cases for comms overlap
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik> accelerator_inline
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag)
|
const FermionFieldView &in, FermionFieldView &out, int dag) {
|
||||||
{
|
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
SiteSpinor Uchi;
|
SiteSpinor Uchi;
|
||||||
@ -91,10 +89,8 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
int ptype;
|
int ptype;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
//
|
int sF=LLs*sU+s;
|
||||||
// int sF=LLs*sU+s;
|
|
||||||
{
|
|
||||||
skew = 0;
|
skew = 0;
|
||||||
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||||
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -104,7 +100,6 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||||
if ( Naik ) {
|
|
||||||
skew=8;
|
skew=8;
|
||||||
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -114,7 +109,6 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
}
|
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
Uchi = - Uchi;
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
@ -126,10 +120,9 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
// Only contributions from interior of our node
|
// Only contributions from interior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik> accelerator_inline
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
@ -138,9 +131,8 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
int ptype;
|
int ptype;
|
||||||
int skew ;
|
int skew ;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=LLs*sU+s;
|
int sF=LLs*sU+s;
|
||||||
{
|
|
||||||
skew = 0;
|
skew = 0;
|
||||||
Uchi=Zero();
|
Uchi=Zero();
|
||||||
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
@ -151,7 +143,6 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
if ( Naik ) {
|
|
||||||
skew=8;
|
skew=8;
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -161,7 +152,6 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
}
|
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
Uchi = - Uchi;
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
@ -174,10 +164,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
// Only contributions from exterior of our node
|
// Only contributions from exterior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik> accelerator_inline
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
// SiteSpinor chi;
|
// SiteSpinor chi;
|
||||||
@ -187,9 +176,8 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|||||||
int nmu=0;
|
int nmu=0;
|
||||||
int skew ;
|
int skew ;
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
// int sF=LLs*sU+s;
|
int sF=LLs*sU+s;
|
||||||
{
|
|
||||||
skew = 0;
|
skew = 0;
|
||||||
Uchi=Zero();
|
Uchi=Zero();
|
||||||
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
@ -200,7 +188,6 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
if ( Naik ) {
|
|
||||||
skew=8;
|
skew=8;
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -210,7 +197,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
}
|
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
out[sF] = out[sF] - Uchi;
|
out[sF] = out[sF] - Uchi;
|
||||||
@ -224,9 +211,72 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Driving / wrapping routine to select right kernel
|
// Driving / wrapping routine to select right kernel
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
template <class Impl>
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
|
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionFieldView &in, FermionFieldView &out,
|
||||||
|
int interior,int exterior)
|
||||||
|
{
|
||||||
|
int dag=1;
|
||||||
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionFieldView &in, FermionFieldView &out,
|
||||||
|
int interior,int exterior)
|
||||||
|
{
|
||||||
|
int dag=0;
|
||||||
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
|
SiteSpinor *buf, int LLs,
|
||||||
|
int sU, const FermionFieldView &in, FermionFieldView &out,
|
||||||
|
int dag,int interior,int exterior)
|
||||||
|
{
|
||||||
|
switch(Opt) {
|
||||||
|
#ifdef AVX512
|
||||||
|
case OptInlineAsm:
|
||||||
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else {
|
||||||
|
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
case OptHandUnroll:
|
||||||
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( interior ) {
|
||||||
|
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( exterior ) {
|
||||||
|
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case OptGeneric:
|
||||||
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( interior ) {
|
||||||
|
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( exterior ) {
|
||||||
|
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
||||||
|
assert(0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
|
||||||
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
|
||||||
{
|
{
|
||||||
// Disp should be either +1,-1,+3,-3
|
// Disp should be either +1,-1,+3,-3
|
||||||
// What about "dag" ?
|
// What about "dag" ?
|
||||||
@ -235,108 +285,6 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define KERNEL_CALLNB(A,improved) \
|
|
||||||
const uint64_t NN = Nsite*Ls; \
|
|
||||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
|
||||||
int sF = ss; \
|
|
||||||
int sU = ss/Ls; \
|
|
||||||
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
|
||||||
});
|
|
||||||
|
|
||||||
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
|
|
||||||
|
|
||||||
#define ASM_CALL(A) \
|
|
||||||
const uint64_t NN = Nsite*Ls; \
|
|
||||||
thread_for( ss, NN, { \
|
|
||||||
int sF = ss; \
|
|
||||||
int sU = ss/Ls; \
|
|
||||||
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
|
||||||
});
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
|
||||||
{
|
|
||||||
GridBase *FGrid=in.Grid();
|
|
||||||
GridBase *UGrid=U.Grid();
|
|
||||||
typedef StaggeredKernels<Impl> ThisKernel;
|
|
||||||
autoView( UUU_v , UUU, AcceleratorRead);
|
|
||||||
autoView( U_v , U, AcceleratorRead);
|
|
||||||
autoView( in_v , in, AcceleratorRead);
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
|
||||||
autoView( st_v , st, AcceleratorRead);
|
|
||||||
SiteSpinor * buf = st.CommBuf();
|
|
||||||
|
|
||||||
int Ls=1;
|
|
||||||
if(FGrid->Nd()==UGrid->Nd()+1){
|
|
||||||
Ls = FGrid->_rdimensions[0];
|
|
||||||
}
|
|
||||||
int Nsite = UGrid->oSites();
|
|
||||||
|
|
||||||
if( interior && exterior ) {
|
|
||||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
|
|
||||||
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
|
|
||||||
#endif
|
|
||||||
} else if( interior ) {
|
|
||||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
|
|
||||||
#endif
|
|
||||||
} else if( exterior ) {
|
|
||||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
assert(0 && " Kernel optimisation case not covered ");
|
|
||||||
}
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
|
||||||
{
|
|
||||||
GridBase *FGrid=in.Grid();
|
|
||||||
GridBase *UGrid=U.Grid();
|
|
||||||
typedef StaggeredKernels<Impl> ThisKernel;
|
|
||||||
autoView( UUU_v , U, AcceleratorRead);
|
|
||||||
autoView( U_v , U, AcceleratorRead);
|
|
||||||
autoView( in_v , in, AcceleratorRead);
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
|
||||||
autoView( st_v , st, AcceleratorRead);
|
|
||||||
SiteSpinor * buf = st.CommBuf();
|
|
||||||
|
|
||||||
int Ls=1;
|
|
||||||
if(FGrid->Nd()==UGrid->Nd()+1){
|
|
||||||
Ls = FGrid->_rdimensions[0];
|
|
||||||
}
|
|
||||||
int Nsite = UGrid->oSites();
|
|
||||||
|
|
||||||
if( interior && exterior ) {
|
|
||||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
|
|
||||||
#endif
|
|
||||||
} else if( interior ) {
|
|
||||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
|
|
||||||
#endif
|
|
||||||
} else if( exterior ) {
|
|
||||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#undef KERNEL_CALLNB
|
|
||||||
#undef KERNEL_CALL
|
|
||||||
#undef ASM_CALL
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
@ -98,35 +98,32 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
Coordinate lcoor;
|
Coordinate lcoor;
|
||||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||||
|
|
||||||
|
for (int site = 0; site < lvol; site++)
|
||||||
{
|
{
|
||||||
autoView(CTv,CloverTerm,CpuRead);
|
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||||
autoView(CTIv,CloverTermInv,CpuWrite);
|
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||||
for (int site = 0; site < lvol; site++) {
|
peekLocalSite(Qx, CloverTerm, lcoor);
|
||||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
Qxinv = Zero();
|
||||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
//if (csw!=0){
|
||||||
peekLocalSite(Qx, CTv, lcoor);
|
for (int j = 0; j < Ns; j++)
|
||||||
Qxinv = Zero();
|
for (int k = 0; k < Ns; k++)
|
||||||
//if (csw!=0){
|
for (int a = 0; a < DimRep; a++)
|
||||||
for (int j = 0; j < Ns; j++)
|
for (int b = 0; b < DimRep; b++){
|
||||||
for (int k = 0; k < Ns; k++)
|
auto zz = Qx()(j, k)(a, b);
|
||||||
for (int a = 0; a < DimRep; a++)
|
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
|
||||||
for (int b = 0; b < DimRep; b++){
|
}
|
||||||
auto zz = Qx()(j, k)(a, b);
|
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
|
||||||
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
|
|
||||||
}
|
EigenInvCloverOp = EigenCloverOp.inverse();
|
||||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
|
//std::cout << EigenInvCloverOp << std::endl;
|
||||||
|
for (int j = 0; j < Ns; j++)
|
||||||
EigenInvCloverOp = EigenCloverOp.inverse();
|
for (int k = 0; k < Ns; k++)
|
||||||
//std::cout << EigenInvCloverOp << std::endl;
|
for (int a = 0; a < DimRep; a++)
|
||||||
for (int j = 0; j < Ns; j++)
|
for (int b = 0; b < DimRep; b++)
|
||||||
for (int k = 0; k < Ns; k++)
|
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
||||||
for (int a = 0; a < DimRep; a++)
|
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||||
for (int b = 0; b < DimRep; b++)
|
// }
|
||||||
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
|
||||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
|
||||||
// }
|
|
||||||
pokeLocalSite(Qxinv, CTIv, lcoor);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Separate the even and odd parts
|
// Separate the even and odd parts
|
||||||
|
@ -580,21 +580,16 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
|
|||||||
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
||||||
|
|
||||||
// FIXME Need a Lattice acosh
|
// FIXME Need a Lattice acosh
|
||||||
|
for(int idx=0;idx<_grid->lSites();idx++){
|
||||||
{
|
Coordinate lcoor(Nd);
|
||||||
autoView(cosha_v,cosha,CpuRead);
|
Tcomplex cc;
|
||||||
autoView(a_v,a,CpuWrite);
|
// RealD sgn;
|
||||||
for(int idx=0;idx<_grid->lSites();idx++){
|
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
Coordinate lcoor(Nd);
|
peekLocalSite(cc,cosha,lcoor);
|
||||||
Tcomplex cc;
|
assert((double)real(cc)>=1.0);
|
||||||
// RealD sgn;
|
assert(fabs((double)imag(cc))<=1.0e-15);
|
||||||
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
cc = ScalComplex(::acosh(real(cc)),0.0);
|
||||||
peekLocalSite(cc,cosha_v,lcoor);
|
pokeLocalSite(cc,a,lcoor);
|
||||||
assert((double)real(cc)>=1.0);
|
|
||||||
assert(fabs((double)imag(cc))<=1.0e-15);
|
|
||||||
cc = ScalComplex(::acosh(real(cc)),0.0);
|
|
||||||
pokeLocalSite(cc,a_v,lcoor);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Wea = ( exp( a) * abs(W) );
|
Wea = ( exp( a) * abs(W) );
|
||||||
@ -780,20 +775,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
|
|||||||
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
||||||
|
|
||||||
// FIXME Need a Lattice acosh
|
// FIXME Need a Lattice acosh
|
||||||
{
|
|
||||||
autoView(cosha_v,cosha,CpuRead);
|
|
||||||
autoView(a_v,a,CpuWrite);
|
|
||||||
for(int idx=0;idx<_grid->lSites();idx++){
|
for(int idx=0;idx<_grid->lSites();idx++){
|
||||||
Coordinate lcoor(Nd);
|
Coordinate lcoor(Nd);
|
||||||
Tcomplex cc;
|
Tcomplex cc;
|
||||||
// RealD sgn;
|
// RealD sgn;
|
||||||
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
peekLocalSite(cc,cosha_v,lcoor);
|
peekLocalSite(cc,cosha,lcoor);
|
||||||
assert((double)real(cc)>=1.0);
|
assert((double)real(cc)>=1.0);
|
||||||
assert(fabs((double)imag(cc))<=1.0e-15);
|
assert(fabs((double)imag(cc))<=1.0e-15);
|
||||||
cc = ScalComplex(::acosh(real(cc)),0.0);
|
cc = ScalComplex(::acosh(real(cc)),0.0);
|
||||||
pokeLocalSite(cc,a_v,lcoor);
|
pokeLocalSite(cc,a,lcoor);
|
||||||
}}
|
}
|
||||||
|
|
||||||
Wea = ( exp( a) * abs(W) );
|
Wea = ( exp( a) * abs(W) );
|
||||||
Wema= ( exp(-a) * abs(W) );
|
Wema= ( exp(-a) * abs(W) );
|
||||||
|
@ -43,7 +43,7 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
const ImplParams &p,
|
const ImplParams &p,
|
||||||
const WilsonAnisotropyCoefficients &anis)
|
const WilsonAnisotropyCoefficients &anis)
|
||||||
:
|
:
|
||||||
Kernels(p),
|
Kernels(p),
|
||||||
_grid(&Fgrid),
|
_grid(&Fgrid),
|
||||||
_cbgrid(&Hgrid),
|
_cbgrid(&Hgrid),
|
||||||
@ -67,101 +67,11 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
diag_mass = 4.0 + mass;
|
diag_mass = 4.0 + mass;
|
||||||
}
|
}
|
||||||
|
|
||||||
int vol4;
|
|
||||||
vol4=Fgrid.oSites();
|
|
||||||
Stencil.BuildSurfaceList(1,vol4);
|
|
||||||
vol4=Hgrid.oSites();
|
|
||||||
StencilEven.BuildSurfaceList(1,vol4);
|
|
||||||
StencilOdd.BuildSurfaceList(1,vol4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion<Impl>::Report(void)
|
|
||||||
{
|
|
||||||
RealD NP = _grid->_Nprocessors;
|
|
||||||
RealD NN = _grid->NodeCount();
|
|
||||||
RealD volume = 1;
|
|
||||||
Coordinate latt = _grid->GlobalDimensions();
|
|
||||||
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
|
||||||
|
|
||||||
if ( DhopCalls > 0 ) {
|
|
||||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
|
|
||||||
|
|
||||||
// Average the compute time
|
|
||||||
_grid->GlobalSum(DhopComputeTime);
|
|
||||||
DhopComputeTime/=NP;
|
|
||||||
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
|
||||||
|
|
||||||
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( DerivCalls > 0 ) {
|
|
||||||
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
|
|
||||||
|
|
||||||
// how to count flops here?
|
|
||||||
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
|
|
||||||
|
|
||||||
// how to count flops here?
|
|
||||||
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
|
|
||||||
|
|
||||||
if (DerivCalls > 0 || DhopCalls > 0){
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
|
||||||
}
|
|
||||||
if ( DhopCalls > 0){
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
|
|
||||||
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion<Impl>::ZeroCounters(void) {
|
|
||||||
DhopCalls = 0; // ok
|
|
||||||
DhopCommTime = 0;
|
|
||||||
DhopComputeTime = 0;
|
|
||||||
DhopComputeTime2= 0;
|
|
||||||
DhopFaceTime = 0;
|
|
||||||
DhopTotalTime = 0;
|
|
||||||
|
|
||||||
DerivCalls = 0; // ok
|
|
||||||
DerivCommTime = 0;
|
|
||||||
DerivComputeTime = 0;
|
|
||||||
DerivDhopComputeTime = 0;
|
|
||||||
|
|
||||||
Stencil.ZeroCounters();
|
|
||||||
StencilEven.ZeroCounters();
|
|
||||||
StencilOdd.ZeroCounters();
|
|
||||||
Stencil.ZeroCountersi();
|
|
||||||
StencilEven.ZeroCountersi();
|
|
||||||
StencilOdd.ZeroCountersi();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||||
{
|
{
|
||||||
GaugeField HUmu(_Umu.Grid());
|
GaugeField HUmu(_Umu.Grid());
|
||||||
|
|
||||||
@ -192,7 +102,7 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerNo);
|
Dhop(in, out, DaggerNo);
|
||||||
@ -200,7 +110,7 @@ void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerYes);
|
Dhop(in, out, DaggerYes);
|
||||||
@ -208,7 +118,7 @@ void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerNo);
|
DhopEO(in, out, DaggerNo);
|
||||||
@ -218,7 +128,7 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerYes);
|
DhopEO(in, out, DaggerYes);
|
||||||
@ -226,9 +136,9 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
|||||||
DhopOE(in, out, DaggerYes);
|
DhopOE(in, out, DaggerYes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
typename FermionField::scalar_type scal(diag_mass);
|
typename FermionField::scalar_type scal(diag_mass);
|
||||||
@ -236,80 +146,80 @@ void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Mooee(in, out);
|
Mooee(in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
out = (1.0/(diag_mass))*in;
|
out = (1.0/(diag_mass))*in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
MooeeInv(in,out);
|
MooeeInv(in,out);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
|
void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
|
||||||
{
|
{
|
||||||
typedef typename FermionField::vector_type vector_type;
|
typedef typename FermionField::vector_type vector_type;
|
||||||
typedef typename FermionField::scalar_type ScalComplex;
|
typedef typename FermionField::scalar_type ScalComplex;
|
||||||
typedef Lattice<iSinglet<vector_type> > LatComplex;
|
typedef Lattice<iSinglet<vector_type> > LatComplex;
|
||||||
|
|
||||||
// what type LatticeComplex
|
// what type LatticeComplex
|
||||||
conformable(_grid,out.Grid());
|
conformable(_grid,out.Grid());
|
||||||
|
|
||||||
Gamma::Algebra Gmu [] = {
|
Gamma::Algebra Gmu [] = {
|
||||||
Gamma::Algebra::GammaX,
|
Gamma::Algebra::GammaX,
|
||||||
Gamma::Algebra::GammaY,
|
Gamma::Algebra::GammaY,
|
||||||
Gamma::Algebra::GammaZ,
|
Gamma::Algebra::GammaZ,
|
||||||
Gamma::Algebra::GammaT
|
Gamma::Algebra::GammaT
|
||||||
};
|
};
|
||||||
|
|
||||||
Coordinate latt_size = _grid->_fdimensions;
|
Coordinate latt_size = _grid->_fdimensions;
|
||||||
|
|
||||||
FermionField num (_grid); num = Zero();
|
FermionField num (_grid); num = Zero();
|
||||||
LatComplex wilson(_grid); wilson= Zero();
|
LatComplex wilson(_grid); wilson= Zero();
|
||||||
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
|
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
|
||||||
|
|
||||||
LatComplex denom(_grid); denom= Zero();
|
LatComplex denom(_grid); denom= Zero();
|
||||||
LatComplex kmu(_grid);
|
LatComplex kmu(_grid);
|
||||||
ScalComplex ci(0.0,1.0);
|
ScalComplex ci(0.0,1.0);
|
||||||
// momphase = n * 2pi / L
|
// momphase = n * 2pi / L
|
||||||
for(int mu=0;mu<Nd;mu++) {
|
for(int mu=0;mu<Nd;mu++) {
|
||||||
|
|
||||||
LatticeCoordinate(kmu,mu);
|
LatticeCoordinate(kmu,mu);
|
||||||
|
|
||||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||||
|
|
||||||
kmu = TwoPiL * kmu;
|
kmu = TwoPiL * kmu;
|
||||||
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
||||||
|
|
||||||
wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
|
wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
|
||||||
|
|
||||||
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); // derivative term
|
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); // derivative term
|
||||||
|
|
||||||
denom=denom + sin(kmu)*sin(kmu);
|
denom=denom + sin(kmu)*sin(kmu);
|
||||||
}
|
}
|
||||||
|
|
||||||
wilson = wilson + _m; // 2 sin^2 k/2 + m
|
wilson = wilson + _m; // 2 sin^2 k/2 + m
|
||||||
|
|
||||||
num = num + wilson*in; // -i gmu sin k + 2 sin^2 k/2 + m
|
num = num + wilson*in; // -i gmu sin k + 2 sin^2 k/2 + m
|
||||||
|
|
||||||
denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
|
denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
|
||||||
|
|
||||||
denom= one/denom;
|
denom= one/denom;
|
||||||
|
|
||||||
out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
|
out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////
|
///////////////////////////////////
|
||||||
// Internal
|
// Internal
|
||||||
@ -319,7 +229,6 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||||
GaugeField &mat, const FermionField &A,
|
GaugeField &mat, const FermionField &A,
|
||||||
const FermionField &B, int dag) {
|
const FermionField &B, int dag) {
|
||||||
DerivCalls++;
|
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
@ -328,11 +237,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
FermionField Atilde(B.Grid());
|
FermionField Atilde(B.Grid());
|
||||||
Atilde = A;
|
Atilde = A;
|
||||||
|
|
||||||
DerivCommTime-=usecond();
|
|
||||||
st.HaloExchange(B, compressor);
|
st.HaloExchange(B, compressor);
|
||||||
DerivCommTime+=usecond();
|
|
||||||
|
|
||||||
DerivComputeTime-=usecond();
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Flip gamma (1+g)<->(1-g) if dag
|
// Flip gamma (1+g)<->(1-g) if dag
|
||||||
@ -340,7 +246,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
int gamma = mu;
|
int gamma = mu;
|
||||||
if (!dag) gamma += Nd;
|
if (!dag) gamma += Nd;
|
||||||
|
|
||||||
DerivDhopComputeTime -= usecond();
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
|
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
|
||||||
|
|
||||||
@ -348,13 +253,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
||||||
DerivDhopComputeTime += usecond();
|
|
||||||
}
|
}
|
||||||
DerivComputeTime += usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
{
|
{
|
||||||
conformable(U.Grid(), _grid);
|
conformable(U.Grid(), _grid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
@ -366,13 +269,13 @@ void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
{
|
{
|
||||||
conformable(U.Grid(), _cbgrid);
|
conformable(U.Grid(), _cbgrid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
//conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
|
//conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
|
||||||
// Motivation: look at the SchurDiff operator
|
// Motivation: look at the SchurDiff operator
|
||||||
|
|
||||||
assert(V.Checkerboard() == Even);
|
assert(V.Checkerboard() == Even);
|
||||||
assert(U.Checkerboard() == Odd);
|
assert(U.Checkerboard() == Odd);
|
||||||
mat.Checkerboard() = Odd;
|
mat.Checkerboard() = Odd;
|
||||||
@ -381,7 +284,7 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
{
|
{
|
||||||
conformable(U.Grid(), _cbgrid);
|
conformable(U.Grid(), _cbgrid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
@ -395,7 +298,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
conformable(in.Grid(), _grid); // verifies full grid
|
conformable(in.Grid(), _grid); // verifies full grid
|
||||||
conformable(in.Grid(), out.Grid());
|
conformable(in.Grid(), out.Grid());
|
||||||
@ -406,7 +309,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
@ -418,7 +321,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
@ -430,18 +333,18 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
|
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
|
||||||
{
|
{
|
||||||
DhopDir(in, out, dir, disp);
|
DhopDir(in, out, dir, disp);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
|
void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||||
{
|
{
|
||||||
DhopDirAll(in, out);
|
DhopDirAll(in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
|
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
|
||||||
{
|
{
|
||||||
Compressor compressor(DaggerNo);
|
Compressor compressor(DaggerNo);
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
@ -453,12 +356,12 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
|
|||||||
DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
|
DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
|
||||||
};
|
};
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
|
void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||||
{
|
{
|
||||||
Compressor compressor(DaggerNo);
|
Compressor compressor(DaggerNo);
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
|
|
||||||
assert((out.size()==8)||(out.size()==9));
|
assert((out.size()==8)||(out.size()==9));
|
||||||
for(int dir=0;dir<Nd;dir++){
|
for(int dir=0;dir<Nd;dir++){
|
||||||
for(int disp=-1;disp<=1;disp+=2){
|
for(int disp=-1;disp<=1;disp+=2){
|
||||||
|
|
||||||
@ -471,7 +374,7 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag)
|
void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag)
|
||||||
{
|
{
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
uint64_t Nsite=in.oSites();
|
uint64_t Nsite=in.oSites();
|
||||||
@ -482,23 +385,22 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
DhopTotalTime-=usecond();
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||||
DhopTotalTime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
@ -510,53 +412,38 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
st.Prepare();
|
st.Prepare();
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.HaloGather(in,compressor);
|
st.HaloGather(in,compressor);
|
||||||
DhopFaceTime+=usecond();
|
|
||||||
|
|
||||||
DhopCommTime -=usecond();
|
|
||||||
st.CommunicateBegin(requests);
|
st.CommunicateBegin(requests);
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Overlap with comms
|
// Overlap with comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.CommsMergeSHM(compressor);
|
st.CommsMergeSHM(compressor);
|
||||||
DhopFaceTime+=usecond();
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute interior
|
// do the compute interior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
DhopComputeTime-=usecond();
|
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Complete comms
|
// Complete comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
DhopCommTime +=usecond();
|
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
DhopFaceTime+=usecond();
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute exterior
|
// do the compute exterior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
||||||
DhopComputeTime2-=usecond();
|
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||||
}
|
}
|
||||||
DhopComputeTime2+=usecond();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -564,28 +451,24 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
DhopCommTime-=usecond();
|
|
||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
DhopCommTime+=usecond();
|
|
||||||
|
|
||||||
DhopComputeTime-=usecond();
|
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
|
||||||
};
|
};
|
||||||
/*Change ends */
|
/*Change ends */
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||||
* to make a conserved current sink or inserting the conserved current
|
* to make a conserved current sink or inserting the conserved current
|
||||||
* sequentially.
|
* sequentially.
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -600,23 +483,103 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
conformable(_grid, q_in_1.Grid());
|
conformable(_grid, q_in_1.Grid());
|
||||||
conformable(_grid, q_in_2.Grid());
|
conformable(_grid, q_in_2.Grid());
|
||||||
conformable(_grid, q_out.Grid());
|
conformable(_grid, q_out.Grid());
|
||||||
assert(0);
|
#if 0
|
||||||
|
PropagatorField tmp1(_grid), tmp2(_grid);
|
||||||
|
q_out = Zero();
|
||||||
|
|
||||||
|
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
|
||||||
|
// Inefficient comms method but not performance critical.
|
||||||
|
tmp1 = Cshift(q_in_1, mu, 1);
|
||||||
|
tmp2 = Cshift(q_in_2, mu, 1);
|
||||||
|
auto tmp1_v = tmp1.View();
|
||||||
|
auto tmp2_v = tmp2.View();
|
||||||
|
auto q_in_1_v=q_in_1.View();
|
||||||
|
auto q_in_2_v=q_in_2.View();
|
||||||
|
auto q_out_v = q_out.View();
|
||||||
|
auto Umu_v = Umu.View();
|
||||||
|
thread_for(sU, Umu.Grid()->oSites(),{
|
||||||
|
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
||||||
|
q_in_2_v[sU],
|
||||||
|
q_out_v[sU],
|
||||||
|
Umu_v, sU, mu);
|
||||||
|
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
|
||||||
|
tmp2_v[sU],
|
||||||
|
q_out_v[sU],
|
||||||
|
Umu_v, sU, mu);
|
||||||
|
});
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
PropagatorField &src,
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
unsigned int tmax,
|
unsigned int tmax,
|
||||||
ComplexField &lattice_cmplx)
|
ComplexField &lattice_cmplx)
|
||||||
{
|
{
|
||||||
conformable(_grid, q_in.Grid());
|
conformable(_grid, q_in.Grid());
|
||||||
conformable(_grid, q_out.Grid());
|
conformable(_grid, q_out.Grid());
|
||||||
assert(0);
|
#if 0
|
||||||
|
|
||||||
|
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
|
||||||
|
Complex i(0.0,1.0);
|
||||||
|
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
|
||||||
|
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
||||||
|
unsigned int LLt = GridDefaultLatt()[Tp];
|
||||||
|
|
||||||
|
q_out = Zero();
|
||||||
|
LatticeInteger coords(_grid);
|
||||||
|
LatticeCoordinate(coords, Tp);
|
||||||
|
|
||||||
|
// Need q(x + mu) and q(x - mu).
|
||||||
|
tmp = Cshift(q_in, mu, 1);
|
||||||
|
tmpFwd = tmp*lattice_cmplx;
|
||||||
|
tmp = lattice_cmplx*q_in;
|
||||||
|
tmpBwd = Cshift(tmp, mu, -1);
|
||||||
|
|
||||||
|
auto coords_v = coords.View();
|
||||||
|
auto tmpFwd_v = tmpFwd.View();
|
||||||
|
auto tmpBwd_v = tmpBwd.View();
|
||||||
|
auto Umu_v = Umu.View();
|
||||||
|
auto q_out_v = q_out.View();
|
||||||
|
|
||||||
|
thread_for(sU, Umu.Grid()->oSites(), {
|
||||||
|
|
||||||
|
// Compute the sequential conserved current insertion only if our simd
|
||||||
|
// object contains a timeslice we need.
|
||||||
|
vPredicate t_mask;
|
||||||
|
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
|
||||||
|
Integer timeSlices = Reduce(t_mask());
|
||||||
|
|
||||||
|
if (timeSlices > 0) {
|
||||||
|
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
|
||||||
|
q_out_v[sU],
|
||||||
|
Umu_v, sU, mu, t_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Repeat for backward direction.
|
||||||
|
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
|
||||||
|
(coords_v[sU] <= (tmax + tshift)));
|
||||||
|
|
||||||
|
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||||
|
unsigned int t0 = 0;
|
||||||
|
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
|
||||||
|
|
||||||
|
timeSlices = Reduce(t_mask());
|
||||||
|
|
||||||
|
if (timeSlices > 0) {
|
||||||
|
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
|
||||||
|
q_out_v[sU],
|
||||||
|
Umu_v, sU, mu, t_mask);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -1,574 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
|
|
||||||
|
|
||||||
Copyright (C) 2020
|
|
||||||
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
//#if defined(A64FXASM)
|
|
||||||
#if defined(A64FX)
|
|
||||||
|
|
||||||
// safety include
|
|
||||||
#include <arm_sve.h>
|
|
||||||
|
|
||||||
// undefine everything related to kernels
|
|
||||||
#include <simd/Fujitsu_A64FX_undef.h>
|
|
||||||
|
|
||||||
// enable A64FX body
|
|
||||||
#define WILSONKERNELSASMBODYA64FX
|
|
||||||
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
|
||||||
// If we are A64FX specialise the single precision routine
|
|
||||||
///////////////////////////////////////////////////////////
|
|
||||||
#if defined(DSLASHINTRIN)
|
|
||||||
//#pragma message ("A64FX Dslash: intrin")
|
|
||||||
#include <simd/Fujitsu_A64FX_intrin_single.h>
|
|
||||||
#else
|
|
||||||
#pragma message ("A64FX Dslash: asm")
|
|
||||||
#include <simd/Fujitsu_A64FX_asm_single.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/// Switch off the 5d vectorised code optimisations
|
|
||||||
#undef DWFVEC5D
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// XYZT vectorised, undag Kernel, single
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// Mode macros for the four AsmDhopSite specialisations below: KERNEL_DAG is left
// undefined and, of the three region macros, only INTERIOR_AND_EXTERIOR is defined.
// WilsonKernelsAsmBodyA64FX.h tests these names with #ifdef to choose its
// DIRn_PROJ/DIRn_RECON mapping and its ASM_LEG/ASM_LEG_XP/RESULT definitions.
#undef KERNEL_DAG
|
|
||||||
#define INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplF: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only INTERIOR is defined for the four
// AsmDhopSiteInt specialisations below. KERNEL_DAG is not touched here, so it keeps
// the undefined state set by the earlier "#undef KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#define INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplF: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only EXTERIOR is defined for the four
// AsmDhopSiteExt specialisations below. KERNEL_DAG is not touched here, so it keeps
// the undefined state set by the earlier "#undef KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#define EXTERIOR
|
|
||||||
// WilsonImplF: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// XYZT vectorised, dag Kernel, single
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// Mode macros for the four AsmDhopSiteDag specialisations below: KERNEL_DAG is now
// defined and, of the three region macros, only INTERIOR_AND_EXTERIOR is defined.
// WilsonKernelsAsmBodyA64FX.h tests these names with #ifdef to choose its
// DIRn_PROJ/DIRn_RECON mapping and its ASM_LEG/ASM_LEG_XP/RESULT definitions.
#define KERNEL_DAG
|
|
||||||
#define INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplF: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only INTERIOR is defined for the four
// AsmDhopSiteDagInt specialisations below. KERNEL_DAG is not touched here, so it
// keeps the defined state set by the earlier "#define KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#define INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplF: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only EXTERIOR is defined for the four
// AsmDhopSiteDagExt specialisations below. KERNEL_DAG is not touched here, so it
// keeps the defined state set by the earlier "#define KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#define EXTERIOR
|
|
||||||
// WilsonImplF: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplFH: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
// undefine
|
|
||||||
#include <simd/Fujitsu_A64FX_undef.h>
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
|
||||||
// If we are A64FX specialise the double precision routine
|
|
||||||
///////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
#if defined(DSLASHINTRIN)
|
|
||||||
#include <simd/Fujitsu_A64FX_intrin_double.h>
|
|
||||||
#else
|
|
||||||
#include <simd/Fujitsu_A64FX_asm_double.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// former KNL
|
|
||||||
//#define MAYBEPERM(A,perm) if (perm) { A ; }
|
|
||||||
//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
|
||||||
//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
|
|
||||||
|
|
||||||
|
|
||||||
// Region macros are reset here and then set to the same state again just below the
// banner; the repetition is redundant but leaves the preprocessor state unchanged.
#define INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// XYZT vectorised, undag Kernel, double
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// Mode macros for the four AsmDhopSite specialisations below: KERNEL_DAG is
// undefined and, of the three region macros, only INTERIOR_AND_EXTERIOR is defined.
// WilsonKernelsAsmBodyA64FX.h tests these names with #ifdef to choose its
// DIRn_PROJ/DIRn_RECON mapping and its ASM_LEG/ASM_LEG_XP/RESULT definitions.
#undef KERNEL_DAG
|
|
||||||
#define INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplD: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplD: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only INTERIOR is defined for the four
// AsmDhopSiteInt specialisations below. KERNEL_DAG is not touched here, so it keeps
// the undefined state set by the earlier "#undef KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#define INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplD: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplD: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only EXTERIOR is defined for the four
// AsmDhopSiteExt specialisations below. KERNEL_DAG is not touched here, so it keeps
// the undefined state set by the earlier "#undef KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#define EXTERIOR
|
|
||||||
// WilsonImplD: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplD: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// XYZT vectorised, dag Kernel, double
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// Mode macros for the four AsmDhopSiteDag specialisations below: KERNEL_DAG is now
// defined and, of the three region macros, only INTERIOR_AND_EXTERIOR is defined.
// WilsonKernelsAsmBodyA64FX.h tests these names with #ifdef to choose its
// DIRn_PROJ/DIRn_RECON mapping and its ASM_LEG/ASM_LEG_XP/RESULT definitions.
#define KERNEL_DAG
|
|
||||||
#define INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplD: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplD: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only INTERIOR is defined for the four
// AsmDhopSiteDagInt specialisations below. KERNEL_DAG is not touched here, so it
// keeps the defined state set by the earlier "#define KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#define INTERIOR
|
|
||||||
#undef EXTERIOR
|
|
||||||
// WilsonImplD: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplD: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Switch the region macros so that only EXTERIOR is defined for the four
// AsmDhopSiteDagExt specialisations below. KERNEL_DAG is not touched here, so it
// keeps the defined state set by the earlier "#define KERNEL_DAG".
#undef INTERIOR_AND_EXTERIOR
|
|
||||||
#undef INTERIOR
|
|
||||||
#define EXTERIOR
|
|
||||||
// WilsonImplD: no body is written here; it comes from whichever header the #if below includes.
template<> void
|
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplD: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// WilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// ZWilsonImplDF: same signature and same included body as above.
template<> void
|
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// undefs
|
|
||||||
#undef WILSONKERNELSASMBODYA64FX
|
|
||||||
#include <simd/Fujitsu_A64FX_undef.h>
|
|
||||||
|
|
||||||
#endif //A64FXASM
|
|
@ -1,380 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: WilsonKernelsAsmBodyA64FX.h
|
|
||||||
|
|
||||||
Copyright (C) 2020
|
|
||||||
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
// Map the eight direction slots DIR0..DIR7 onto the projector and reconstruct macros.
// With KERNEL_DAG defined, slots 0-3 use the XP/YP/ZP/TP macros and slots 4-7 the
// XM/YM/ZM/TM macros; without it the two halves are swapped.
// In both branches DIR0_RECON is the plain *_RECON macro while DIR1..DIR7 use the
// *_RECON_ACCUM variants (presumably the first leg initialises the result and the
// rest accumulate into it -- the macros themselves are defined elsewhere; confirm there).
#ifdef KERNEL_DAG
|
|
||||||
#define DIR0_PROJ XP_PROJ
|
|
||||||
#define DIR1_PROJ YP_PROJ
|
|
||||||
#define DIR2_PROJ ZP_PROJ
|
|
||||||
#define DIR3_PROJ TP_PROJ
|
|
||||||
#define DIR4_PROJ XM_PROJ
|
|
||||||
#define DIR5_PROJ YM_PROJ
|
|
||||||
#define DIR6_PROJ ZM_PROJ
|
|
||||||
#define DIR7_PROJ TM_PROJ
|
|
||||||
#define DIR0_RECON XP_RECON
|
|
||||||
#define DIR1_RECON YP_RECON_ACCUM
|
|
||||||
#define DIR2_RECON ZP_RECON_ACCUM
|
|
||||||
#define DIR3_RECON TP_RECON_ACCUM
|
|
||||||
#define DIR4_RECON XM_RECON_ACCUM
|
|
||||||
#define DIR5_RECON YM_RECON_ACCUM
|
|
||||||
#define DIR6_RECON ZM_RECON_ACCUM
|
|
||||||
#define DIR7_RECON TM_RECON_ACCUM
|
|
||||||
#else
|
|
||||||
#define DIR0_PROJ XM_PROJ
|
|
||||||
#define DIR1_PROJ YM_PROJ
|
|
||||||
#define DIR2_PROJ ZM_PROJ
|
|
||||||
#define DIR3_PROJ TM_PROJ
|
|
||||||
#define DIR4_PROJ XP_PROJ
|
|
||||||
#define DIR5_PROJ YP_PROJ
|
|
||||||
#define DIR6_PROJ ZP_PROJ
|
|
||||||
#define DIR7_PROJ TP_PROJ
|
|
||||||
#define DIR0_RECON XM_RECON
|
|
||||||
#define DIR1_RECON YM_RECON_ACCUM
|
|
||||||
#define DIR2_RECON ZM_RECON_ACCUM
|
|
||||||
#define DIR3_RECON TM_RECON_ACCUM
|
|
||||||
#define DIR4_RECON XP_RECON_ACCUM
|
|
||||||
#define DIR5_RECON YP_RECON_ACCUM
|
|
||||||
#define DIR6_RECON ZP_RECON_ACCUM
|
|
||||||
#define DIR7_RECON TP_RECON_ACCUM
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//using namespace std;
|
|
||||||
|
|
||||||
// SHOW gates the debug printing in the kernel body (the "#ifdef SHOW" section); it is
// forced off here, and the commented-out define below is the switch to turn it on.
#undef SHOW
|
|
||||||
//#define SHOW
|
|
||||||
|
|
||||||
// WHERE is a string label for the active region mode and is streamed out by the
// SHOW debug output. It is cleared first because this header is included repeatedly
// with different region macros defined.
#undef WHERE
|
|
||||||
|
|
||||||
#ifdef INTERIOR_AND_EXTERIOR
|
|
||||||
#define WHERE "INT_AND_EXT"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef INTERIOR
|
|
||||||
#define WHERE "INT"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef EXTERIOR
|
|
||||||
#define WHERE "EXT"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//#pragma message("here")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Comms then compute kernel
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
#ifdef INTERIOR_AND_EXTERIOR
|
|
||||||
|
|
||||||
// ASM_LEG: one stencil leg for direction Dir, run unconditionally in this mode.
// Relies on variables of the enclosing kernel body: base, basep, local, perm, ptype,
// ent, nent, plocal and the loop index s.
//  - basep is fetched with st.GetPFInfo(nent,plocal) and passed to PREFETCH_CHIMU_L2.
//  - If `local` is set: LOAD_CHIMU, LOAD_TABLE, PROJ and MAYBEPERM are issued;
//    otherwise LOAD_CHI is used on `base`.
//  - st.GetInfo is then called for NxtDir, so base/local/perm/ptype describe the NEXT
//    leg from that point on (`local` is otherwise never assigned in the visible code).
//  - MULT_2SPIN_1 / MULT_2SPIN_2 bracket the prefetches; PREFETCH_GAUGE_L2 is issued
//    only when s == 0 and Dir is 0 or 4; RECON ends the leg.
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
|
||||||
if ( local ) { \
|
|
||||||
LOAD_CHIMU(base); \
|
|
||||||
LOAD_TABLE(PERMUTE_DIR); \
|
|
||||||
PROJ; \
|
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
|
||||||
} else { \
|
|
||||||
LOAD_CHI(base); \
|
|
||||||
} \
|
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
|
||||||
MULT_2SPIN_1(Dir); \
|
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
PREFETCH_CHIMU_L2(basep); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
|
||||||
if (s == 0) { \
|
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
|
|
||||||
// ASM_LEG_XP: first leg of a site. It performs the initial st.GetInfo lookup for Dir
// itself (the other legs get theirs from the previous leg's NxtDir lookup), issues
// PREFETCH1_CHIMU, then expands ASM_LEG.
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
|
||||||
PREFETCH1_CHIMU(base); \
|
|
||||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
|
||||||
|
|
||||||
// RESULT: in this mode the result is always written with SAVE_RESULT.
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Pre comms kernel -- prefetch like normal because it is mostly right
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
#ifdef INTERIOR
|
|
||||||
|
|
||||||
// ASM_LEG: one stencil leg for direction Dir in the pre-comms pass.
// Relies on variables of the enclosing kernel body: base, basep, local, perm, ptype,
// ent, nent, plocal and the loop index s.
//  - If `local` is set: LOAD_CHIMU, LOAD_TABLE, PROJ and MAYBEPERM are issued; else if
//    st.same_node[Dir] is set, LOAD_CHI is used; otherwise nothing is loaded.
//  - st.GetInfo is then called for NxtDir.
//  - The multiply/reconstruct sequence runs only when `local || st.same_node[Dir]`;
//    in the remaining case only PREFETCH_CHIMU(base) is issued.
// NOTE(review): the `local` tested in that second condition is read AFTER the
// st.GetInfo(...,NxtDir,...) call, which is passed `local` as an argument. If GetInfo
// writes `local` (no other assignment to it is visible), the guard sees the next
// leg's value rather than the one used for the load above -- confirm this is intended.
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
|
||||||
if ( local ) { \
|
|
||||||
LOAD_CHIMU(base); \
|
|
||||||
LOAD_TABLE(PERMUTE_DIR); \
|
|
||||||
PROJ; \
|
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
|
||||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
|
||||||
if ( local || st.same_node[Dir] ) { \
|
|
||||||
MULT_2SPIN_1(Dir); \
|
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
|
||||||
if (s == 0) { \
|
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
PREFETCH_CHIMU_L2(basep); \
|
|
||||||
} else { PREFETCH_CHIMU(base); } \
|
|
||||||
|
|
||||||
// ASM_LEG_XP: first leg of a site. It performs the initial st.GetInfo lookup for Dir
// itself, issues PREFETCH1_CHIMU, then expands ASM_LEG.
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
|
||||||
PREFETCH1_CHIMU(base); \
|
|
||||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
|
||||||
|
|
||||||
// RESULT: in this mode the result is always written with SAVE_RESULT.
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Post comms kernel
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
#ifdef EXTERIOR
|
|
||||||
|
|
||||||
|
|
||||||
// ASM_LEG: one stencil leg for direction Dir in the post-comms pass.
// Unlike the other two modes, each leg does its own st.GetInfo lookup for Dir (NxtDir
// appears only inside a commented-out prefetch). Work is done only when the leg is
// neither `local` nor on the same node (st.same_node[Dir]); each such leg increments
// nmu, which RESULT below uses to decide whether anything was computed.
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
|
||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
|
||||||
LOAD_CHI(base); \
|
|
||||||
MULT_2SPIN_1(Dir); \
|
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
|
||||||
if (s == 0) { \
|
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
nmu++; \
|
|
||||||
}
|
|
||||||
|
|
||||||
// ASM_LEG_XP: first leg of a site. Same sequence as ASM_LEG, written out in full
// instead of delegating, with nmu reset to 0 first so the count is per site.
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
|
||||||
nmu=0; \
|
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
|
|
||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
|
||||||
LOAD_CHI(base); \
|
|
||||||
MULT_2SPIN_1(Dir); \
|
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
|
||||||
if (s == 0) { \
|
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
nmu++; \
|
|
||||||
}
|
|
||||||
|
|
||||||
// RESULT: adds into the output only when at least one leg ran (nmu != 0).
// NOTE(review): ADD_RESULT is given `base` for both arguments and `basep` is unused,
// unlike the SAVE_RESULT(base,basep) form in the other modes -- confirm against the
// definition of ADD_RESULT that this is intended.
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
int nmu;
|
|
||||||
int local,perm, ptype;
|
|
||||||
uint64_t base;
|
|
||||||
uint64_t basep;
|
|
||||||
const uint64_t plocal =(uint64_t) & in[0];
|
|
||||||
|
|
||||||
MASK_REGS;
|
|
||||||
int nmax=U.oSites();
|
|
||||||
for(int site=0;site<Ns;site++) {
|
|
||||||
#ifndef EXTERIOR
|
|
||||||
// int sU =lo.Reorder(ssU);
|
|
||||||
int sU =ssU;
|
|
||||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
|
||||||
// int sUn=lo.Reorder(ssn);
|
|
||||||
int sUn=ssn;
|
|
||||||
LOCK_GAUGE(0);
|
|
||||||
#else
|
|
||||||
int sU =ssU;
|
|
||||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
|
||||||
int sUn=ssn;
|
|
||||||
#endif
|
|
||||||
for(int s=0;s<Ls;s++) {
|
|
||||||
ss =sU*Ls+s;
|
|
||||||
ssn=sUn*Ls+s;
|
|
||||||
int ent=ss*8;// 2*Ndim
|
|
||||||
int nent=ssn*8;
|
|
||||||
|
|
||||||
uint64_t delta_base, delta_base_p;
|
|
||||||
|
|
||||||
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
float rescale = 64. * 12.;
|
|
||||||
std::cout << "=================================================================" << std::endl;
|
|
||||||
std::cout << "ss = " << ss << " ssn = " << ssn << std::endl;
|
|
||||||
std::cout << "sU = " << sU << " ssU = " << ssU << std::endl;
|
|
||||||
std::cout << " " << std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
std::cout << "Dir = " << Xp << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
|
|
||||||
|
|
||||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
|
||||||
std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
|
|
||||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef EXTERIOR
|
|
||||||
if (nmu==0) break;
|
|
||||||
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
|
|
||||||
#endif
|
|
||||||
base = (uint64_t) &out[ss];
|
|
||||||
basep= st.GetPFInfo(nent,plocal); ent++;
|
|
||||||
basep = (uint64_t) &out[ssn];
|
|
||||||
RESULT(base,basep);
|
|
||||||
|
|
||||||
#ifdef SHOW
|
|
||||||
std::cout << "Dir = FINAL " << WHERE<< std::endl;;
|
|
||||||
|
|
||||||
base_ss = base;
|
|
||||||
std::cout << "base = " << (base - (uint64_t) &out[0])/rescale << std::endl;
|
|
||||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
|
||||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
|
||||||
std::cout << "----------------------------------------------------" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
|
||||||
ssU++;
|
|
||||||
UNLOCK_GAUGE(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef DIR0_PROJ
|
|
||||||
#undef DIR1_PROJ
|
|
||||||
#undef DIR2_PROJ
|
|
||||||
#undef DIR3_PROJ
|
|
||||||
#undef DIR4_PROJ
|
|
||||||
#undef DIR5_PROJ
|
|
||||||
#undef DIR6_PROJ
|
|
||||||
#undef DIR7_PROJ
|
|
||||||
#undef DIR0_RECON
|
|
||||||
#undef DIR1_RECON
|
|
||||||
#undef DIR2_RECON
|
|
||||||
#undef DIR3_RECON
|
|
||||||
#undef DIR4_RECON
|
|
||||||
#undef DIR5_RECON
|
|
||||||
#undef DIR6_RECON
|
|
||||||
#undef DIR7_RECON
|
|
||||||
#undef ASM_LEG
|
|
||||||
#undef ASM_LEG_XP
|
|
||||||
#undef RESULT
|
|
@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_RESULT_EXT(ss,F)
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||||
template<> accelerator_inline void \
|
template<> void \
|
||||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> accelerator_inline void \
|
template<> void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> accelerator_inline void \
|
template<> void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> accelerator_inline void \
|
template<> void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> accelerator_inline void \
|
template<> void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
nmu = 0; \
|
nmu = 0; \
|
||||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
template<> accelerator_inline void \
|
template<> void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Impl> accelerator_inline void
|
template<class Impl> void
|
||||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> accelerator_inline
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> accelerator_inline void
|
template<class Impl> void
|
||||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> accelerator_inline
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> accelerator_inline void
|
template<class Impl> void
|
||||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT_EXT(ss);
|
HAND_RESULT_EXT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> accelerator_inline
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
@ -1,943 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
|
|
||||||
|
|
||||||
#undef LOAD_CHIMU
|
|
||||||
#undef LOAD_CHI
|
|
||||||
#undef MULT_2SPIN
|
|
||||||
#undef PERMUTE_DIR
|
|
||||||
#undef XP_PROJ
|
|
||||||
#undef YP_PROJ
|
|
||||||
#undef ZP_PROJ
|
|
||||||
#undef TP_PROJ
|
|
||||||
#undef XM_PROJ
|
|
||||||
#undef YM_PROJ
|
|
||||||
#undef ZM_PROJ
|
|
||||||
#undef TM_PROJ
|
|
||||||
#undef XP_RECON
|
|
||||||
#undef XP_RECON_ACCUM
|
|
||||||
#undef XM_RECON
|
|
||||||
#undef XM_RECON_ACCUM
|
|
||||||
#undef YP_RECON_ACCUM
|
|
||||||
#undef YM_RECON_ACCUM
|
|
||||||
#undef ZP_RECON_ACCUM
|
|
||||||
#undef ZM_RECON_ACCUM
|
|
||||||
#undef TP_RECON_ACCUM
|
|
||||||
#undef TM_RECON_ACCUM
|
|
||||||
#undef ZERO_RESULT
|
|
||||||
#undef Chimu_00
|
|
||||||
#undef Chimu_01
|
|
||||||
#undef Chimu_02
|
|
||||||
#undef Chimu_10
|
|
||||||
#undef Chimu_11
|
|
||||||
#undef Chimu_12
|
|
||||||
#undef Chimu_20
|
|
||||||
#undef Chimu_21
|
|
||||||
#undef Chimu_22
|
|
||||||
#undef Chimu_30
|
|
||||||
#undef Chimu_31
|
|
||||||
#undef Chimu_32
|
|
||||||
#undef HAND_STENCIL_LEG
|
|
||||||
#undef HAND_STENCIL_LEG_INT
|
|
||||||
#undef HAND_STENCIL_LEG_EXT
|
|
||||||
#undef HAND_RESULT
|
|
||||||
#undef HAND_RESULT_INT
|
|
||||||
#undef HAND_RESULT_EXT
|
|
||||||
|
|
||||||
#define REGISTER
|
|
||||||
|
|
||||||
#define LOAD_CHIMU \
|
|
||||||
{const SiteSpinor & ref (in[offset]); \
|
|
||||||
Chimu_00=ref()(0)(0);\
|
|
||||||
Chimu_01=ref()(0)(1);\
|
|
||||||
Chimu_02=ref()(0)(2);\
|
|
||||||
Chimu_10=ref()(1)(0);\
|
|
||||||
Chimu_11=ref()(1)(1);\
|
|
||||||
Chimu_12=ref()(1)(2);\
|
|
||||||
Chimu_20=ref()(2)(0);\
|
|
||||||
Chimu_21=ref()(2)(1);\
|
|
||||||
Chimu_22=ref()(2)(2);\
|
|
||||||
Chimu_30=ref()(3)(0);\
|
|
||||||
Chimu_31=ref()(3)(1);\
|
|
||||||
Chimu_32=ref()(3)(2);\
|
|
||||||
std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \
|
|
||||||
std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \
|
|
||||||
std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \
|
|
||||||
std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \
|
|
||||||
std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \
|
|
||||||
std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \
|
|
||||||
std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \
|
|
||||||
std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \
|
|
||||||
std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \
|
|
||||||
std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \
|
|
||||||
std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \
|
|
||||||
std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \
|
|
||||||
std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define LOAD_CHI\
|
|
||||||
{const SiteHalfSpinor &ref(buf[offset]); \
|
|
||||||
Chi_00 = ref()(0)(0);\
|
|
||||||
Chi_01 = ref()(0)(1);\
|
|
||||||
Chi_02 = ref()(0)(2);\
|
|
||||||
Chi_10 = ref()(1)(0);\
|
|
||||||
Chi_11 = ref()(1)(1);\
|
|
||||||
Chi_12 = ref()(1)(2);\
|
|
||||||
std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
|
|
||||||
}
|
|
||||||
|
|
||||||
// To splat or not to splat depends on the implementation
|
|
||||||
#define MULT_2SPIN(A)\
|
|
||||||
{auto & ref(U[sU](A)); \
|
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
|
||||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
|
||||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
|
||||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
|
||||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
|
||||||
UChi_00 = U_00*Chi_00;\
|
|
||||||
UChi_10 = U_00*Chi_10;\
|
|
||||||
UChi_01 = U_10*Chi_00;\
|
|
||||||
UChi_11 = U_10*Chi_10;\
|
|
||||||
UChi_02 = U_20*Chi_00;\
|
|
||||||
UChi_12 = U_20*Chi_10;\
|
|
||||||
UChi_00+= U_01*Chi_01;\
|
|
||||||
UChi_10+= U_01*Chi_11;\
|
|
||||||
UChi_01+= U_11*Chi_01;\
|
|
||||||
UChi_11+= U_11*Chi_11;\
|
|
||||||
UChi_02+= U_21*Chi_01;\
|
|
||||||
UChi_12+= U_21*Chi_11;\
|
|
||||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
|
||||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
|
||||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
|
||||||
UChi_00+= U_00*Chi_02;\
|
|
||||||
UChi_10+= U_00*Chi_12;\
|
|
||||||
UChi_01+= U_10*Chi_02;\
|
|
||||||
UChi_11+= U_10*Chi_12;\
|
|
||||||
UChi_02+= U_20*Chi_02;\
|
|
||||||
UChi_12+= U_20*Chi_12;\
|
|
||||||
std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \
|
|
||||||
std::cout << "UChi_00 -- " << UChi_00 << std::endl; \
|
|
||||||
std::cout << "UChi_01 -- " << UChi_01 << std::endl; \
|
|
||||||
std::cout << "UChi_02 -- " << UChi_02 << std::endl; \
|
|
||||||
std::cout << "UChi_10 -- " << UChi_10 << std::endl; \
|
|
||||||
std::cout << "UChi_11 -- " << UChi_11 << std::endl; \
|
|
||||||
std::cout << "UChi_12 -- " << UChi_12 << std::endl; \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
|
||||||
std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
|
|
||||||
permute##dir(Chi_00,Chi_00);\
|
|
||||||
permute##dir(Chi_01,Chi_01);\
|
|
||||||
permute##dir(Chi_02,Chi_02);\
|
|
||||||
permute##dir(Chi_10,Chi_10);\
|
|
||||||
permute##dir(Chi_11,Chi_11);\
|
|
||||||
permute##dir(Chi_12,Chi_12);\
|
|
||||||
std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
|
||||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
|
||||||
#define XP_PROJ \
|
|
||||||
Chi_00 = Chimu_00+timesI(Chimu_30);\
|
|
||||||
Chi_01 = Chimu_01+timesI(Chimu_31);\
|
|
||||||
Chi_02 = Chimu_02+timesI(Chimu_32);\
|
|
||||||
Chi_10 = Chimu_10+timesI(Chimu_20);\
|
|
||||||
Chi_11 = Chimu_11+timesI(Chimu_21);\
|
|
||||||
Chi_12 = Chimu_12+timesI(Chimu_22);\
|
|
||||||
std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
#define YP_PROJ \
|
|
||||||
Chi_00 = Chimu_00-Chimu_30;\
|
|
||||||
Chi_01 = Chimu_01-Chimu_31;\
|
|
||||||
Chi_02 = Chimu_02-Chimu_32;\
|
|
||||||
Chi_10 = Chimu_10+Chimu_20;\
|
|
||||||
Chi_11 = Chimu_11+Chimu_21;\
|
|
||||||
Chi_12 = Chimu_12+Chimu_22;\
|
|
||||||
std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
#define ZP_PROJ \
|
|
||||||
Chi_00 = Chimu_00+timesI(Chimu_20); \
|
|
||||||
Chi_01 = Chimu_01+timesI(Chimu_21); \
|
|
||||||
Chi_02 = Chimu_02+timesI(Chimu_22); \
|
|
||||||
Chi_10 = Chimu_10-timesI(Chimu_30); \
|
|
||||||
Chi_11 = Chimu_11-timesI(Chimu_31); \
|
|
||||||
Chi_12 = Chimu_12-timesI(Chimu_32);\
|
|
||||||
std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
#define TP_PROJ \
|
|
||||||
Chi_00 = Chimu_00+Chimu_20; \
|
|
||||||
Chi_01 = Chimu_01+Chimu_21; \
|
|
||||||
Chi_02 = Chimu_02+Chimu_22; \
|
|
||||||
Chi_10 = Chimu_10+Chimu_30; \
|
|
||||||
Chi_11 = Chimu_11+Chimu_31; \
|
|
||||||
Chi_12 = Chimu_12+Chimu_32;\
|
|
||||||
std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
// hspin(0)=fspin(0)-timesI(fspin(3));
|
|
||||||
// hspin(1)=fspin(1)-timesI(fspin(2));
|
|
||||||
#define XM_PROJ \
|
|
||||||
Chi_00 = Chimu_00-timesI(Chimu_30);\
|
|
||||||
Chi_01 = Chimu_01-timesI(Chimu_31);\
|
|
||||||
Chi_02 = Chimu_02-timesI(Chimu_32);\
|
|
||||||
Chi_10 = Chimu_10-timesI(Chimu_20);\
|
|
||||||
Chi_11 = Chimu_11-timesI(Chimu_21);\
|
|
||||||
Chi_12 = Chimu_12-timesI(Chimu_22);\
|
|
||||||
std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
#define YM_PROJ \
|
|
||||||
Chi_00 = Chimu_00+Chimu_30;\
|
|
||||||
Chi_01 = Chimu_01+Chimu_31;\
|
|
||||||
Chi_02 = Chimu_02+Chimu_32;\
|
|
||||||
Chi_10 = Chimu_10-Chimu_20;\
|
|
||||||
Chi_11 = Chimu_11-Chimu_21;\
|
|
||||||
Chi_12 = Chimu_12-Chimu_22;\
|
|
||||||
std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
#define ZM_PROJ \
|
|
||||||
Chi_00 = Chimu_00-timesI(Chimu_20); \
|
|
||||||
Chi_01 = Chimu_01-timesI(Chimu_21); \
|
|
||||||
Chi_02 = Chimu_02-timesI(Chimu_22); \
|
|
||||||
Chi_10 = Chimu_10+timesI(Chimu_30); \
|
|
||||||
Chi_11 = Chimu_11+timesI(Chimu_31); \
|
|
||||||
Chi_12 = Chimu_12+timesI(Chimu_32);\
|
|
||||||
std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
#define TM_PROJ \
|
|
||||||
Chi_00 = Chimu_00-Chimu_20; \
|
|
||||||
Chi_01 = Chimu_01-Chimu_21; \
|
|
||||||
Chi_02 = Chimu_02-Chimu_22; \
|
|
||||||
Chi_10 = Chimu_10-Chimu_30; \
|
|
||||||
Chi_11 = Chimu_11-Chimu_31; \
|
|
||||||
Chi_12 = Chimu_12-Chimu_32;\
|
|
||||||
std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \
|
|
||||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
|
||||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
|
||||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
|
||||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
|
||||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
|
||||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
|
||||||
|
|
||||||
// fspin(0)=hspin(0);
|
|
||||||
// fspin(1)=hspin(1);
|
|
||||||
// fspin(2)=timesMinusI(hspin(1));
|
|
||||||
// fspin(3)=timesMinusI(hspin(0));
|
|
||||||
#define XP_RECON\
|
|
||||||
result_00 = UChi_00;\
|
|
||||||
result_01 = UChi_01;\
|
|
||||||
result_02 = UChi_02;\
|
|
||||||
result_10 = UChi_10;\
|
|
||||||
result_11 = UChi_11;\
|
|
||||||
result_12 = UChi_12;\
|
|
||||||
result_20 = timesMinusI(UChi_10);\
|
|
||||||
result_21 = timesMinusI(UChi_11);\
|
|
||||||
result_22 = timesMinusI(UChi_12);\
|
|
||||||
result_30 = timesMinusI(UChi_00);\
|
|
||||||
result_31 = timesMinusI(UChi_01);\
|
|
||||||
result_32 = timesMinusI(UChi_02);\
|
|
||||||
std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define XP_RECON_ACCUM\
|
|
||||||
result_00+=UChi_00;\
|
|
||||||
result_01+=UChi_01;\
|
|
||||||
result_02+=UChi_02;\
|
|
||||||
result_10+=UChi_10;\
|
|
||||||
result_11+=UChi_11;\
|
|
||||||
result_12+=UChi_12;\
|
|
||||||
result_20-=timesI(UChi_10);\
|
|
||||||
result_21-=timesI(UChi_11);\
|
|
||||||
result_22-=timesI(UChi_12);\
|
|
||||||
result_30-=timesI(UChi_00);\
|
|
||||||
result_31-=timesI(UChi_01);\
|
|
||||||
result_32-=timesI(UChi_02);\
|
|
||||||
std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define XM_RECON\
|
|
||||||
result_00 = UChi_00;\
|
|
||||||
result_01 = UChi_01;\
|
|
||||||
result_02 = UChi_02;\
|
|
||||||
result_10 = UChi_10;\
|
|
||||||
result_11 = UChi_11;\
|
|
||||||
result_12 = UChi_12;\
|
|
||||||
result_20 = timesI(UChi_10);\
|
|
||||||
result_21 = timesI(UChi_11);\
|
|
||||||
result_22 = timesI(UChi_12);\
|
|
||||||
result_30 = timesI(UChi_00);\
|
|
||||||
result_31 = timesI(UChi_01);\
|
|
||||||
result_32 = timesI(UChi_02);\
|
|
||||||
std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define XM_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20+= timesI(UChi_10);\
|
|
||||||
result_21+= timesI(UChi_11);\
|
|
||||||
result_22+= timesI(UChi_12);\
|
|
||||||
result_30+= timesI(UChi_00);\
|
|
||||||
result_31+= timesI(UChi_01);\
|
|
||||||
result_32+= timesI(UChi_02);\
|
|
||||||
std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define YP_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20+= UChi_10;\
|
|
||||||
result_21+= UChi_11;\
|
|
||||||
result_22+= UChi_12;\
|
|
||||||
result_30-= UChi_00;\
|
|
||||||
result_31-= UChi_01;\
|
|
||||||
result_32-= UChi_02;\
|
|
||||||
std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define YM_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20-= UChi_10;\
|
|
||||||
result_21-= UChi_11;\
|
|
||||||
result_22-= UChi_12;\
|
|
||||||
result_30+= UChi_00;\
|
|
||||||
result_31+= UChi_01;\
|
|
||||||
result_32+= UChi_02;\
|
|
||||||
std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define ZP_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20-= timesI(UChi_00); \
|
|
||||||
result_21-= timesI(UChi_01); \
|
|
||||||
result_22-= timesI(UChi_02); \
|
|
||||||
result_30+= timesI(UChi_10); \
|
|
||||||
result_31+= timesI(UChi_11); \
|
|
||||||
result_32+= timesI(UChi_12);\
|
|
||||||
std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define ZM_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20+= timesI(UChi_00); \
|
|
||||||
result_21+= timesI(UChi_01); \
|
|
||||||
result_22+= timesI(UChi_02); \
|
|
||||||
result_30-= timesI(UChi_10); \
|
|
||||||
result_31-= timesI(UChi_11); \
|
|
||||||
result_32-= timesI(UChi_12);\
|
|
||||||
std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define TP_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20+= UChi_00; \
|
|
||||||
result_21+= UChi_01; \
|
|
||||||
result_22+= UChi_02; \
|
|
||||||
result_30+= UChi_10; \
|
|
||||||
result_31+= UChi_11; \
|
|
||||||
result_32+= UChi_12;\
|
|
||||||
std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define TM_RECON_ACCUM\
|
|
||||||
result_00+= UChi_00;\
|
|
||||||
result_01+= UChi_01;\
|
|
||||||
result_02+= UChi_02;\
|
|
||||||
result_10+= UChi_10;\
|
|
||||||
result_11+= UChi_11;\
|
|
||||||
result_12+= UChi_12;\
|
|
||||||
result_20-= UChi_00; \
|
|
||||||
result_21-= UChi_01; \
|
|
||||||
result_22-= UChi_02; \
|
|
||||||
result_30-= UChi_10; \
|
|
||||||
result_31-= UChi_11; \
|
|
||||||
result_32-= UChi_12;\
|
|
||||||
std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
|
||||||
offset = SE->_offset; \
|
|
||||||
local = SE->_is_local; \
|
|
||||||
perm = SE->_permute; \
|
|
||||||
if ( local ) { \
|
|
||||||
LOAD_CHIMU; \
|
|
||||||
PROJ; \
|
|
||||||
if ( perm) { \
|
|
||||||
PERMUTE_DIR(PERM); \
|
|
||||||
} \
|
|
||||||
} else { \
|
|
||||||
LOAD_CHI; \
|
|
||||||
} \
|
|
||||||
MULT_2SPIN(DIR); \
|
|
||||||
RECON;
|
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
|
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
|
||||||
offset = SE->_offset; \
|
|
||||||
local = SE->_is_local; \
|
|
||||||
perm = SE->_permute; \
|
|
||||||
if ( local ) { \
|
|
||||||
LOAD_CHIMU; \
|
|
||||||
PROJ; \
|
|
||||||
if ( perm) { \
|
|
||||||
PERMUTE_DIR(PERM); \
|
|
||||||
} \
|
|
||||||
} else if ( st.same_node[DIR] ) { \
|
|
||||||
LOAD_CHI; \
|
|
||||||
} \
|
|
||||||
if (local || st.same_node[DIR] ) { \
|
|
||||||
MULT_2SPIN(DIR); \
|
|
||||||
RECON; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
|
||||||
offset = SE->_offset; \
|
|
||||||
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
|
||||||
LOAD_CHI; \
|
|
||||||
MULT_2SPIN(DIR); \
|
|
||||||
RECON; \
|
|
||||||
nmu++; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HAND_RESULT(ss) \
|
|
||||||
{ \
|
|
||||||
SiteSpinor & ref (out[ss]); \
|
|
||||||
vstream(ref()(0)(0),result_00); \
|
|
||||||
vstream(ref()(0)(1),result_01); \
|
|
||||||
vstream(ref()(0)(2),result_02); \
|
|
||||||
vstream(ref()(1)(0),result_10); \
|
|
||||||
vstream(ref()(1)(1),result_11); \
|
|
||||||
vstream(ref()(1)(2),result_12); \
|
|
||||||
vstream(ref()(2)(0),result_20); \
|
|
||||||
vstream(ref()(2)(1),result_21); \
|
|
||||||
vstream(ref()(2)(2),result_22); \
|
|
||||||
vstream(ref()(3)(0),result_30); \
|
|
||||||
vstream(ref()(3)(1),result_31); \
|
|
||||||
vstream(ref()(3)(2),result_32); \
|
|
||||||
std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;\
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HAND_RESULT_EXT(ss) \
|
|
||||||
if (nmu){ \
|
|
||||||
SiteSpinor & ref (out[ss]); \
|
|
||||||
ref()(0)(0)+=result_00; \
|
|
||||||
ref()(0)(1)+=result_01; \
|
|
||||||
ref()(0)(2)+=result_02; \
|
|
||||||
ref()(1)(0)+=result_10; \
|
|
||||||
ref()(1)(1)+=result_11; \
|
|
||||||
ref()(1)(2)+=result_12; \
|
|
||||||
ref()(2)(0)+=result_20; \
|
|
||||||
ref()(2)(1)+=result_21; \
|
|
||||||
ref()(2)(2)+=result_22; \
|
|
||||||
ref()(3)(0)+=result_30; \
|
|
||||||
ref()(3)(1)+=result_31; \
|
|
||||||
ref()(3)(2)+=result_32; \
|
|
||||||
std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \
|
|
||||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
|
||||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
|
||||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
|
||||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
|
||||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
|
||||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
|
||||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
|
||||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
|
||||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
|
||||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
|
||||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
|
||||||
std::cout << "result_32 -- " << result_32 << std::endl;\
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Declares every local "register" variable used by the hand-unrolled kernels below:
//   result_XY : 12 accumulators for the output spinor (spin X = 0..3, colour Y = 0..2)
//   Chi_XY    :  6 half-spinor inputs        (spin 0..1, colour 0..2)
//   UChi_XY   :  6 half-spinor link products (spin 0..1, colour 0..2)
//   U_XY      :  6 gauge-link temporaries
//   debugreg  :  scratch variable (not referenced by the macros visible in this part of the file)
//   pg1       :  SVE predicate, set to svptrue_b64()
// The macro argument `a` is unused; callers pass a dummy token.
// The last line deliberately carries NO line-continuation backslash, so the line that
// follows the macro definition can never be folded into its body.
#define HAND_DECLARATIONS(a) \
  Simd result_00; \
  Simd result_01; \
  Simd result_02; \
  Simd result_10; \
  Simd result_11; \
  Simd result_12; \
  Simd result_20; \
  Simd result_21; \
  Simd result_22; \
  Simd result_30; \
  Simd result_31; \
  Simd result_32; \
  Simd Chi_00; \
  Simd Chi_01; \
  Simd Chi_02; \
  Simd Chi_10; \
  Simd Chi_11; \
  Simd Chi_12; \
  Simd UChi_00; \
  Simd UChi_01; \
  Simd UChi_02; \
  Simd UChi_10; \
  Simd UChi_11; \
  Simd UChi_12; \
  Simd U_00; \
  Simd U_10; \
  Simd U_20; \
  Simd U_01; \
  Simd U_11; \
  Simd U_21; \
  Simd debugreg; \
  svbool_t pg1; \
  pg1 = svptrue_b64();
|
|
||||||
|
|
||||||
#define ZERO_RESULT \
|
|
||||||
result_00=Zero(); \
|
|
||||||
result_01=Zero(); \
|
|
||||||
result_02=Zero(); \
|
|
||||||
result_10=Zero(); \
|
|
||||||
result_11=Zero(); \
|
|
||||||
result_12=Zero(); \
|
|
||||||
result_20=Zero(); \
|
|
||||||
result_21=Zero(); \
|
|
||||||
result_22=Zero(); \
|
|
||||||
result_30=Zero(); \
|
|
||||||
result_31=Zero(); \
|
|
||||||
result_32=Zero();
|
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
|
||||||
#define Chimu_01 Chi_01
|
|
||||||
#define Chimu_02 Chi_02
|
|
||||||
#define Chimu_10 Chi_10
|
|
||||||
#define Chimu_11 Chi_11
|
|
||||||
#define Chimu_12 Chi_12
|
|
||||||
#define Chimu_20 UChi_00
|
|
||||||
#define Chimu_21 UChi_01
|
|
||||||
#define Chimu_22 UChi_02
|
|
||||||
#define Chimu_30 UChi_10
|
|
||||||
#define Chimu_31 UChi_11
|
|
||||||
#define Chimu_32 UChi_12
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
// Hand-unrolled site kernel: processes all eight stencil legs for site `ss` in one pass.
// Each HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) fetches the stencil entry for DIR, then either
// runs LOAD_CHIMU + PROJ (+ PERMUTE_DIR(PERM) when the entry asks for a permute) for a local
// entry, or LOAD_CHI otherwise, followed by MULT_2SPIN(DIR) and RECON.
// The +direction legs use the *M projectors and the -direction legs use the *P projectors.
// The first leg uses XM_RECON, which assigns result_XY, so no ZERO_RESULT is required here;
// the remaining seven legs accumulate. HAND_RESULT(ss) finally streams result_XY into out[ss].
// NOTE(review): the RECON and HAND_RESULT macros in this file also print their registers to
// std::cout on every call - presumably temporary debugging output; confirm before merging.
template<class Impl> void
|
|
||||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
|
||||||
|
|
||||||
int offset,local,perm, ptype;
|
|
||||||
StencilEntry *SE;
|
|
||||||
|
|
||||||
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
|
||||||
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
|
||||||
HAND_RESULT(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Counterpart of HandDhopSite with the projector/reconstruct families swapped: the
// +direction legs (Xp..Tp) use the *P projectors and the -direction legs (Xm..Tm) use the
// *M projectors. The second HAND_STENCIL_LEG argument is the permute level passed to
// PERMUTE_DIR (T==0, Z==1, Y==2, X==3).
// The first leg uses XP_RECON, which assigns result_XY, so no ZERO_RESULT is required;
// the remaining seven legs accumulate. HAND_RESULT(ss) streams result_XY into out[ss].
// NOTE(review): the RECON and HAND_RESULT macros in this file also print their registers to
// std::cout on every call - presumably temporary debugging output; confirm before merging.
template<class Impl>
|
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset,local,perm, ptype;
|
|
||||||
|
|
||||||
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
|
|
||||||
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
|
||||||
HAND_RESULT(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
// "Interior" variant of HandDhopSite. HAND_STENCIL_LEG_INT only runs MULT_2SPIN and RECON
// for a leg when its stencil entry is local or st.same_node[DIR] is set; other legs are
// skipped entirely. Because any leg (including the first) may be skipped, the accumulators
// are cleared with ZERO_RESULT up front and every leg uses an *_ACCUM reconstruct.
// HAND_RESULT(ss) then streams result_XY into out[ss].
// NOTE(review): the RECON and HAND_RESULT macros in this file also print their registers to
// std::cout on every call - presumably temporary debugging output; confirm before merging.
template<class Impl> void
|
|
||||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
|
||||||
|
|
||||||
int offset,local,perm, ptype;
|
|
||||||
StencilEntry *SE;
|
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
|
||||||
HAND_RESULT(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
// "Interior" variant of HandDhopSiteDag: +direction legs use the *P projectors, -direction
// legs use the *M projectors. HAND_STENCIL_LEG_INT only runs MULT_2SPIN and RECON for a leg
// when its stencil entry is local or st.same_node[DIR] is set, so the accumulators are
// cleared with ZERO_RESULT first and every leg uses an *_ACCUM reconstruct.
// HAND_RESULT(ss) then streams result_XY into out[ss].
// NOTE(review): the RECON and HAND_RESULT macros in this file also print their registers to
// std::cout on every call - presumably temporary debugging output; confirm before merging.
template<class Impl>
|
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset,local,perm, ptype;
|
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
|
||||||
HAND_RESULT(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
// "Exterior" variant of HandDhopSite. HAND_STENCIL_LEG_EXT only processes a leg when its
// stencil entry is neither local nor on the same node (LOAD_CHI, MULT_2SPIN, RECON), and
// increments nmu for each leg it processes. The PROJ and PERM macro arguments are accepted
// but not used by HAND_STENCIL_LEG_EXT, which is why `local` and `perm` are not declared here.
// Accumulators start from ZERO_RESULT; HAND_RESULT_EXT(ss) adds them into out[ss] (+=) and
// does so only when nmu is non-zero, i.e. when at least one leg contributed.
// NOTE(review): the RECON and HAND_RESULT_EXT macros in this file also print their registers
// to std::cout on every call - presumably temporary debugging output; confirm before merging.
template<class Impl> void
|
|
||||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
|
||||||
|
|
||||||
int offset, ptype;
|
|
||||||
StencilEntry *SE;
|
|
||||||
int nmu=0;
|
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
|
||||||
HAND_RESULT_EXT(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
// "Exterior" variant of HandDhopSiteDag: +direction legs use the *P reconstructs, -direction
// legs use the *M reconstructs. HAND_STENCIL_LEG_EXT only processes a leg when its stencil
// entry is neither local nor on the same node, and increments nmu for each such leg; its
// PROJ and PERM arguments are accepted but unused.
// Accumulators start from ZERO_RESULT; HAND_RESULT_EXT(ss) adds them into out[ss] (+=) and
// does so only when nmu is non-zero, i.e. when at least one leg contributed.
// NOTE(review): the RECON and HAND_RESULT_EXT macros in this file also print their registers
// to std::cout on every call - presumably temporary debugging output; confirm before merging.
template<class Impl>
|
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
|
||||||
{
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset, ptype;
|
|
||||||
int nmu=0;
|
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
|
||||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
|
||||||
HAND_RESULT_EXT(ss);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////// Wilson ; uses this implementation /////////////////////
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
#undef LOAD_CHIMU
|
|
||||||
#undef LOAD_CHI
|
|
||||||
#undef MULT_2SPIN
|
|
||||||
#undef PERMUTE_DIR
|
|
||||||
#undef XP_PROJ
|
|
||||||
#undef YP_PROJ
|
|
||||||
#undef ZP_PROJ
|
|
||||||
#undef TP_PROJ
|
|
||||||
#undef XM_PROJ
|
|
||||||
#undef YM_PROJ
|
|
||||||
#undef ZM_PROJ
|
|
||||||
#undef TM_PROJ
|
|
||||||
#undef XP_RECON
|
|
||||||
#undef XP_RECON_ACCUM
|
|
||||||
#undef XM_RECON
|
|
||||||
#undef XM_RECON_ACCUM
|
|
||||||
#undef YP_RECON_ACCUM
|
|
||||||
#undef YM_RECON_ACCUM
|
|
||||||
#undef ZP_RECON_ACCUM
|
|
||||||
#undef ZM_RECON_ACCUM
|
|
||||||
#undef TP_RECON_ACCUM
|
|
||||||
#undef TM_RECON_ACCUM
|
|
||||||
#undef ZERO_RESULT
|
|
||||||
#undef Chimu_00
|
|
||||||
#undef Chimu_01
|
|
||||||
#undef Chimu_02
|
|
||||||
#undef Chimu_10
|
|
||||||
#undef Chimu_11
|
|
||||||
#undef Chimu_12
|
|
||||||
#undef Chimu_20
|
|
||||||
#undef Chimu_21
|
|
||||||
#undef Chimu_22
|
|
||||||
#undef Chimu_30
|
|
||||||
#undef Chimu_31
|
|
||||||
#undef Chimu_32
|
|
||||||
#undef HAND_STENCIL_LEG
|
|
||||||
#undef HAND_STENCIL_LEG_INT
|
|
||||||
#undef HAND_STENCIL_LEG_EXT
|
|
||||||
#undef HAND_RESULT
|
|
||||||
#undef HAND_RESULT_INT
|
|
||||||
#undef HAND_RESULT_EXT
|
|
@ -39,21 +39,19 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Generic implementation; move to different file?
|
// Generic implementation; move to different file?
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
|
||||||
/*
|
|
||||||
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||||
{
|
{
|
||||||
#ifdef GRID_SIMT
|
#ifdef __CUDA_ARCH__
|
||||||
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
||||||
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
||||||
uint4 * chip_pun = (uint4 *)&chip;
|
uint4 * chip_pun = (uint4 *)&chip;
|
||||||
* chip_pun = * mem_pun;
|
* chip_pun = * mem_pun;
|
||||||
#else
|
#else
|
||||||
chip = *mem;
|
chip = *mem;
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
||||||
SE = st.GetEntry(ptype, Dir, sF); \
|
SE = st.GetEntry(ptype, Dir, sF); \
|
||||||
if (SE->_is_local) { \
|
if (SE->_is_local) { \
|
||||||
@ -63,10 +61,10 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
} else { \
|
} else { \
|
||||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
} \
|
} \
|
||||||
acceleratorSynchronise(); \
|
synchronise(); \
|
||||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||||
Recon(result, Uchi);
|
Recon(result, Uchi);
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
|
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
|
||||||
SE = st.GetEntry(ptype, Dir, sF); \
|
SE = st.GetEntry(ptype, Dir, sF); \
|
||||||
if (SE->_is_local) { \
|
if (SE->_is_local) { \
|
||||||
@ -76,12 +74,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
} else if ( st.same_node[Dir] ) { \
|
} else if ( st.same_node[Dir] ) { \
|
||||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
} \
|
} \
|
||||||
acceleratorSynchronise(); \
|
synchronise(); \
|
||||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||||
Recon(result, Uchi); \
|
Recon(result, Uchi); \
|
||||||
} \
|
} \
|
||||||
acceleratorSynchronise();
|
synchronise();
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
|
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
|
||||||
SE = st.GetEntry(ptype, Dir, sF); \
|
SE = st.GetEntry(ptype, Dir, sF); \
|
||||||
@ -91,7 +89,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
Recon(result, Uchi); \
|
Recon(result, Uchi); \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
} \
|
} \
|
||||||
acceleratorSynchronise();
|
synchronise();
|
||||||
|
|
||||||
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
|
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
|
||||||
if (SE->_is_local ) { \
|
if (SE->_is_local ) { \
|
||||||
@ -101,9 +99,9 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
} else { \
|
} else { \
|
||||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
} \
|
} \
|
||||||
acceleratorSynchronise(); \
|
synchronise(); \
|
||||||
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
|
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
|
||||||
Recon(result, Uchi);
|
Recon(result, Uchi);
|
||||||
|
|
||||||
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
|
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
|
||||||
if (gamma == Dir) { \
|
if (gamma == Dir) { \
|
||||||
@ -114,7 +112,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// All legs kernels ; comms then compute
|
// All legs kernels ; comms then compute
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -128,7 +126,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
|
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
|
||||||
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
|
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
|
||||||
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
|
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
|
||||||
@ -140,10 +138,10 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
coalescedWrite(out[sF],result,lane);
|
coalescedWrite(out[sF],result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
||||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
@ -155,7 +153,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
int ptype;
|
int ptype;
|
||||||
|
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
|
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
|
||||||
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
|
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
|
||||||
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
|
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
|
||||||
@ -169,7 +167,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Interior kernels
|
// Interior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -183,7 +181,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
|
|
||||||
result=Zero();
|
result=Zero();
|
||||||
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
|
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
|
||||||
@ -197,15 +195,15 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
coalescedWrite(out[sF], result,lane);
|
coalescedWrite(out[sF], result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
||||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
|
|
||||||
calcHalfSpinor chi;
|
calcHalfSpinor chi;
|
||||||
// calcHalfSpinor *chi_p;
|
// calcHalfSpinor *chi_p;
|
||||||
@ -227,7 +225,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Exterior kernels
|
// Exterior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -241,7 +239,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
int ptype;
|
int ptype;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
result=Zero();
|
result=Zero();
|
||||||
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
|
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
|
||||||
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
|
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
|
||||||
@ -251,17 +249,17 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
|
GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
|
||||||
GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
auto out_t = coalescedRead(out[sF],lane);
|
auto out_t = coalescedRead(out[sF],lane);
|
||||||
out_t = out_t + result;
|
out_t = out_t + result;
|
||||||
coalescedWrite(out[sF],out_t,lane);
|
coalescedWrite(out[sF],out_t,lane);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
||||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
@ -272,7 +270,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
int ptype;
|
int ptype;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
result=Zero();
|
result=Zero();
|
||||||
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
|
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
|
||||||
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
|
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
|
||||||
@ -282,7 +280,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
|
GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
|
||||||
GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
auto out_t = coalescedRead(out[sF],lane);
|
auto out_t = coalescedRead(out[sF],lane);
|
||||||
out_t = out_t + result;
|
out_t = out_t + result;
|
||||||
coalescedWrite(out[sF],out_t,lane);
|
coalescedWrite(out[sF],out_t,lane);
|
||||||
@ -290,7 +288,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
};
|
};
|
||||||
|
|
||||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||||
template <class Impl> accelerator_inline \
|
template <class Impl> \
|
||||||
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||||
{ \
|
{ \
|
||||||
@ -302,12 +300,12 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
StencilEntry *SE; \
|
StencilEntry *SE; \
|
||||||
int ptype; \
|
int ptype; \
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd(); \
|
const int Nsimd = SiteHalfSpinor::Nsimd(); \
|
||||||
const int lane=acceleratorSIMTlane(Nsimd); \
|
const int lane=SIMTlane(Nsimd); \
|
||||||
\
|
\
|
||||||
SE = st.GetEntry(ptype, dir, sF); \
|
SE = st.GetEntry(ptype, dir, sF); \
|
||||||
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
|
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
|
||||||
coalescedWrite(out[sF], result,lane); \
|
coalescedWrite(out[sF], result,lane); \
|
||||||
}
|
}
|
||||||
|
|
||||||
DhopDirMacro(Xp,spProjXp,spReconXp);
|
DhopDirMacro(Xp,spProjXp,spReconXp);
|
||||||
DhopDirMacro(Yp,spProjYp,spReconYp);
|
DhopDirMacro(Yp,spProjYp,spReconYp);
|
||||||
@ -318,9 +316,9 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
|
|||||||
DhopDirMacro(Zm,spProjZm,spReconZm);
|
DhopDirMacro(Zm,spProjZm,spReconZm);
|
||||||
DhopDirMacro(Tm,spProjTm,spReconTm);
|
DhopDirMacro(Tm,spProjTm,spReconTm);
|
||||||
|
|
||||||
template <class Impl> accelerator_inline
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||||
{
|
{
|
||||||
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
||||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
@ -330,7 +328,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
const int lane=SIMTlane(Nsimd);
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, dir, sF);
|
SE = st.GetEntry(ptype, dir, sF);
|
||||||
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
|
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
|
||||||
@ -346,55 +344,54 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
||||||
int Nsite, const FermionField &in, std::vector<FermionField> &out)
|
int Nsite, const FermionField &in, std::vector<FermionField> &out)
|
||||||
{
|
{
|
||||||
autoView(U_v ,U,AcceleratorRead);
|
auto U_v = U.View();
|
||||||
autoView(in_v ,in,AcceleratorRead);
|
auto in_v = in.View();
|
||||||
autoView(st_v ,st,AcceleratorRead);
|
auto st_v = st.View();
|
||||||
|
|
||||||
autoView(out_Xm,out[0],AcceleratorWrite);
|
auto out_Xm = out[0].View();
|
||||||
autoView(out_Ym,out[1],AcceleratorWrite);
|
auto out_Ym = out[1].View();
|
||||||
autoView(out_Zm,out[2],AcceleratorWrite);
|
auto out_Zm = out[2].View();
|
||||||
autoView(out_Tm,out[3],AcceleratorWrite);
|
auto out_Tm = out[3].View();
|
||||||
autoView(out_Xp,out[4],AcceleratorWrite);
|
auto out_Xp = out[4].View();
|
||||||
autoView(out_Yp,out[5],AcceleratorWrite);
|
auto out_Yp = out[5].View();
|
||||||
autoView(out_Zp,out[6],AcceleratorWrite);
|
auto out_Zp = out[6].View();
|
||||||
autoView(out_Tp,out[7],AcceleratorWrite);
|
auto out_Tp = out[7].View();
|
||||||
auto CBp=st.CommBuf();
|
|
||||||
accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
|
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
|
||||||
int sU=sss/Ls;
|
int sU=sss/Ls;
|
||||||
int sF =sss;
|
int sF =sss;
|
||||||
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
|
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
|
||||||
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
|
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
|
||||||
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
|
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
|
||||||
DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
|
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
|
||||||
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
|
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
|
||||||
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
|
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
|
||||||
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
|
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
|
||||||
DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
|
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
||||||
int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
|
int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
|
||||||
{
|
{
|
||||||
assert(dirdisp<=7);
|
assert(dirdisp<=7);
|
||||||
assert(dirdisp>=0);
|
assert(dirdisp>=0);
|
||||||
|
|
||||||
autoView(U_v ,U ,AcceleratorRead);
|
auto U_v = U.View();
|
||||||
autoView(in_v ,in ,AcceleratorRead);
|
auto in_v = in.View();
|
||||||
autoView(out_v,out,AcceleratorWrite);
|
auto out_v = out.View();
|
||||||
autoView(st_v ,st ,AcceleratorRead);
|
auto st_v = st.View();
|
||||||
auto CBp=st.CommBuf();
|
|
||||||
#define LoopBody(Dir) \
|
#define LoopBody(Dir) \
|
||||||
case Dir : \
|
case Dir : \
|
||||||
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
|
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
|
||||||
for(int s=0;s<Ls;s++){ \
|
for(int s=0;s<Ls;s++){ \
|
||||||
int sU=ss; \
|
int sU=ss; \
|
||||||
int sF = s+Ls*sU; \
|
int sF = s+Ls*sU; \
|
||||||
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
|
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
|
||||||
} \
|
} \
|
||||||
}); \
|
}); \
|
||||||
break;
|
break;
|
||||||
@ -414,7 +411,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#undef LoopBody
|
#undef LoopBody
|
||||||
}
|
}
|
||||||
|
|
||||||
#define KERNEL_CALLNB(A) \
|
#define KERNEL_CALLNB(A) \
|
||||||
const uint64_t NN = Nsite*Ls; \
|
const uint64_t NN = Nsite*Ls; \
|
||||||
@ -424,7 +421,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||||
});
|
});
|
||||||
|
|
||||||
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
||||||
|
|
||||||
#define ASM_CALL(A) \
|
#define ASM_CALL(A) \
|
||||||
thread_for( ss, Nsite, { \
|
thread_for( ss, Nsite, { \
|
||||||
@ -436,28 +433,28 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior,int exterior)
|
int interior,int exterior)
|
||||||
{
|
{
|
||||||
autoView(U_v , U,AcceleratorRead);
|
auto U_v = U.View();
|
||||||
autoView(in_v , in,AcceleratorRead);
|
auto in_v = in.View();
|
||||||
autoView(out_v,out,AcceleratorWrite);
|
auto out_v = out.View();
|
||||||
autoView(st_v , st,AcceleratorRead);
|
auto st_v = st.View();
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||||
#endif
|
#endif
|
||||||
@ -467,28 +464,28 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior,int exterior)
|
int interior,int exterior)
|
||||||
{
|
{
|
||||||
autoView(U_v ,U,AcceleratorRead);
|
auto U_v = U.View();
|
||||||
autoView(in_v ,in,AcceleratorRead);
|
auto in_v = in.View();
|
||||||
autoView(out_v,out,AcceleratorWrite);
|
auto out_v = out.View();
|
||||||
autoView(st_v ,st,AcceleratorRead);
|
auto st_v = st.View();
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
||||||
#endif
|
#endif
|
||||||
@ -496,8 +493,5 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
assert(0 && " Kernel optimisation case not covered ");
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef KERNEL_CALLNB
|
|
||||||
#undef KERNEL_CALL
|
|
||||||
#undef ASM_CALL
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -1,36 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi, Peter Boyle
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
|
||||||
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -1,37 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Azusa Yamaguchi, Peter Boyle
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class NaiveStaggeredFermion<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -1 +0,0 @@
|
|||||||
../NaiveStaggeredFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../NaiveStaggeredFermionInstantiation.cc.master
|
|
@ -1,51 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015, 2020
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
|
||||||
|
|
||||||
#ifndef AVX512
|
|
||||||
#ifndef QPX
|
|
||||||
#ifndef A64FX
|
|
||||||
#ifndef A64FXFIXEDSIZE
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class WilsonKernels<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -0,0 +1 @@
|
|||||||
|
../WilsonKernelsInstantiation.cc.master
|
@ -1,51 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015, 2020
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
|
||||||
|
|
||||||
#ifndef AVX512
|
|
||||||
#ifndef QPX
|
|
||||||
#ifndef A64FX
|
|
||||||
#ifndef A64FXFIXEDSIZE
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class WilsonKernels<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -0,0 +1 @@
|
|||||||
|
../WilsonKernelsInstantiation.cc.master
|
@ -1,51 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015, 2020
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
|
||||||
|
|
||||||
#ifndef AVX512
|
|
||||||
#ifndef QPX
|
|
||||||
#ifndef A64FX
|
|
||||||
#ifndef A64FXFIXEDSIZE
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class WilsonKernels<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -0,0 +1 @@
|
|||||||
|
../WilsonKernelsInstantiation.cc.master
|
@ -1,51 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015, 2020
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
|
||||||
|
|
||||||
#ifndef AVX512
|
|
||||||
#ifndef QPX
|
|
||||||
#ifndef A64FX
|
|
||||||
#ifndef A64FXFIXEDSIZE
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class WilsonKernels<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -0,0 +1 @@
|
|||||||
|
../WilsonKernelsInstantiation.cc.master
|
@ -1,51 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015, 2020
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
|
||||||
|
|
||||||
#ifndef AVX512
|
|
||||||
#ifndef QPX
|
|
||||||
#ifndef A64FX
|
|
||||||
#ifndef A64FXFIXEDSIZE
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class WilsonKernels<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user