mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-15 06:17:05 +01:00
Compare commits
351 Commits
feature/ei
...
feature/be
Author | SHA1 | Date | |
---|---|---|---|
1ba25a0d8c | |||
9ba3647bdf | |||
5ee832f738 | |||
e9c5a271a8 | |||
acac2d6938 | |||
ace9cd64bb | |||
a3e2aeb603 | |||
049dd25785 | |||
d43d372294 | |||
b71a081cba | |||
c48909590b | |||
446ef40570 | |||
81441e98f4 | |||
ecd3f890f5 | |||
1c881ce23c | |||
dacbbdd051 | |||
2859955a03 | |||
cc220abd1d | |||
d1c0c0197e | |||
fd9424ef27 | |||
a5c35c4024 | |||
e03b64dc06 | |||
4677c40195 | |||
288c615782 | |||
48e81cf6f8 | |||
65b724bb5f | |||
6dbd117aa5 | |||
198b29f618 | |||
a8309638d4 | |||
f98a4e880e | |||
8244caff25 | |||
bcd7895362 | |||
85b1c5df39 | |||
b4255140d6 | |||
0c3095e173 | |||
d3ce60713d | |||
eac1f08b7b | |||
1654c4f3c0 | |||
8807d998bc | |||
5791021dcd | |||
c273fb051c | |||
c545530170 | |||
d982a5b6d5 | |||
15ca8637f3 | |||
cbc995b74c | |||
8b74174d74 | |||
e21fef17df | |||
3d27708f07 | |||
b918744184 | |||
7d14a3c086 | |||
e14a84317d | |||
6c31b99f1f | |||
9522dcd611 | |||
ed469898dc | |||
1eee94a809 | |||
54523369a3 | |||
a98c91c2a5 | |||
a9b92867a8 | |||
65920faeba | |||
3448b7387c | |||
47b89d2739 | |||
1efe30d6cc | |||
0b787e9fe0 | |||
37ec4b241c | |||
90ea7dfa99 | |||
f866d7c33e | |||
542bdef198 | |||
06007db3d9 | |||
12e6059a70 | |||
dbaa24ebf6 | |||
3276aa67dc | |||
3b30b9f0c0 | |||
69db4816f7 | |||
3abe09025a | |||
e33878e0de | |||
27b4fbf3f0 | |||
968a90633a | |||
6365a89ba3 | |||
ddbb008694 | |||
7997e0a449 | |||
197612bc7a | |||
0e88bf4bff | |||
3e64d78469 | |||
2004611def | |||
a2868c96a4 | |||
7cf7f11e1a | |||
ea7f8fda5e | |||
906b78811b | |||
97703b181b | |||
d9474c6cb6 | |||
bbd145382b | |||
1b08cb7300 | |||
337d9dc043 | |||
8726e94ea7 | |||
67db4993c2 | |||
f1f655d92b | |||
43334e88c3 | |||
4f1e66b044 | |||
fd3c8b0e85 | |||
1635c263ee | |||
64fe5b21b4 | |||
ee9889821d | |||
eb470aa6dc | |||
77af9a3ddc | |||
102089798c | |||
39cea8b5a7 | |||
a65f66d2db | |||
936c5ecf69 | |||
22cfbdbbb3 | |||
093d1ee21b | |||
d6ba2581ce | |||
577c064184 | |||
2ff1fa6fad | |||
70be1bd8be | |||
4ef50ba31f | |||
3e97a26f90 | |||
599f28f6ef | |||
c48da35921 | |||
6c5fa8dcd8 | |||
0d2f913a1a | |||
5b117865b2 | |||
1a74816c25 | |||
73de335256 | |||
228fd450ce | |||
b949cf6b12 | |||
11bc1aeadc | |||
66005929af | |||
05bbc49a99 | |||
ff7c847735 | |||
1aa988b2af | |||
edf17708a8 | |||
81a8209749 | |||
a87e45ba25 | |||
465856331a | |||
cc958aa9ed | |||
f46f029dbb | |||
3dccd7aa2c | |||
a25e4b3d0c | |||
d1210ca12a | |||
36ea0e222a | |||
65e6e7da6f | |||
b5e87e8d97 | |||
5f5807d60a | |||
92281ec22d | |||
87266ce099 | |||
2a23f133e8 | |||
8dbf790f62 | |||
2402b4940e | |||
2111052fbe | |||
7974acff54 | |||
f0d17d2b49 | |||
244c003a1b | |||
0174f5f742 | |||
32b2b59be4 | |||
86bb0cc24b | |||
84c19587e7 | |||
237ce92540 | |||
a7ffc61e82 | |||
fd97f64612 | |||
8720aecb80 | |||
cdf0a04fc5 | |||
616d3dd737 | |||
8b066baca8 | |||
e97f3688db | |||
433766ac62 | |||
93a37c8f68 | |||
89a1e78390 | |||
ffbb3fc02c | |||
5a73ef3647 | |||
87e5d2f4b7 | |||
d720f10758 | |||
14fcd0912a | |||
3111c0bd4f | |||
e03064490e | |||
1a4c8c3387 | |||
2b1e259441 | |||
f39c2a240b | |||
0d95805cde | |||
f67830587f | |||
6bf7f839ff | |||
e3147881a9 | |||
9872c76825 | |||
fb559614ad | |||
e93e12b6a4 | |||
0c3112cd94 | |||
8cfd5d2639 | |||
1c9f20b15e | |||
32237895bd | |||
5ee3ea2144 | |||
9fcb47ee63 | |||
5050833b42 | |||
7bee4ebb54 | |||
71cf9851e7 | |||
b4735c9904 | |||
9b2699226c | |||
5f52804907 | |||
936071773e | |||
1732f9319e | |||
91c81cab30 | |||
38164f8480 | |||
f013979791 | |||
e947b563ea | |||
5cb3530c34 | |||
250008372f | |||
1d252d0922 | |||
006cc8a8f1 | |||
4fedd8d29f | |||
cf2938688a | |||
ee63721bad | |||
22c5168d70 | |||
949ac3cd24 | |||
7bc0166c1c | |||
cb0d1b3399 | |||
d1f1ccc705 | |||
c7519a237a | |||
32be2b13d3 | |||
92b342a477 | |||
556da86ac3 | |||
8285e41574 | |||
f999408e92 | |||
a7abda89e2 | |||
7860a50f70 | |||
6ddcef1bca | |||
8c5a5fdfce | |||
046b1cbbc0 | |||
a65ce237c1 | |||
cd27f1005d | |||
f8c0a59221 | |||
832485699f | |||
81484a4760 | |||
9a86059761 | |||
b780b7b7a0 | |||
9e085bd04e | |||
6c6812a5ca | |||
8358ee38c4 | |||
1f154fe652 | |||
d708c0258d | |||
a7635fd5ba | |||
6b6bf537d3 | |||
323a651c71 | |||
9f212679f1 | |||
032f7dde1a | |||
ebb60330c9 | |||
50b1db1e8b | |||
015d8bb38a | |||
10a34312dc | |||
db8c0e7584 | |||
32fbdf4fb1 | |||
a9847aa866 | |||
d15ccad8a7 | |||
0009b5cee8 | |||
20d1941a45 | |||
d24d8e8398 | |||
162e4bb567 | |||
07c0c02f8c | |||
8c31c065b5 | |||
b7c76ede29 | |||
05edf803bd | |||
b1c86900b2 | |||
78b8e40f83 | |||
fc2e9850d3 | |||
ffaaed679e | |||
bbbee5660d | |||
b2fd8b993a | |||
291ee8c3d0 | |||
e1a5b3ea49 | |||
55a55660cb | |||
52081acfa5 | |||
f8b8e00090 | |||
ceb8b374da | |||
4bc2ad2894 | |||
798af3e68f | |||
b0ef2367f3 | |||
71a7350a85 | |||
6f79369955 | |||
f9cb6b979f | |||
ed4d9d17f8 | |||
fbed02690d | |||
39f3ae5b1d | |||
e64bec8c8e | |||
0893b4e552 | |||
92f0f29670 | |||
48a340a9d1 | |||
f45621109b | |||
32d1a0bbea | |||
267cce66a1 | |||
3417147b11 | |||
b338719bc8 | |||
2b81cbe2c2 | |||
acff9d6ed2 | |||
a306a49788 | |||
7ef03c5368 | |||
28a1fcaaff | |||
5abec5b8a9 | |||
499edc0636 | |||
d990e61be3 | |||
3edb2dc2da | |||
345721220e | |||
6db68d6ecb | |||
09f0963d1f | |||
6f44e3c192 | |||
5893888f87 | |||
39b448affb | |||
e54a8f05a9 | |||
64b72fc17f | |||
6fdce60492 | |||
852db4626a | |||
6504a098cc | |||
79a385faca | |||
c12a67030a | |||
581392f2f2 | |||
113f277b6a | |||
974586bedc | |||
160f78c1e4 | |||
7e4e1bbbc2 | |||
e699b7e9f9 | |||
a28bc0de90 | |||
14d0fe4d6c | |||
0ad2e0815c | |||
1c8ca05e16 | |||
dc9c8340bb | |||
19eef97503 | |||
635246ce50 | |||
5cdbb7e71e | |||
8123590a1b | |||
86c9c4da8b | |||
cd1efee866 | |||
bd310932f7 | |||
304762e7ac | |||
d79ab03a6c | |||
d5708e0eb2 | |||
123f6b7a61 | |||
2b6457dd9a | |||
b367cbd422 | |||
e252c1aca3 | |||
b140c6a4f9 | |||
326de36467 | |||
9f224a1647 | |||
bb46ba9b5f | |||
dd5a22b36b | |||
1ea85b9972 | |||
8fb63f1c25 | |||
77fa586f6c | |||
15238e8d5e | |||
b27e31957a | |||
46927771e3 | |||
d8cea77707 | |||
5f8a76d490 | |||
28d49a3b60 | |||
b4c624ece6 | |||
2c22db841a |
@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/perfmon/PerfCount.h>
|
||||
#include <Grid/util/Util.h>
|
||||
#include <Grid/log/Log.h>
|
||||
#include <Grid/allocator/AlignedAllocator.h>
|
||||
#include <Grid/allocator/Allocator.h>
|
||||
#include <Grid/simd/Simd.h>
|
||||
#include <Grid/threads/Threads.h>
|
||||
#include <Grid/threads/ThreadReduction.h>
|
||||
#include <Grid/serialisation/Serialisation.h>
|
||||
#include <Grid/util/Sha.h>
|
||||
#include <Grid/communicator/Communicator.h>
|
||||
|
@ -6,6 +6,7 @@
|
||||
///////////////////
|
||||
#include <cassert>
|
||||
#include <complex>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <string>
|
||||
|
@ -18,21 +18,28 @@
|
||||
#pragma push_macro("__CUDA_ARCH__")
|
||||
#pragma push_macro("__NVCC__")
|
||||
#pragma push_macro("__CUDACC__")
|
||||
#undef __CUDA_ARCH__
|
||||
#undef __NVCC__
|
||||
#undef __CUDACC__
|
||||
#undef __CUDA_ARCH__
|
||||
#define __NVCC__REDEFINE__
|
||||
#endif
|
||||
|
||||
/* SYCL save and restore compile environment*/
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
#ifdef GRID_SYCL
|
||||
#pragma push
|
||||
#pragma push_macro("__SYCL_DEVICE_ONLY__")
|
||||
#undef __SYCL_DEVICE_ONLY__
|
||||
#undef EIGEN_USE_SYCL
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
//#undef EIGEN_USE_SYCL
|
||||
#define __SYCL__REDEFINE__
|
||||
#endif
|
||||
|
||||
/* HIP save and restore compile environment*/
|
||||
#ifdef GRID_HIP
|
||||
#pragma push
|
||||
#pragma push_macro("__HIP_DEVICE_COMPILE__")
|
||||
#endif
|
||||
#define EIGEN_NO_HIP
|
||||
|
||||
#include <Grid/Eigen/Dense>
|
||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||
@ -51,6 +58,12 @@
|
||||
#pragma pop
|
||||
#endif
|
||||
|
||||
/*HIP restore*/
|
||||
#ifdef __HIP__REDEFINE__
|
||||
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
|
||||
#pragma pop
|
||||
#endif
|
||||
|
||||
#if defined __GNUC__
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
@ -21,7 +21,7 @@ if BUILD_HDF5
|
||||
extra_headers+=serialisation/Hdf5Type.h
|
||||
endif
|
||||
|
||||
all: version-cache
|
||||
all: version-cache Version.h
|
||||
|
||||
version-cache:
|
||||
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
|
||||
@ -42,7 +42,7 @@ version-cache:
|
||||
fi;\
|
||||
rm -f vertmp
|
||||
|
||||
Version.h:
|
||||
Version.h: version-cache
|
||||
cp version-cache Version.h
|
||||
|
||||
.PHONY: version-cache
|
||||
|
@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#ifndef GRID_ALGORITHMS_H
|
||||
#define GRID_ALGORITHMS_H
|
||||
|
||||
NAMESPACE_CHECK(algorithms);
|
||||
#include <Grid/algorithms/SparseMatrix.h>
|
||||
#include <Grid/algorithms/LinearOperator.h>
|
||||
#include <Grid/algorithms/Preconditioner.h>
|
||||
NAMESPACE_CHECK(SparseMatrix);
|
||||
|
||||
#include <Grid/algorithms/approx/Zolotarev.h>
|
||||
#include <Grid/algorithms/approx/Chebyshev.h>
|
||||
@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/algorithms/approx/Forecast.h>
|
||||
#include <Grid/algorithms/approx/RemezGeneral.h>
|
||||
#include <Grid/algorithms/approx/ZMobius.h>
|
||||
|
||||
NAMESPACE_CHECK(approx);
|
||||
#include <Grid/algorithms/iterative/Deflation.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||
NAMESPACE_CHECK(ConjGrad);
|
||||
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
||||
NAMESPACE_CHECK(BiCGSTAB);
|
||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
||||
#include <Grid/algorithms/iterative/NormalEquations.h>
|
||||
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
||||
@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||
#include <Grid/algorithms/iterative/PowerMethod.h>
|
||||
|
||||
NAMESPACE_CHECK(PowerMethod);
|
||||
#include <Grid/algorithms/CoarsenedMatrix.h>
|
||||
NAMESPACE_CHECK(CoarsendMatrix);
|
||||
#include <Grid/algorithms/FFT.h>
|
||||
|
||||
#endif
|
||||
|
@ -1,14 +1,3 @@
|
||||
// blockZaxpy in bockPromote - 3s, 5%
|
||||
// noncoalesced linalg in Preconditionoer ~ 3s 5%
|
||||
// Lancos tuning or replace 10-20s ~ 25%, open ended
|
||||
// setup tuning 5s ~ 8%
|
||||
// -- e.g. ordermin, orderstep tunables.
|
||||
// MdagM path without norm in LinOp code. few seconds
|
||||
|
||||
// Mdir calc blocking kernels
|
||||
// Fuse kernels in blockMaskedInnerProduct
|
||||
// preallocate Vectors in Cayley 5D ~ few percent few seconds
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -91,35 +80,8 @@ public:
|
||||
}
|
||||
directions [2*_d]=0;
|
||||
displacements[2*_d]=0;
|
||||
|
||||
//// report back
|
||||
std::cout<<GridLogMessage<<"directions :";
|
||||
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
|
||||
std::cout<<std::endl;
|
||||
std::cout<<GridLogMessage<<"displacements :";
|
||||
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
|
||||
/*
|
||||
// Original cleaner code
|
||||
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
|
||||
for(int d=0;d<dimension;d++){
|
||||
directions[2*d ] = d;
|
||||
directions[2*d+1] = d;
|
||||
displacements[2*d ] = +1;
|
||||
displacements[2*d+1] = -1;
|
||||
}
|
||||
directions [2*dimension]=0;
|
||||
displacements[2*dimension]=0;
|
||||
}
|
||||
std::vector<int> GetDelta(int point) {
|
||||
std::vector<int> delta(dimension,0);
|
||||
delta[directions[point]] = displacements[point];
|
||||
return delta;
|
||||
};
|
||||
*/
|
||||
|
||||
};
|
||||
|
||||
template<class Fobj,class CComplex,int nbasis>
|
||||
@ -149,24 +111,6 @@ public:
|
||||
CoarseScalar InnerProd(CoarseGrid);
|
||||
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
|
||||
blockOrthogonalise(InnerProd,subspace);
|
||||
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
|
||||
// blockOrthogonalise(InnerProd,subspace);
|
||||
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
|
||||
// CheckOrthogonal();
|
||||
}
|
||||
void CheckOrthogonal(void){
|
||||
CoarseVector iProj(CoarseGrid);
|
||||
CoarseVector eProj(CoarseGrid);
|
||||
for(int i=0;i<nbasis;i++){
|
||||
blockProject(iProj,subspace[i],subspace);
|
||||
eProj=Zero();
|
||||
accelerator_for(ss, CoarseGrid->oSites(),1,{
|
||||
eProj[ss](i)=CComplex(1.0);
|
||||
});
|
||||
eProj=eProj - iProj;
|
||||
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
||||
}
|
||||
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
||||
blockProject(CoarseVec,FineVec,subspace);
|
||||
@ -175,11 +119,6 @@ public:
|
||||
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
||||
blockPromote(CoarseVec,FineVec,subspace);
|
||||
}
|
||||
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
||||
for(int i=0;i<nbasis;i++){
|
||||
random(RNG,subspace[i]);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||
|
||||
@ -218,7 +157,7 @@ public:
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#if 1
|
||||
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
@ -280,10 +219,10 @@ public:
|
||||
|
||||
hermop.HermOp(*Tn,y);
|
||||
|
||||
auto y_v = y.View();
|
||||
auto Tn_v = Tn->View();
|
||||
auto Tnp_v = Tnp->View();
|
||||
auto Tnm_v = Tnm->View();
|
||||
autoView( y_v , y, AcceleratorWrite);
|
||||
autoView( Tn_v , (*Tn), AcceleratorWrite);
|
||||
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
|
||||
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
||||
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||
@ -313,201 +252,6 @@ public:
|
||||
}
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField combined(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
#define FILTERb(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
Cheb(hermop,noise,Mn); \
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||
subspace[b] = Mn; \
|
||||
hermop.Op(Mn,tmp); \
|
||||
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
|
||||
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
|
||||
|
||||
RealD alpha=-0.8;
|
||||
RealD beta =-0.8;
|
||||
#define FILTER(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
|
||||
Cheb(hermop,noise,Mn); \
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||
subspace[b] = Mn; \
|
||||
hermop.Op(Mn,tmp); \
|
||||
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
|
||||
#define FILTERc(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
Cheb(hermop,noise,combined); \
|
||||
}
|
||||
|
||||
double node = 0.000;
|
||||
FILTERb(lo,hi,orderfilter);// 0
|
||||
// FILTERc(node,hi,51);// 0
|
||||
noise = Mn;
|
||||
int base = 0;
|
||||
int mult = 100;
|
||||
FILTER(node,hi,base+1*mult);
|
||||
FILTER(node,hi,base+2*mult);
|
||||
FILTER(node,hi,base+3*mult);
|
||||
FILTER(node,hi,base+4*mult);
|
||||
FILTER(node,hi,base+5*mult);
|
||||
FILTER(node,hi,base+6*mult);
|
||||
FILTER(node,hi,base+7*mult);
|
||||
FILTER(node,hi,base+8*mult);
|
||||
FILTER(node,hi,base+9*mult);
|
||||
FILTER(node,hi,base+10*mult);
|
||||
FILTER(node,hi,base+11*mult);
|
||||
FILTER(node,hi,base+12*mult);
|
||||
FILTER(node,hi,base+13*mult);
|
||||
FILTER(node,hi,base+14*mult);
|
||||
FILTER(node,hi,base+15*mult);
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField combined(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
{
|
||||
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
|
||||
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
|
||||
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
|
||||
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
|
||||
JacobiPoly(hermop,noise,Mn);
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
b++;
|
||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||
// subspace[b] = tmp; b++;
|
||||
// }
|
||||
}
|
||||
|
||||
#define FILTER(lambda) \
|
||||
{ \
|
||||
hermop.HermOp(subspace[0],tmp); \
|
||||
tmp = tmp - lambda *subspace[0]; \
|
||||
scale = std::pow(norm2(tmp),-0.5); \
|
||||
tmp=tmp*scale; \
|
||||
subspace[b] = tmp; \
|
||||
hermop.Op(subspace[b],tmp); \
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||
// subspace[b] = tmp; b++;
|
||||
// }
|
||||
|
||||
FILTER(2.0e-5);
|
||||
FILTER(2.0e-4);
|
||||
FILTER(4.0e-4);
|
||||
FILTER(8.0e-4);
|
||||
FILTER(8.0e-4);
|
||||
|
||||
FILTER(2.0e-3);
|
||||
FILTER(3.0e-3);
|
||||
FILTER(4.0e-3);
|
||||
FILTER(5.0e-3);
|
||||
FILTER(6.0e-3);
|
||||
|
||||
FILTER(2.5e-3);
|
||||
FILTER(3.5e-3);
|
||||
FILTER(4.5e-3);
|
||||
FILTER(5.5e-3);
|
||||
FILTER(6.5e-3);
|
||||
|
||||
// FILTER(6.0e-5);//6
|
||||
// FILTER(7.0e-5);//8
|
||||
// FILTER(8.0e-5);//9
|
||||
// FILTER(9.0e-5);//3
|
||||
|
||||
/*
|
||||
// FILTER(1.0e-4);//10
|
||||
FILTER(2.0e-4);//11
|
||||
// FILTER(3.0e-4);//12
|
||||
// FILTER(4.0e-4);//13
|
||||
FILTER(5.0e-4);//14
|
||||
|
||||
FILTER(6.0e-3);//4
|
||||
FILTER(7.0e-4);//1
|
||||
FILTER(8.0e-4);//7
|
||||
FILTER(9.0e-4);//15
|
||||
FILTER(1.0e-3);//2
|
||||
|
||||
FILTER(2.0e-3);//2
|
||||
FILTER(3.0e-3);//2
|
||||
FILTER(4.0e-3);//2
|
||||
FILTER(5.0e-3);//2
|
||||
FILTER(6.0e-3);//2
|
||||
|
||||
FILTER(7.0e-3);//2
|
||||
FILTER(8.0e-3);//2
|
||||
FILTER(1.0e-2);//2
|
||||
*/
|
||||
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
};
|
||||
|
||||
@ -549,13 +293,13 @@ public:
|
||||
SimpleCompressor<siteVector> compressor;
|
||||
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@ -572,24 +316,25 @@ public:
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
int lane=SIMTlane(Nsimd);
|
||||
for(int point=0;point<geom.npoint;point++){
|
||||
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local) {
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||
} else {
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
||||
}
|
||||
synchronise();
|
||||
acceleratorSynchronise();
|
||||
|
||||
for(int bb=0;bb<nbasis;bb++) {
|
||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||
}
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res,lane);
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
};
|
||||
|
||||
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||
@ -617,11 +362,11 @@ public:
|
||||
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
auto out_v = out.View();
|
||||
auto in_v = in.View();
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||
@ -635,45 +380,21 @@ public:
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
int lane=SIMTlane(Nsimd);
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local) {
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||
} else {
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
||||
}
|
||||
synchronise();
|
||||
acceleratorSynchronise();
|
||||
|
||||
for(int bb=0;bb<nbasis;bb++) {
|
||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res,lane);
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
#if 0
|
||||
accelerator_for(ss,Grid()->oSites(),1,{
|
||||
|
||||
siteVector res = Zero();
|
||||
siteVector nbr;
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local&&SE->_permute) {
|
||||
permute(nbr,in_v[SE->_offset],ptype);
|
||||
} else if(SE->_is_local) {
|
||||
nbr = in_v[SE->_offset];
|
||||
} else {
|
||||
nbr = Stencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
synchronise();
|
||||
|
||||
res = res + Aview_p[point][ss]*nbr;
|
||||
|
||||
out_v[ss]=res;
|
||||
});
|
||||
#endif
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||
{
|
||||
@ -841,10 +562,10 @@ public:
|
||||
|
||||
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
||||
|
||||
auto iZProj_v = iZProj.View() ;
|
||||
auto oZProj_v = oZProj.View() ;
|
||||
auto A_p = A[p].View();
|
||||
auto A_self = A[self_stencil].View();
|
||||
autoView( iZProj_v , iZProj, AcceleratorRead) ;
|
||||
autoView( oZProj_v , oZProj, AcceleratorRead) ;
|
||||
autoView( A_p , A[p], AcceleratorWrite);
|
||||
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||
|
||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||
|
||||
@ -860,11 +581,11 @@ public:
|
||||
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
||||
|
||||
{
|
||||
auto tmp_ = tmp.View();
|
||||
auto evenmask_ = evenmask.View();
|
||||
auto oddmask_ = oddmask.View();
|
||||
auto Mphie_ = Mphie.View();
|
||||
auto Mphio_ = Mphio.View();
|
||||
autoView( tmp_ , tmp, AcceleratorWrite);
|
||||
autoView( evenmask_ , evenmask, AcceleratorRead);
|
||||
autoView( oddmask_ , oddmask, AcceleratorRead);
|
||||
autoView( Mphie_ , Mphie, AcceleratorRead);
|
||||
autoView( Mphio_ , Mphio, AcceleratorRead);
|
||||
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
||||
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
||||
});
|
||||
@ -872,8 +593,8 @@ public:
|
||||
|
||||
blockProject(SelfProj,tmp,Subspace.subspace);
|
||||
|
||||
auto SelfProj_ = SelfProj.View();
|
||||
auto A_self = A[self_stencil].View();
|
||||
autoView( SelfProj_ , SelfProj, AcceleratorRead);
|
||||
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||
|
||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
||||
for(int j=0;j<nbasis;j++){
|
||||
@ -887,33 +608,8 @@ public:
|
||||
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
||||
ForceHermitian();
|
||||
}
|
||||
// AssertHermitian();
|
||||
// ForceDiagonal();
|
||||
}
|
||||
|
||||
#if 0
|
||||
///////////////////////////
|
||||
// test code worth preserving in if block
|
||||
///////////////////////////
|
||||
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
|
||||
std::cout<<GridLogMessage<< A[p] << std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
|
||||
|
||||
phi=Subspace.subspace[0];
|
||||
std::vector<int> bc(FineGrid->_ndimension,0);
|
||||
|
||||
blockPick(Grid(),phi,tmp,bc); // Pick out a block
|
||||
linop.Op(tmp,Mphi); // Apply big dop
|
||||
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
|
||||
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
|
||||
std::cout<<GridLogMessage<< iProj <<std::endl;
|
||||
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
void ForceHermitian(void) {
|
||||
CoarseMatrix Diff (Grid());
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
@ -933,27 +629,6 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
void AssertHermitian(void) {
|
||||
CoarseMatrix AA (Grid());
|
||||
CoarseMatrix AAc (Grid());
|
||||
CoarseMatrix Diff (Grid());
|
||||
for(int d=0;d<4;d++){
|
||||
|
||||
int dd=d+1;
|
||||
AAc = Cshift(A[2*d+1],dd,1);
|
||||
AA = A[2*d];
|
||||
|
||||
Diff = AA - adj(AAc);
|
||||
|
||||
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
|
||||
|
||||
}
|
||||
Diff = A[8] - adj(A[8]);
|
||||
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class scalar> struct FFTW { };
|
||||
@ -191,7 +189,7 @@ public:
|
||||
typedef typename sobj::scalar_type scalar;
|
||||
|
||||
Lattice<sobj> pgbuf(&pencil_g);
|
||||
auto pgbuf_v = pgbuf.View();
|
||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||
|
||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||
@ -232,15 +230,18 @@ public:
|
||||
result = source;
|
||||
int pc = processor_coor[dim];
|
||||
for(int p=0;p<processors[dim];p++) {
|
||||
{
|
||||
autoView(r_v,result,CpuRead);
|
||||
autoView(p_v,pgbuf,CpuWrite);
|
||||
thread_for(idx, sgrid->lSites(),{
|
||||
Coordinate cbuf(Nd);
|
||||
sobj s;
|
||||
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
||||
peekLocalSite(s,result,cbuf);
|
||||
peekLocalSite(s,r_v,cbuf);
|
||||
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
||||
// cbuf[dim]+=p*L;
|
||||
pokeLocalSite(s,pgbuf,cbuf);
|
||||
pokeLocalSite(s,p_v,cbuf);
|
||||
});
|
||||
}
|
||||
if (p != processors[dim] - 1) {
|
||||
result = Cshift(result,dim,L);
|
||||
}
|
||||
@ -269,15 +270,19 @@ public:
|
||||
flops+= flops_call*NN;
|
||||
|
||||
// writing out result
|
||||
{
|
||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||
autoView(result_v,result,CpuWrite);
|
||||
thread_for(idx,sgrid->lSites(),{
|
||||
Coordinate clbuf(Nd), cgbuf(Nd);
|
||||
sobj s;
|
||||
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
||||
cgbuf = clbuf;
|
||||
cgbuf[dim] = clbuf[dim]+L*pc;
|
||||
peekLocalSite(s,pgbuf,cgbuf);
|
||||
pokeLocalSite(s,result,clbuf);
|
||||
peekLocalSite(s,pgbuf_v,cgbuf);
|
||||
pokeLocalSite(s,result_v,clbuf);
|
||||
});
|
||||
}
|
||||
result = result*div;
|
||||
|
||||
// destroying plan
|
||||
|
@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
|
||||
|
||||
LinearCombTimer.Start();
|
||||
bo = beta * omega;
|
||||
auto p_v = p.View();
|
||||
auto r_v = r.View();
|
||||
auto v_v = v.View();
|
||||
{
|
||||
autoView( p_v , p, AcceleratorWrite);
|
||||
autoView( r_v , r, AcceleratorRead);
|
||||
autoView( v_v , v, AcceleratorRead);
|
||||
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
||||
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
||||
});
|
||||
}
|
||||
LinearCombTimer.Stop();
|
||||
LinalgTimer.Stop();
|
||||
|
||||
@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
|
||||
alpha = rho / Calpha.real();
|
||||
|
||||
LinearCombTimer.Start();
|
||||
auto h_v = h.View();
|
||||
auto psi_v = psi.View();
|
||||
{
|
||||
autoView( p_v , p, AcceleratorRead);
|
||||
autoView( r_v , r, AcceleratorRead);
|
||||
autoView( v_v , v, AcceleratorRead);
|
||||
autoView( psi_v,psi, AcceleratorRead);
|
||||
autoView( h_v , h, AcceleratorWrite);
|
||||
autoView( s_v , s, AcceleratorWrite);
|
||||
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
||||
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
||||
});
|
||||
|
||||
auto s_v = s.View();
|
||||
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
||||
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
||||
});
|
||||
}
|
||||
LinearCombTimer.Stop();
|
||||
LinalgTimer.Stop();
|
||||
|
||||
@ -166,11 +172,17 @@ class BiCGSTAB : public OperatorFunction<Field>
|
||||
omega = Comega.real() / norm2(t);
|
||||
|
||||
LinearCombTimer.Start();
|
||||
auto t_v = t.View();
|
||||
{
|
||||
autoView( psi_v,psi, AcceleratorWrite);
|
||||
autoView( r_v , r, AcceleratorWrite);
|
||||
autoView( h_v , h, AcceleratorRead);
|
||||
autoView( s_v , s, AcceleratorRead);
|
||||
autoView( t_v , t, AcceleratorRead);
|
||||
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
||||
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
||||
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
||||
});
|
||||
}
|
||||
LinearCombTimer.Stop();
|
||||
|
||||
cp = norm2(r);
|
||||
|
@ -140,13 +140,15 @@ public:
|
||||
b = cp / c;
|
||||
|
||||
LinearCombTimer.Start();
|
||||
auto psi_v = psi.View();
|
||||
auto p_v = p.View();
|
||||
auto r_v = r.View();
|
||||
{
|
||||
autoView( psi_v , psi, AcceleratorWrite);
|
||||
autoView( p_v , p, AcceleratorWrite);
|
||||
autoView( r_v , r, AcceleratorWrite);
|
||||
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
||||
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
||||
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
||||
});
|
||||
}
|
||||
LinearCombTimer.Stop();
|
||||
LinalgTimer.Stop();
|
||||
|
||||
|
@ -0,0 +1,241 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_PREC_GCR_NON_HERM_H
|
||||
#define GRID_PREC_GCR_NON_HERM_H
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//VPGCR Abe and Zhang, 2005.
|
||||
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
|
||||
//Computing and Information Volume 2, Number 2, Pages 147-161
|
||||
//NB. Likely not original reference since they are focussing on a preconditioner variant.
|
||||
// but VPGCR was nicely written up in their paper
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
|
||||
|
||||
template<class Field>
|
||||
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
|
||||
public:
|
||||
|
||||
RealD Tolerance;
|
||||
Integer MaxIterations;
|
||||
int verbose;
|
||||
int mmax;
|
||||
int nstep;
|
||||
int steps;
|
||||
int level;
|
||||
GridStopWatch PrecTimer;
|
||||
GridStopWatch MatTimer;
|
||||
GridStopWatch LinalgTimer;
|
||||
|
||||
LinearFunction<Field> &Preconditioner;
|
||||
LinearOperatorBase<Field> &Linop;
|
||||
|
||||
void Level(int lv) { level=lv; };
|
||||
|
||||
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
|
||||
Tolerance(tol),
|
||||
MaxIterations(maxit),
|
||||
Linop(_Linop),
|
||||
Preconditioner(Prec),
|
||||
mmax(_mmax),
|
||||
nstep(_nstep)
|
||||
{
|
||||
level=1;
|
||||
verbose=1;
|
||||
};
|
||||
|
||||
void operator() (const Field &src, Field &psi){
|
||||
|
||||
psi=Zero();
|
||||
RealD cp, ssq,rsq;
|
||||
ssq=norm2(src);
|
||||
rsq=Tolerance*Tolerance*ssq;
|
||||
|
||||
Field r(src.Grid());
|
||||
|
||||
PrecTimer.Reset();
|
||||
MatTimer.Reset();
|
||||
LinalgTimer.Reset();
|
||||
|
||||
GridStopWatch SolverTimer;
|
||||
SolverTimer.Start();
|
||||
|
||||
steps=0;
|
||||
for(int k=0;k<MaxIterations;k++){
|
||||
|
||||
cp=GCRnStep(src,psi,rsq);
|
||||
|
||||
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
|
||||
|
||||
if(cp<rsq) {
|
||||
|
||||
SolverTimer.Stop();
|
||||
|
||||
Linop.Op(psi,r);
|
||||
axpy(r,-1.0,src,r);
|
||||
RealD tr = norm2(r);
|
||||
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
|
||||
<< " computed residual "<<sqrt(cp/ssq)
|
||||
<< " true residual " <<sqrt(tr/ssq)
|
||||
<< " target " <<Tolerance <<std::endl;
|
||||
|
||||
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
|
||||
// assert(0);
|
||||
}
|
||||
|
||||
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
|
||||
|
||||
RealD cp;
|
||||
ComplexD a, b, zAz;
|
||||
RealD zAAz;
|
||||
ComplexD rq;
|
||||
|
||||
GridBase *grid = src.Grid();
|
||||
|
||||
Field r(grid);
|
||||
Field z(grid);
|
||||
Field tmp(grid);
|
||||
Field ttmp(grid);
|
||||
Field Az(grid);
|
||||
|
||||
////////////////////////////////
|
||||
// history for flexible orthog
|
||||
////////////////////////////////
|
||||
std::vector<Field> q(mmax,grid);
|
||||
std::vector<Field> p(mmax,grid);
|
||||
std::vector<RealD> qq(mmax);
|
||||
|
||||
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
|
||||
|
||||
//////////////////////////////////
|
||||
// initial guess x0 is taken as nonzero.
|
||||
// r0=src-A x0 = src
|
||||
//////////////////////////////////
|
||||
MatTimer.Start();
|
||||
Linop.Op(psi,Az);
|
||||
zAz = innerProduct(Az,psi);
|
||||
zAAz= norm2(Az);
|
||||
MatTimer.Stop();
|
||||
|
||||
|
||||
LinalgTimer.Start();
|
||||
r=src-Az;
|
||||
LinalgTimer.Stop();
|
||||
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
|
||||
|
||||
/////////////////////
|
||||
// p = Prec(r)
|
||||
/////////////////////
|
||||
|
||||
PrecTimer.Start();
|
||||
Preconditioner(r,z);
|
||||
PrecTimer.Stop();
|
||||
|
||||
MatTimer.Start();
|
||||
Linop.Op(z,Az);
|
||||
MatTimer.Stop();
|
||||
|
||||
LinalgTimer.Start();
|
||||
|
||||
zAz = innerProduct(Az,psi);
|
||||
zAAz= norm2(Az);
|
||||
|
||||
//p[0],q[0],qq[0]
|
||||
p[0]= z;
|
||||
q[0]= Az;
|
||||
qq[0]= zAAz;
|
||||
|
||||
cp =norm2(r);
|
||||
LinalgTimer.Stop();
|
||||
|
||||
for(int k=0;k<nstep;k++){
|
||||
|
||||
steps++;
|
||||
|
||||
int kp = k+1;
|
||||
int peri_k = k %mmax;
|
||||
int peri_kp= kp%mmax;
|
||||
|
||||
LinalgTimer.Start();
|
||||
rq= innerProduct(q[peri_k],r); // what if rAr not real?
|
||||
a = rq/qq[peri_k];
|
||||
|
||||
axpy(psi,a,p[peri_k],psi);
|
||||
|
||||
cp = axpy_norm(r,-a,q[peri_k],r);
|
||||
LinalgTimer.Stop();
|
||||
|
||||
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
|
||||
|
||||
if((k==nstep-1)||(cp<rsq)){
|
||||
return cp;
|
||||
}
|
||||
|
||||
|
||||
PrecTimer.Start();
|
||||
Preconditioner(r,z);// solve Az = r
|
||||
PrecTimer.Stop();
|
||||
|
||||
MatTimer.Start();
|
||||
Linop.Op(z,Az);
|
||||
MatTimer.Stop();
|
||||
zAz = innerProduct(Az,psi);
|
||||
zAAz= norm2(Az);
|
||||
|
||||
LinalgTimer.Start();
|
||||
|
||||
q[peri_kp]=Az;
|
||||
p[peri_kp]=z;
|
||||
|
||||
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
|
||||
for(int back=0;back<northog;back++){
|
||||
|
||||
int peri_back=(k-back)%mmax; assert((k-back)>=0);
|
||||
|
||||
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
|
||||
p[peri_kp]=p[peri_kp]+b*p[peri_back];
|
||||
q[peri_kp]=q[peri_kp]+b*q[peri_back];
|
||||
|
||||
}
|
||||
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
|
||||
LinalgTimer.Stop();
|
||||
}
|
||||
assert(0); // never reached
|
||||
return cp;
|
||||
}
|
||||
};
|
||||
NAMESPACE_END(Grid);
|
||||
#endif
|
@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid);
|
||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||
bool MemoryProfiler::debug = false;
|
||||
|
||||
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
|
||||
#ifdef GRID_CUDA
|
||||
int PointerCache::Ncache = 32;
|
||||
#else
|
||||
int PointerCache::Ncache = 8;
|
||||
#endif
|
||||
int PointerCache::Victim;
|
||||
int PointerCache::VictimSmall;
|
||||
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
|
||||
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
|
||||
|
||||
void PointerCache::Init(void)
|
||||
{
|
||||
char * str;
|
||||
|
||||
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
||||
if ( str ) Ncache = atoi(str);
|
||||
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
|
||||
|
||||
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||
if ( str ) NcacheSmall = atoi(str);
|
||||
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
|
||||
|
||||
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
|
||||
}
|
||||
void *PointerCache::Insert(void *ptr,size_t bytes)
|
||||
{
|
||||
if (bytes < GRID_ALLOC_SMALL_LIMIT )
|
||||
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
|
||||
return Insert(ptr,bytes,Entries,Ncache,Victim);
|
||||
}
|
||||
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
#endif
|
||||
|
||||
void * ret = NULL;
|
||||
int v = -1;
|
||||
|
||||
for(int e=0;e<ncache;e++) {
|
||||
if ( entries[e].valid==0 ) {
|
||||
v=e;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( v==-1 ) {
|
||||
v=victim;
|
||||
victim = (victim+1)%ncache;
|
||||
}
|
||||
|
||||
if ( entries[v].valid ) {
|
||||
ret = entries[v].address;
|
||||
entries[v].valid = 0;
|
||||
entries[v].address = NULL;
|
||||
entries[v].bytes = 0;
|
||||
}
|
||||
|
||||
entries[v].address=ptr;
|
||||
entries[v].bytes =bytes;
|
||||
entries[v].valid =1;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *PointerCache::Lookup(size_t bytes)
|
||||
{
|
||||
if (bytes < GRID_ALLOC_SMALL_LIMIT )
|
||||
return Lookup(bytes,EntriesSmall,NcacheSmall);
|
||||
return Lookup(bytes,Entries,Ncache);
|
||||
}
|
||||
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
#endif
|
||||
for(int e=0;e<ncache;e++){
|
||||
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||
entries[e].valid = 0;
|
||||
return entries[e].address;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
{
|
||||
#ifdef __linux__
|
||||
|
@ -26,129 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ALIGNED_ALLOCATOR_H
|
||||
#define GRID_ALIGNED_ALLOCATOR_H
|
||||
|
||||
#ifdef HAVE_MALLOC_MALLOC_H
|
||||
#include <malloc/malloc.h>
|
||||
#endif
|
||||
#ifdef HAVE_MALLOC_H
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
#include <mm_malloc.h>
|
||||
#endif
|
||||
|
||||
#define POINTER_CACHE
|
||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// Move control to configure.ac and Config.h?
|
||||
|
||||
class PointerCache {
|
||||
private:
|
||||
/*Pinning pages is costly*/
|
||||
/*Could maintain separate large and small allocation caches*/
|
||||
/* Could make these configurable, perhaps up to a max size*/
|
||||
static const int NcacheSmallMax=128;
|
||||
static const int NcacheMax=16;
|
||||
static int NcacheSmall;
|
||||
static int Ncache;
|
||||
|
||||
typedef struct {
|
||||
void *address;
|
||||
size_t bytes;
|
||||
int valid;
|
||||
} PointerCacheEntry;
|
||||
|
||||
static PointerCacheEntry Entries[NcacheMax];
|
||||
static int Victim;
|
||||
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
|
||||
static int VictimSmall;
|
||||
|
||||
public:
|
||||
static void Init(void);
|
||||
static void *Insert(void *ptr,size_t bytes) ;
|
||||
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
|
||||
static void *Lookup(size_t bytes) ;
|
||||
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
|
||||
};
|
||||
|
||||
std::string sizeString(size_t bytes);
|
||||
|
||||
struct MemoryStats
|
||||
{
|
||||
size_t totalAllocated{0}, maxAllocated{0},
|
||||
currentlyAllocated{0}, totalFreed{0};
|
||||
};
|
||||
|
||||
class MemoryProfiler
|
||||
{
|
||||
public:
|
||||
static MemoryStats *stats;
|
||||
static bool debug;
|
||||
};
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
#define profilerCudaMeminfo \
|
||||
{ size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
|
||||
#else
|
||||
#define profilerCudaMeminfo
|
||||
#endif
|
||||
|
||||
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
|
||||
#define profilerDebugPrint \
|
||||
if (MemoryProfiler::stats) \
|
||||
{ \
|
||||
auto s = MemoryProfiler::stats; \
|
||||
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
|
||||
<< std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
|
||||
<< std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
|
||||
<< std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
|
||||
<< std::endl; \
|
||||
} \
|
||||
profilerCudaMeminfo;
|
||||
|
||||
#define profilerAllocate(bytes) \
|
||||
if (MemoryProfiler::stats) \
|
||||
{ \
|
||||
auto s = MemoryProfiler::stats; \
|
||||
s->totalAllocated += (bytes); \
|
||||
s->currentlyAllocated += (bytes); \
|
||||
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
|
||||
} \
|
||||
if (MemoryProfiler::debug) \
|
||||
{ \
|
||||
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
|
||||
profilerDebugPrint; \
|
||||
}
|
||||
|
||||
#define profilerFree(bytes) \
|
||||
if (MemoryProfiler::stats) \
|
||||
{ \
|
||||
auto s = MemoryProfiler::stats; \
|
||||
s->totalFreed += (bytes); \
|
||||
s->currentlyAllocated -= (bytes); \
|
||||
} \
|
||||
if (MemoryProfiler::debug) \
|
||||
{ \
|
||||
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
|
||||
profilerDebugPrint; \
|
||||
}
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// A lattice of something, but assume the something is SIMDized.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename _Tp>
|
||||
class alignedAllocator {
|
||||
public:
|
||||
@ -172,89 +53,122 @@ public:
|
||||
{
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
profilerAllocate(bytes);
|
||||
|
||||
|
||||
#ifdef POINTER_CACHE
|
||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||
#else
|
||||
pointer ptr = nullptr;
|
||||
#endif
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
////////////////////////////////////
|
||||
// Unified (managed) memory
|
||||
////////////////////////////////////
|
||||
if ( ptr == (_Tp *) NULL ) {
|
||||
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
|
||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (_Tp *) NULL;
|
||||
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
assert( ptr != (_Tp *)NULL);
|
||||
#else
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// 2MB align; could make option probably doesn't need configurability
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
||||
#else
|
||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
||||
#endif
|
||||
assert( ptr != (_Tp *)NULL);
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// First touch optimise in threaded loop
|
||||
//////////////////////////////////////////////////
|
||||
uint64_t *cp = (uint64_t *)ptr;
|
||||
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
|
||||
cp[n]=0;
|
||||
});
|
||||
#endif
|
||||
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void deallocate(pointer __p, size_type __n) {
|
||||
void deallocate(pointer __p, size_type __n)
|
||||
{
|
||||
size_type bytes = __n * sizeof(_Tp);
|
||||
|
||||
profilerFree(bytes);
|
||||
|
||||
#ifdef POINTER_CACHE
|
||||
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
||||
#else
|
||||
pointer __freeme = __p;
|
||||
#endif
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
if ( __freeme ) cudaFree((void *)__freeme);
|
||||
#else
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
if ( __freeme ) _mm_free((void *)__freeme);
|
||||
#else
|
||||
if ( __freeme ) free((void *)__freeme);
|
||||
#endif
|
||||
#endif
|
||||
MemoryManager::CpuFree((void *)__p,bytes);
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor, eventually it must be avoided
|
||||
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||
//void construct(pointer __p, const _Tp& __val) { };
|
||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
||||
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Unified virtual memory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
template<typename _Tp>
|
||||
class uvmAllocator {
|
||||
public:
|
||||
typedef std::size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef _Tp* pointer;
|
||||
typedef const _Tp* const_pointer;
|
||||
typedef _Tp& reference;
|
||||
typedef const _Tp& const_reference;
|
||||
typedef _Tp value_type;
|
||||
|
||||
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
|
||||
uvmAllocator() throw() { }
|
||||
uvmAllocator(const uvmAllocator&) throw() { }
|
||||
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
|
||||
~uvmAllocator() throw() { }
|
||||
pointer address(reference __x) const { return &__x; }
|
||||
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
||||
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
profilerAllocate(bytes);
|
||||
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
|
||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void deallocate(pointer __p, size_type __n)
|
||||
{
|
||||
size_type bytes = __n * sizeof(_Tp);
|
||||
profilerFree(bytes);
|
||||
MemoryManager::SharedFree((void *)__p,bytes);
|
||||
}
|
||||
|
||||
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
|
||||
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Device memory
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<typename _Tp>
|
||||
class devAllocator {
|
||||
public:
|
||||
typedef std::size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef _Tp* pointer;
|
||||
typedef const _Tp* const_pointer;
|
||||
typedef _Tp& reference;
|
||||
typedef const _Tp& const_reference;
|
||||
typedef _Tp value_type;
|
||||
|
||||
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
|
||||
devAllocator() throw() { }
|
||||
devAllocator(const devAllocator&) throw() { }
|
||||
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
|
||||
~devAllocator() throw() { }
|
||||
pointer address(reference __x) const { return &__x; }
|
||||
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
||||
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
profilerAllocate(bytes);
|
||||
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
|
||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void deallocate(pointer __p, size_type __n)
|
||||
{
|
||||
size_type bytes = __n * sizeof(_Tp);
|
||||
profilerFree(bytes);
|
||||
MemoryManager::AcceleratorFree((void *)__p,bytes);
|
||||
}
|
||||
void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
|
||||
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Template typedefs
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T> using commAllocator = alignedAllocator<T>;
|
||||
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
|
||||
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
|
||||
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
|
||||
//template<class T> using commAllocator = devAllocator<T>;
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
||||
|
||||
|
4
Grid/allocator/Allocator.h
Normal file
4
Grid/allocator/Allocator.h
Normal file
@ -0,0 +1,4 @@
|
||||
#pragma once
|
||||
#include <Grid/allocator/MemoryStats.h>
|
||||
#include <Grid/allocator/MemoryManager.h>
|
||||
#include <Grid/allocator/AlignedAllocator.h>
|
254
Grid/allocator/MemoryManager.cc
Normal file
254
Grid/allocator/MemoryManager.cc
Normal file
@ -0,0 +1,254 @@
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*Allocation types, saying which pointer cache should be used*/
|
||||
#define Cpu (0)
|
||||
#define CpuSmall (1)
|
||||
#define Acc (2)
|
||||
#define AccSmall (3)
|
||||
#define Shared (4)
|
||||
#define SharedSmall (5)
|
||||
uint64_t total_shared;
|
||||
uint64_t total_device;
|
||||
uint64_t total_host;;
|
||||
void MemoryManager::PrintBytes(void)
|
||||
{
|
||||
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
|
||||
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
|
||||
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Data tables for recently freed pooiniter caches
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
||||
int MemoryManager::Victim[MemoryManager::NallocType];
|
||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Actual allocation and deallocation utils
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
||||
{
|
||||
void *ptr = (void *) Lookup(bytes,Acc);
|
||||
if ( ptr == (void *) NULL ) {
|
||||
ptr = (void *) acceleratorAllocDevice(bytes);
|
||||
total_device+=bytes;
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
|
||||
{
|
||||
void *__freeme = Insert(ptr,bytes,Acc);
|
||||
if ( __freeme ) {
|
||||
acceleratorFreeDevice(__freeme);
|
||||
total_device-=bytes;
|
||||
// PrintBytes();
|
||||
}
|
||||
}
|
||||
void *MemoryManager::SharedAllocate(size_t bytes)
|
||||
{
|
||||
void *ptr = (void *) Lookup(bytes,Shared);
|
||||
if ( ptr == (void *) NULL ) {
|
||||
ptr = (void *) acceleratorAllocShared(bytes);
|
||||
total_shared+=bytes;
|
||||
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
||||
// PrintBytes();
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void MemoryManager::SharedFree (void *ptr,size_t bytes)
|
||||
{
|
||||
void *__freeme = Insert(ptr,bytes,Shared);
|
||||
if ( __freeme ) {
|
||||
acceleratorFreeShared(__freeme);
|
||||
total_shared-=bytes;
|
||||
// PrintBytes();
|
||||
}
|
||||
}
|
||||
#ifdef GRID_UVM
|
||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||
{
|
||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||
if ( ptr == (void *) NULL ) {
|
||||
ptr = (void *) acceleratorAllocShared(bytes);
|
||||
total_host+=bytes;
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||
{
|
||||
NotifyDeletion(_ptr);
|
||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||
if ( __freeme ) {
|
||||
acceleratorFreeShared(__freeme);
|
||||
total_host-=bytes;
|
||||
}
|
||||
}
|
||||
#else
|
||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||
{
|
||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||
if ( ptr == (void *) NULL ) {
|
||||
ptr = (void *) acceleratorAllocCpu(bytes);
|
||||
total_host+=bytes;
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||
{
|
||||
NotifyDeletion(_ptr);
|
||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||
if ( __freeme ) {
|
||||
acceleratorFreeCpu(__freeme);
|
||||
total_host-=bytes;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////
|
||||
// call only once
|
||||
//////////////////////////////////////////
|
||||
void MemoryManager::Init(void)
|
||||
{
|
||||
|
||||
char * str;
|
||||
int Nc;
|
||||
int NcS;
|
||||
|
||||
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
||||
if ( str ) {
|
||||
Nc = atoi(str);
|
||||
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||
Ncache[Cpu]=Nc;
|
||||
Ncache[Acc]=Nc;
|
||||
Ncache[Shared]=Nc;
|
||||
}
|
||||
}
|
||||
|
||||
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||
if ( str ) {
|
||||
Nc = atoi(str);
|
||||
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||
Ncache[CpuSmall]=Nc;
|
||||
Ncache[AccSmall]=Nc;
|
||||
Ncache[SharedSmall]=Nc;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void MemoryManager::InitMessage(void) {
|
||||
|
||||
#ifndef GRID_UVM
|
||||
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
|
||||
#endif
|
||||
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
|
||||
#ifdef ALLOCATION_CACHE
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
|
||||
#endif
|
||||
|
||||
#ifdef GRID_UVM
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
|
||||
#ifdef GRID_CUDA
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
|
||||
#endif
|
||||
#else
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
|
||||
#ifdef GRID_CUDA
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
|
||||
{
|
||||
#ifdef ALLOCATION_CACHE
|
||||
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
||||
int cache = type + small;
|
||||
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
|
||||
#else
|
||||
return ptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
|
||||
{
|
||||
assert(ncache>0);
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
#endif
|
||||
|
||||
void * ret = NULL;
|
||||
int v = -1;
|
||||
|
||||
for(int e=0;e<ncache;e++) {
|
||||
if ( entries[e].valid==0 ) {
|
||||
v=e;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( v==-1 ) {
|
||||
v=victim;
|
||||
victim = (victim+1)%ncache;
|
||||
}
|
||||
|
||||
if ( entries[v].valid ) {
|
||||
ret = entries[v].address;
|
||||
entries[v].valid = 0;
|
||||
entries[v].address = NULL;
|
||||
entries[v].bytes = 0;
|
||||
}
|
||||
|
||||
entries[v].address=ptr;
|
||||
entries[v].bytes =bytes;
|
||||
entries[v].valid =1;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *MemoryManager::Lookup(size_t bytes,int type)
|
||||
{
|
||||
#ifdef ALLOCATION_CACHE
|
||||
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
||||
int cache = type+small;
|
||||
return Lookup(bytes,Entries[cache],Ncache[cache]);
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
|
||||
{
|
||||
assert(ncache>0);
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
#endif
|
||||
for(int e=0;e<ncache;e++){
|
||||
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||
entries[e].valid = 0;
|
||||
return entries[e].address;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
182
Grid/allocator/MemoryManager.h
Normal file
182
Grid/allocator/MemoryManager.h
Normal file
@ -0,0 +1,182 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/MemoryManager.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
#include <list>
|
||||
#include <unordered_map>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// Move control to configure.ac and Config.h?
|
||||
|
||||
#define ALLOCATION_CACHE
|
||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||
|
||||
/*Pinning pages is costly*/
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Advise the LatticeAccelerator class
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum ViewAdvise {
|
||||
AdviseDefault = 0x0, // Regular data
|
||||
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
|
||||
// significantly influence performance of bulk storage.
|
||||
|
||||
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
|
||||
// enables read-only copies of memory to be kept on
|
||||
// host and device.
|
||||
|
||||
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// View Access Mode
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum ViewMode {
|
||||
AcceleratorRead = 0x01,
|
||||
AcceleratorWrite = 0x02,
|
||||
AcceleratorWriteDiscard = 0x04,
|
||||
CpuRead = 0x08,
|
||||
CpuWrite = 0x10,
|
||||
CpuWriteDiscard = 0x10 // same for now
|
||||
};
|
||||
|
||||
class MemoryManager {
|
||||
private:
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// For caching recently freed allocations
|
||||
////////////////////////////////////////////////////////////
|
||||
typedef struct {
|
||||
void *address;
|
||||
size_t bytes;
|
||||
int valid;
|
||||
} AllocationCacheEntry;
|
||||
|
||||
static const int NallocCacheMax=128;
|
||||
static const int NallocType=6;
|
||||
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
|
||||
static int Victim[NallocType];
|
||||
static int Ncache[NallocType];
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
// Free pool
|
||||
/////////////////////////////////////////////////
|
||||
static void *Insert(void *ptr,size_t bytes,int type) ;
|
||||
static void *Lookup(size_t bytes,int type) ;
|
||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
||||
|
||||
static void PrintBytes(void);
|
||||
public:
|
||||
static void Init(void);
|
||||
static void InitMessage(void);
|
||||
static void *AcceleratorAllocate(size_t bytes);
|
||||
static void AcceleratorFree (void *ptr,size_t bytes);
|
||||
static void *SharedAllocate(size_t bytes);
|
||||
static void SharedFree (void *ptr,size_t bytes);
|
||||
static void *CpuAllocate(size_t bytes);
|
||||
static void CpuFree (void *ptr,size_t bytes);
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Footprint tracking
|
||||
////////////////////////////////////////////////////////
|
||||
static uint64_t DeviceBytes;
|
||||
static uint64_t DeviceLRUBytes;
|
||||
static uint64_t DeviceMaxBytes;
|
||||
static uint64_t HostToDeviceBytes;
|
||||
static uint64_t DeviceToHostBytes;
|
||||
static uint64_t HostToDeviceXfer;
|
||||
static uint64_t DeviceToHostXfer;
|
||||
|
||||
private:
|
||||
#ifndef GRID_UVM
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Data tables for ViewCache
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
typedef std::list<uint64_t> LRU_t;
|
||||
typedef typename LRU_t::iterator LRUiterator;
|
||||
typedef struct {
|
||||
int LRU_valid;
|
||||
LRUiterator LRU_entry;
|
||||
uint64_t CpuPtr;
|
||||
uint64_t AccPtr;
|
||||
size_t bytes;
|
||||
uint32_t transient;
|
||||
uint32_t state;
|
||||
uint32_t accLock;
|
||||
uint32_t cpuLock;
|
||||
} AcceleratorViewEntry;
|
||||
|
||||
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
|
||||
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
|
||||
|
||||
static AccViewTable_t AccViewTable;
|
||||
static LRU_t LRU;
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
// Device motion
|
||||
/////////////////////////////////////////////////
|
||||
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
|
||||
static void Evict(AcceleratorViewEntry &AccCache);
|
||||
static void Flush(AcceleratorViewEntry &AccCache);
|
||||
static void Clone(AcceleratorViewEntry &AccCache);
|
||||
static void AccDiscard(AcceleratorViewEntry &AccCache);
|
||||
static void CpuDiscard(AcceleratorViewEntry &AccCache);
|
||||
|
||||
// static void LRUupdate(AcceleratorViewEntry &AccCache);
|
||||
static void LRUinsert(AcceleratorViewEntry &AccCache);
|
||||
static void LRUremove(AcceleratorViewEntry &AccCache);
|
||||
|
||||
// manage entries in the table
|
||||
static int EntryPresent(uint64_t CpuPtr);
|
||||
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||
static void EntryErase (uint64_t CpuPtr);
|
||||
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
|
||||
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
|
||||
|
||||
static void AcceleratorViewClose(uint64_t AccPtr);
|
||||
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||
static void CpuViewClose(uint64_t Ptr);
|
||||
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||
#endif
|
||||
static void NotifyDeletion(void * CpuPtr);
|
||||
|
||||
public:
|
||||
static void Print(void);
|
||||
static int isOpen (void* CpuPtr);
|
||||
static void ViewClose(void* CpuPtr,ViewMode mode);
|
||||
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
468
Grid/allocator/MemoryManagerCache.cc
Normal file
468
Grid/allocator/MemoryManagerCache.cc
Normal file
@ -0,0 +1,468 @@
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
#ifndef GRID_UVM
|
||||
|
||||
#warning "Using explicit device memory copies"
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
#define dprintf(...)
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// For caching copies of data on device
|
||||
////////////////////////////////////////////////////////////
|
||||
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
|
||||
MemoryManager::LRU_t MemoryManager::LRU;
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Footprint tracking
|
||||
////////////////////////////////////////////////////////
|
||||
uint64_t MemoryManager::DeviceBytes;
|
||||
uint64_t MemoryManager::DeviceLRUBytes;
|
||||
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
|
||||
uint64_t MemoryManager::HostToDeviceBytes;
|
||||
uint64_t MemoryManager::DeviceToHostBytes;
|
||||
uint64_t MemoryManager::HostToDeviceXfer;
|
||||
uint64_t MemoryManager::DeviceToHostXfer;
|
||||
|
||||
////////////////////////////////////
|
||||
// Priority ordering for unlocked entries
|
||||
// Empty
|
||||
// CpuDirty
|
||||
// Consistent
|
||||
// AccDirty
|
||||
////////////////////////////////////
|
||||
#define Empty (0x0) /*Entry unoccupied */
|
||||
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
|
||||
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
|
||||
#define AccDirty (0x4) /*ACC copy is golden */
|
||||
#define EvictNext (0x8) /*Priority for eviction*/
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
// Mechanics of data table maintenance
|
||||
/////////////////////////////////////////////////
|
||||
int MemoryManager::EntryPresent(uint64_t CpuPtr)
|
||||
{
|
||||
if(AccViewTable.empty()) return 0;
|
||||
|
||||
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
|
||||
return count;
|
||||
}
|
||||
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
||||
{
|
||||
assert(!EntryPresent(CpuPtr));
|
||||
AcceleratorViewEntry AccCache;
|
||||
AccCache.CpuPtr = CpuPtr;
|
||||
AccCache.AccPtr = (uint64_t)NULL;
|
||||
AccCache.bytes = bytes;
|
||||
AccCache.state = CpuDirty;
|
||||
AccCache.LRU_valid=0;
|
||||
AccCache.transient=0;
|
||||
AccCache.accLock=0;
|
||||
AccCache.cpuLock=0;
|
||||
AccViewTable[CpuPtr] = AccCache;
|
||||
}
|
||||
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
|
||||
{
|
||||
assert(EntryPresent(CpuPtr));
|
||||
auto AccCacheIterator = AccViewTable.find(CpuPtr);
|
||||
assert(AccCacheIterator!=AccViewTable.end());
|
||||
return AccCacheIterator;
|
||||
}
|
||||
void MemoryManager::EntryErase(uint64_t CpuPtr)
|
||||
{
|
||||
auto AccCache = EntryLookup(CpuPtr);
|
||||
AccViewTable.erase(CpuPtr);
|
||||
}
|
||||
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
assert(AccCache.LRU_valid==0);
|
||||
if (AccCache.transient) {
|
||||
LRU.push_back(AccCache.CpuPtr);
|
||||
AccCache.LRU_entry = --LRU.end();
|
||||
} else {
|
||||
LRU.push_front(AccCache.CpuPtr);
|
||||
AccCache.LRU_entry = LRU.begin();
|
||||
}
|
||||
AccCache.LRU_valid = 1;
|
||||
DeviceLRUBytes+=AccCache.bytes;
|
||||
}
|
||||
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
assert(AccCache.LRU_valid==1);
|
||||
LRU.erase(AccCache.LRU_entry);
|
||||
AccCache.LRU_valid = 0;
|
||||
DeviceLRUBytes-=AccCache.bytes;
|
||||
}
|
||||
/////////////////////////////////////////////////
|
||||
// Accelerator cache motion & consistency logic
|
||||
/////////////////////////////////////////////////
|
||||
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
///////////////////////////////////////////////////////////
|
||||
// Remove from Accelerator, remove entry, without flush
|
||||
// Cannot be locked. If allocated Must be in LRU pool.
|
||||
///////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
// dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
if(AccCache.AccPtr) {
|
||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
// dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
}
|
||||
|
||||
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Make CPU consistent, remove from Accelerator, remove entry
|
||||
// Cannot be locked. If allocated must be in LRU pool.
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
// dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.cpuLock==0);
|
||||
if(AccCache.state==AccDirty) {
|
||||
Flush(AccCache);
|
||||
}
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
if(AccCache.AccPtr) {
|
||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
// dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
}
|
||||
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
assert(AccCache.state==AccDirty);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||
// dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
DeviceToHostBytes+=AccCache.bytes;
|
||||
DeviceToHostXfer++;
|
||||
AccCache.state=Consistent;
|
||||
}
|
||||
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
assert(AccCache.state==CpuDirty);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
if(AccCache.AccPtr==(uint64_t)NULL){
|
||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||
DeviceBytes+=AccCache.bytes;
|
||||
}
|
||||
// dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||
HostToDeviceBytes+=AccCache.bytes;
|
||||
HostToDeviceXfer++;
|
||||
AccCache.state=Consistent;
|
||||
}
|
||||
|
||||
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
assert(AccCache.state!=Empty);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
if(AccCache.AccPtr==(uint64_t)NULL){
|
||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||
DeviceBytes+=AccCache.bytes;
|
||||
}
|
||||
AccCache.state=AccDirty;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// View management
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||
{
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
AcceleratorViewClose((uint64_t)Ptr);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
CpuViewClose((uint64_t)Ptr);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||
} else {
|
||||
assert(0);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||
{
|
||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||
if ( DeviceLRUBytes > 0){
|
||||
assert(LRU.size()>0);
|
||||
uint64_t victim = LRU.back();
|
||||
auto AccCacheIterator = EntryLookup(victim);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
Evict(AccCache);
|
||||
}
|
||||
}
|
||||
}
|
||||
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Find if present, otherwise get or force an empty
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
if ( EntryPresent(CpuPtr)==0 ){
|
||||
EvictVictims(bytes);
|
||||
EntryCreate(CpuPtr,bytes,mode,hint);
|
||||
}
|
||||
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
|
||||
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
||||
|
||||
assert(AccCache.cpuLock==0); // Programming error
|
||||
|
||||
if(AccCache.state!=Empty) {
|
||||
assert(AccCache.CpuPtr == CpuPtr);
|
||||
assert(AccCache.bytes ==bytes);
|
||||
}
|
||||
/*
|
||||
* State transitions and actions
|
||||
*
|
||||
* Action State StateNext Flush Clone
|
||||
*
|
||||
* AccRead Empty Consistent - Y
|
||||
* AccWrite Empty AccDirty - Y
|
||||
* AccRead CpuDirty Consistent - Y
|
||||
* AccWrite CpuDirty AccDirty - Y
|
||||
* AccRead Consistent Consistent - -
|
||||
* AccWrite Consistent AccDirty - -
|
||||
* AccRead AccDirty AccDirty - -
|
||||
* AccWrite AccDirty AccDirty - -
|
||||
*/
|
||||
if(AccCache.state==Empty) {
|
||||
assert(AccCache.LRU_valid==0);
|
||||
AccCache.CpuPtr = CpuPtr;
|
||||
AccCache.AccPtr = (uint64_t)NULL;
|
||||
AccCache.bytes = bytes;
|
||||
AccCache.state = CpuDirty; // Cpu starts primary
|
||||
if(mode==AcceleratorWriteDiscard){
|
||||
CpuDiscard(AccCache);
|
||||
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
|
||||
} else if(mode==AcceleratorWrite){
|
||||
Clone(AccCache);
|
||||
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
|
||||
} else {
|
||||
Clone(AccCache);
|
||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock= 1;
|
||||
} else if(AccCache.state==CpuDirty ){
|
||||
if(mode==AcceleratorWriteDiscard) {
|
||||
CpuDiscard(AccCache);
|
||||
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
|
||||
} else if(mode==AcceleratorWrite) {
|
||||
Clone(AccCache);
|
||||
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
|
||||
} else {
|
||||
Clone(AccCache);
|
||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock++;
|
||||
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
||||
} else if(AccCache.state==Consistent) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||
AccCache.accLock++;
|
||||
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
||||
} else if(AccCache.state==AccDirty) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||
AccCache.accLock++;
|
||||
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// If view is opened on device remove from LRU
|
||||
if(AccCache.LRU_valid==1){
|
||||
// must possibly remove from LRU as now locked on GPU
|
||||
LRUremove(AccCache);
|
||||
}
|
||||
|
||||
int transient =hint;
|
||||
AccCache.transient= transient? EvictNext : 0;
|
||||
|
||||
return AccCache.AccPtr;
|
||||
}
|
||||
////////////////////////////////////
|
||||
// look up & decrement lock count
|
||||
////////////////////////////////////
|
||||
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
||||
{
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.accLock>0);
|
||||
|
||||
AccCache.accLock--;
|
||||
|
||||
// Move to LRU queue if not locked and close on device
|
||||
if(AccCache.accLock==0) {
|
||||
LRUinsert(AccCache);
|
||||
}
|
||||
}
|
||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||
{
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
|
||||
assert(AccCache.cpuLock>0);
|
||||
assert(AccCache.accLock==0);
|
||||
|
||||
AccCache.cpuLock--;
|
||||
}
|
||||
/*
|
||||
* Action State StateNext Flush Clone
|
||||
*
|
||||
* CpuRead Empty CpuDirty - -
|
||||
* CpuWrite Empty CpuDirty - -
|
||||
* CpuRead CpuDirty CpuDirty - -
|
||||
* CpuWrite CpuDirty CpuDirty - -
|
||||
* CpuRead Consistent Consistent - -
|
||||
* CpuWrite Consistent CpuDirty - -
|
||||
* CpuRead AccDirty Consistent Y -
|
||||
* CpuWrite AccDirty CpuDirty Y -
|
||||
*/
|
||||
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Find if present, otherwise get or force an empty
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
if ( EntryPresent(CpuPtr)==0 ){
|
||||
EvictVictims(bytes);
|
||||
EntryCreate(CpuPtr,bytes,mode,transient);
|
||||
}
|
||||
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
|
||||
assert((mode==CpuRead)||(mode==CpuWrite));
|
||||
assert(AccCache.accLock==0); // Programming error
|
||||
|
||||
if(AccCache.state!=Empty) {
|
||||
assert(AccCache.CpuPtr == CpuPtr);
|
||||
assert(AccCache.bytes==bytes);
|
||||
}
|
||||
|
||||
if(AccCache.state==Empty) {
|
||||
AccCache.CpuPtr = CpuPtr;
|
||||
AccCache.AccPtr = (uint64_t)NULL;
|
||||
AccCache.bytes = bytes;
|
||||
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
|
||||
AccCache.accLock= 0;
|
||||
AccCache.cpuLock= 1;
|
||||
} else if(AccCache.state==CpuDirty ){
|
||||
// AccPtr dont care, deferred allocate
|
||||
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
|
||||
AccCache.cpuLock++;
|
||||
} else if(AccCache.state==Consistent) {
|
||||
assert(AccCache.AccPtr != (uint64_t)NULL);
|
||||
if(mode==CpuWrite)
|
||||
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
|
||||
else
|
||||
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
|
||||
AccCache.cpuLock++;
|
||||
} else if(AccCache.state==AccDirty) {
|
||||
assert(AccCache.AccPtr != (uint64_t)NULL);
|
||||
Flush(AccCache);
|
||||
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
|
||||
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
|
||||
AccCache.cpuLock++;
|
||||
} else {
|
||||
assert(0); // should be unreachable
|
||||
}
|
||||
|
||||
AccCache.transient= transient? EvictNext : 0;
|
||||
|
||||
return AccCache.CpuPtr;
|
||||
}
|
||||
void MemoryManager::NotifyDeletion(void *_ptr)
|
||||
{
|
||||
// Look up in ViewCache
|
||||
uint64_t ptr = (uint64_t)_ptr;
|
||||
if(EntryPresent(ptr)) {
|
||||
auto e = EntryLookup(ptr);
|
||||
AccDiscard(e->second);
|
||||
}
|
||||
}
|
||||
void MemoryManager::Print(void)
|
||||
{
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogDebug << "Memory Manager " << std::endl;
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
|
||||
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
|
||||
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
|
||||
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
|
||||
auto &AccCache = it->second;
|
||||
|
||||
std::string str;
|
||||
if ( AccCache.state==Empty ) str = std::string("Empty");
|
||||
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
|
||||
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
|
||||
if ( AccCache.state==Consistent)str = std::string("Consistent");
|
||||
|
||||
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||
<< "\t" << AccCache.cpuLock
|
||||
<< "\t" << AccCache.accLock
|
||||
<< "\t" << AccCache.LRU_valid<<std::endl;
|
||||
}
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
|
||||
};
|
||||
int MemoryManager::isOpen (void* _CpuPtr)
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
if ( EntryPresent(CpuPtr) ){
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
return AccCache.cpuLock+AccCache.accLock;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
24
Grid/allocator/MemoryManagerShared.cc
Normal file
24
Grid/allocator/MemoryManagerShared.cc
Normal file
@ -0,0 +1,24 @@
|
||||
#include <Grid/GridCore.h>
|
||||
#ifdef GRID_UVM
|
||||
|
||||
#warning "Grid is assuming unified virtual memory address space"
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// View management is 1:1 address space mapping
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
uint64_t MemoryManager::DeviceBytes;
|
||||
uint64_t MemoryManager::DeviceLRUBytes;
|
||||
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
|
||||
uint64_t MemoryManager::HostToDeviceBytes;
|
||||
uint64_t MemoryManager::DeviceToHostBytes;
|
||||
uint64_t MemoryManager::HostToDeviceXfer;
|
||||
uint64_t MemoryManager::DeviceToHostXfer;
|
||||
|
||||
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
||||
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
|
||||
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
|
||||
void MemoryManager::Print(void){};
|
||||
void MemoryManager::NotifyDeletion(void *ptr){};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
#endif
|
67
Grid/allocator/MemoryStats.cc
Normal file
67
Grid/allocator/MemoryStats.cc
Normal file
@ -0,0 +1,67 @@
|
||||
#include <Grid/GridCore.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||
bool MemoryProfiler::debug = false;
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
{
|
||||
#ifdef __linux__
|
||||
int fd = open("/proc/self/pagemap", O_RDONLY);
|
||||
assert(fd >= 0);
|
||||
const int page_size = 4096;
|
||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||
uint64_t pagedata[npages];
|
||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||
assert(ret == offset);
|
||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||
assert(ret == sizeof(uint64_t) * npages);
|
||||
int nhugepages = npages / 512;
|
||||
int n4ktotal, nnothuge;
|
||||
n4ktotal = 0;
|
||||
nnothuge = 0;
|
||||
for (int i = 0; i < nhugepages; ++i) {
|
||||
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
||||
for (int j = 0; j < 512; ++j) {
|
||||
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
||||
++n4ktotal;
|
||||
if (pageaddr != baseaddr + j * page_size)
|
||||
++nnothuge;
|
||||
}
|
||||
}
|
||||
int rank = CartesianCommunicator::RankWorld();
|
||||
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string sizeString(const size_t bytes)
|
||||
{
|
||||
constexpr unsigned int bufSize = 256;
|
||||
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
|
||||
char buf[256];
|
||||
size_t s = 0;
|
||||
double count = bytes;
|
||||
|
||||
while (count >= 1024 && s < 7)
|
||||
{
|
||||
s++;
|
||||
count /= 1024;
|
||||
}
|
||||
if (count - floor(count) == 0.0)
|
||||
{
|
||||
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
|
||||
}
|
||||
else
|
||||
{
|
||||
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
|
||||
}
|
||||
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
95
Grid/allocator/MemoryStats.h
Normal file
95
Grid/allocator/MemoryStats.h
Normal file
@ -0,0 +1,95 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/MemoryStats.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
std::string sizeString(size_t bytes);
|
||||
|
||||
struct MemoryStats
|
||||
{
|
||||
size_t totalAllocated{0}, maxAllocated{0},
|
||||
currentlyAllocated{0}, totalFreed{0};
|
||||
};
|
||||
|
||||
class MemoryProfiler
|
||||
{
|
||||
public:
|
||||
static MemoryStats *stats;
|
||||
static bool debug;
|
||||
};
|
||||
|
||||
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
|
||||
#define profilerDebugPrint \
|
||||
if (MemoryProfiler::stats) \
|
||||
{ \
|
||||
auto s = MemoryProfiler::stats; \
|
||||
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
|
||||
<< std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
|
||||
<< std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
|
||||
<< std::endl; \
|
||||
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
|
||||
<< std::endl; \
|
||||
}
|
||||
|
||||
#define profilerAllocate(bytes) \
|
||||
if (MemoryProfiler::stats) \
|
||||
{ \
|
||||
auto s = MemoryProfiler::stats; \
|
||||
s->totalAllocated += (bytes); \
|
||||
s->currentlyAllocated += (bytes); \
|
||||
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
|
||||
} \
|
||||
if (MemoryProfiler::debug) \
|
||||
{ \
|
||||
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
|
||||
profilerDebugPrint; \
|
||||
}
|
||||
|
||||
#define profilerFree(bytes) \
|
||||
if (MemoryProfiler::stats) \
|
||||
{ \
|
||||
auto s = MemoryProfiler::stats; \
|
||||
s->totalFreed += (bytes); \
|
||||
s->currentlyAllocated -= (bytes); \
|
||||
} \
|
||||
if (MemoryProfiler::debug) \
|
||||
{ \
|
||||
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
|
||||
profilerDebugPrint; \
|
||||
}
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -81,6 +81,7 @@ public:
|
||||
|
||||
bool _isCheckerBoarded;
|
||||
int LocallyPeriodic;
|
||||
Coordinate _checker_dim_mask;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
|
||||
|
||||
public:
|
||||
int dummy;
|
||||
Coordinate _checker_dim_mask;
|
||||
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
||||
return 0;
|
||||
}
|
||||
@ -104,6 +105,7 @@ public:
|
||||
_ldimensions.resize(_ndimension);
|
||||
_rdimensions.resize(_ndimension);
|
||||
_simd_layout.resize(_ndimension);
|
||||
_checker_dim_mask.resize(_ndimension);;
|
||||
_lstart.resize(_ndimension);
|
||||
_lend.resize(_ndimension);
|
||||
|
||||
@ -114,6 +116,8 @@ public:
|
||||
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
_checker_dim_mask[d]=0;
|
||||
|
||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
|
@ -36,11 +36,27 @@ static const int CbBlack=1;
|
||||
static const int Even =CbRed;
|
||||
static const int Odd =CbBlack;
|
||||
|
||||
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
|
||||
{
|
||||
int nd=rdim.size();
|
||||
Coordinate coor(nd);
|
||||
|
||||
Lexicographic::CoorFromIndex(coor,oindex,rdim);
|
||||
|
||||
int linear=0;
|
||||
for(int d=0;d<nd;d++){
|
||||
if(chk_dim_msk[d])
|
||||
linear=linear+coor[d];
|
||||
}
|
||||
return (linear&0x1);
|
||||
}
|
||||
|
||||
|
||||
// Specialise this for red black grids storing half the data like a chess board.
|
||||
class GridRedBlackCartesian : public GridBase
|
||||
{
|
||||
public:
|
||||
Coordinate _checker_dim_mask;
|
||||
// Coordinate _checker_dim_mask;
|
||||
int _checker_dim;
|
||||
std::vector<int> _checker_board;
|
||||
|
||||
|
@ -138,21 +138,6 @@ public:
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
void SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int xmit_to_rank,
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||
|
||||
double StencilSendToRecvFrom(void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
|
@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
|
||||
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
|
||||
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
|
||||
nCommThreads=1;
|
||||
// wrong results here too
|
||||
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
|
||||
// other comms schemes are ok
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
|
||||
#else
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
#endif
|
||||
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
|
||||
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
|
||||
assert(0);
|
||||
@ -294,60 +302,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
|
||||
}
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Status stat;
|
||||
assert(sender != receiver);
|
||||
int tag = sender;
|
||||
if ( _processor == sender ) {
|
||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||
}
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
// Enforce no UVM in comms, device or host OK
|
||||
assert(acceleratorIsCommunicable(xmit));
|
||||
assert(acceleratorIsCommunicable(recv));
|
||||
|
||||
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
} else {
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
|
||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||
recv,bytes,MPI_CHAR,from, from,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@ -403,15 +379,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
||||
@ -422,6 +390,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
}
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
//{
|
||||
//}
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
@ -483,5 +458,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
|
@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int xmit_to_rank,
|
||||
int recv_from_rank,
|
||||
int bytes)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
// Basic Halo comms primitive -- should never call in single node
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
bcopy(in,out,bytes*words);
|
||||
@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
SendToRecvFromComplete(list);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void){};
|
||||
|
@ -29,9 +29,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/GridCore.h>
|
||||
#include <pwd.h>
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
#ifdef GRID_CUDA
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
#define header "SharedMemoryMpi: "
|
||||
@ -47,7 +50,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Split into groups that can share memory
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
#ifndef GRID_MPI3_SHM_NONE
|
||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
|
||||
#else
|
||||
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
|
||||
#endif
|
||||
|
||||
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
|
||||
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
||||
|
||||
@ -170,17 +178,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
|
||||
std::vector<int> primes({2,3,5});
|
||||
|
||||
int dim = 0;
|
||||
int last_dim = ndimension - 1;
|
||||
int AutoShmSize = 1;
|
||||
while(AutoShmSize != WorldShmSize) {
|
||||
for(int p=0;p<primes.size();p++) {
|
||||
int p;
|
||||
for(p=0;p<primes.size();p++) {
|
||||
int prime=primes[p];
|
||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||
AutoShmSize*=prime;
|
||||
ShmDims[dim]*=prime;
|
||||
last_dim = dim;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (p == primes.size() && last_dim == dim) {
|
||||
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
dim=(dim+1) %ndimension;
|
||||
}
|
||||
}
|
||||
@ -413,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbfs mapping intended
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_NVCC
|
||||
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
void * ShmCommBuf ;
|
||||
@ -433,27 +448,18 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
||||
|
||||
#ifdef GRID_IBM_SUMMIT
|
||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
|
||||
#else
|
||||
std::cout << "setting device to WorldShmRank"<<std::endl;
|
||||
cudaSetDevice(WorldShmRank);
|
||||
#endif
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
auto err = cudaMalloc(&ShmCommBuf, bytes);
|
||||
if ( err != cudaSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
}
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
|
||||
@ -462,18 +468,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
|
||||
#ifndef GRID_MPI3_SHM_NONE
|
||||
//////////////////////////////////////////////////
|
||||
// If it is me, pass around the IPC access key
|
||||
//////////////////////////////////////////////////
|
||||
#ifdef GRID_CUDA
|
||||
cudaIpcMemHandle_t handle;
|
||||
|
||||
if ( r==WorldShmRank ) {
|
||||
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||
if ( err != cudaSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
hipIpcMemHandle_t handle;
|
||||
if ( r==WorldShmRank ) {
|
||||
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
|
||||
if ( err != hipSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
//////////////////////////////////////////////////
|
||||
// Share this IPC handle across the Shm Comm
|
||||
//////////////////////////////////////////////////
|
||||
@ -490,17 +508,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// If I am not the source, overwrite thisBuf with remote buffer
|
||||
///////////////////////////////////////////////////////////////
|
||||
void * thisBuf = ShmCommBuf;
|
||||
#ifdef GRID_CUDA
|
||||
if ( r!=WorldShmRank ) {
|
||||
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||
if ( err != cudaSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
if ( r!=WorldShmRank ) {
|
||||
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
|
||||
if ( err != hipSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Save a copy of the device buffers
|
||||
///////////////////////////////////////////////////////////////
|
||||
WorldShmCommBufs[r] = thisBuf;
|
||||
#else
|
||||
WorldShmCommBufs[r] = ShmCommBuf;
|
||||
#endif
|
||||
}
|
||||
|
||||
_ShmAllocBytes=bytes;
|
||||
@ -677,7 +709,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
{
|
||||
#ifdef GRID_NVCC
|
||||
#ifdef GRID_CUDA
|
||||
cudaMemset(dest,0,bytes);
|
||||
#else
|
||||
bzero(dest,bytes);
|
||||
@ -685,7 +717,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
||||
{
|
||||
#ifdef GRID_NVCC
|
||||
#ifdef GRID_CUDA
|
||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||
#else
|
||||
bcopy(src,dest,bytes);
|
||||
@ -705,7 +737,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Split into groups that can share memory
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
#ifndef GRID_MPI3_SHM_NONE
|
||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
||||
#else
|
||||
MPI_Comm_split(comm, rank, 0, &ShmComm);
|
||||
#endif
|
||||
MPI_Comm_rank(ShmComm ,&ShmRank);
|
||||
MPI_Comm_size(ShmComm ,&ShmSize);
|
||||
ShmCommBufs.resize(ShmSize);
|
||||
|
@ -52,23 +52,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<typename Op, typename T1>
|
||||
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
|
||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
||||
{
|
||||
return Cshift(closure(expr),dim,shift);
|
||||
}
|
||||
template <class Op, class T1, class T2>
|
||||
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
|
||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
||||
{
|
||||
return Cshift(closure(expr),dim,shift);
|
||||
}
|
||||
template <class Op, class T1, class T2, class T3>
|
||||
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
|
||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||
eval(0, expr.arg2),
|
||||
eval(0, expr.arg3)))>
|
||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
||||
{
|
||||
return Cshift(closure(expr),dim,shift);
|
||||
}
|
||||
|
@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern Vector<std::pair<int,int> > Cshift_table;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
///////////////////////////////////////////////////////////////////
|
||||
@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
||||
int e2=rhs.Grid()->_slice_block[dimension];
|
||||
int ent = 0;
|
||||
|
||||
static Vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||
|
||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||
|
||||
auto rhs_v = rhs.View();
|
||||
if ( cbmask == 0x3 ) {
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*stride;
|
||||
int bo = n*e2;
|
||||
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
|
||||
Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -65,15 +67,20 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
||||
int o = n*stride;
|
||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb &cbmask ) {
|
||||
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
|
||||
Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
thread_for(i,ent,{
|
||||
buffer[table[i].first]=rhs_v[table[i].second];
|
||||
{
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
auto buffer_p = & buffer[0];
|
||||
auto table = &Cshift_table[0];
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Gather for when there *is* need to SIMD split
|
||||
@ -95,35 +102,37 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
int e2=rhs.Grid()->_slice_block[dimension];
|
||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||
|
||||
auto rhs_v = rhs.View();
|
||||
if ( cbmask ==0x3){
|
||||
thread_for_collapse(2,n,e1,{
|
||||
for(int b=0;b<e2;b++){
|
||||
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for2d(n,e1,b,e2,1,{
|
||||
int o = n*n1;
|
||||
int offset = b+n*e2;
|
||||
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
|
||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||
// Test_cshift_red_black code.
|
||||
std::cout << " Dense packed buffer WARNING " <<std::endl;
|
||||
thread_for_collapse(2,n,e1,{
|
||||
for(int b=0;b<e2;b++){
|
||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||
accelerator_for2d(n,e1,b,e2,1,{
|
||||
|
||||
Coordinate coor;
|
||||
|
||||
int o=n*n1;
|
||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||
int oindex = o+b;
|
||||
|
||||
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||
|
||||
int ocb=1<<cb;
|
||||
int offset = b+n*e2;
|
||||
|
||||
if ( ocb & cbmask ) {
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
||||
int e2=rhs.Grid()->_slice_block[dimension];
|
||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||
|
||||
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||
|
||||
int ent =0;
|
||||
|
||||
if ( cbmask ==0x3 ) {
|
||||
@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||
int bo =n*rhs.Grid()->_slice_block[dimension];
|
||||
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
|
||||
Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
|
||||
}
|
||||
}
|
||||
|
||||
@ -165,17 +175,21 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
table[ent++]=std::pair<int,int> (so+o+b,bo++);
|
||||
Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rhs_v = rhs.View();
|
||||
thread_for(i,ent,{
|
||||
rhs_v[table[i].first]=buffer[table[i].second];
|
||||
{
|
||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||
auto buffer_p = & buffer[0];
|
||||
auto table = &Cshift_table[0];
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there *is* need to SIMD split
|
||||
@ -194,13 +208,13 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
int e2=rhs.Grid()->_slice_block[dimension];
|
||||
|
||||
if(cbmask ==0x3 ) {
|
||||
auto rhs_v = rhs.View();
|
||||
thread_for_collapse(2,n,e1,{
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||
accelerator_for2d(n,e1,b,e2,1,{
|
||||
int o = n*_slice_stride;
|
||||
int offset = b+n*_slice_block;
|
||||
merge(rhs_v[so+o+b],pointers,offset);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
|
||||
@ -208,7 +222,8 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
// Test_cshift_red_black code.
|
||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||
auto rhs_v = rhs.View();
|
||||
assert(0); // This will fail if hit on GPU
|
||||
autoView( rhs_v, rhs, CpuWrite);
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||
@ -225,6 +240,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
//////////////////////////////////////////////////////
|
||||
// local to node block strided copies
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||
{
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
@ -239,14 +255,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
||||
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||
int e2=rhs.Grid()->_slice_block[dimension];
|
||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||
|
||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||
|
||||
int ent=0;
|
||||
|
||||
if(cbmask == 0x3 ){
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*stride+b;
|
||||
table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -255,23 +273,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
||||
int o =n*stride+b;
|
||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
||||
if ( ocb&cbmask ) {
|
||||
table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rhs_v = rhs.View();
|
||||
auto lhs_v = lhs.View();
|
||||
thread_for(i,ent,{
|
||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||
{
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||
auto table = &Cshift_table[0];
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||
{
|
||||
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
|
||||
@ -285,30 +304,34 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
||||
int e2=rhs.Grid()->_slice_block [dimension];
|
||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||
|
||||
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
||||
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||
|
||||
int ent=0;
|
||||
|
||||
if ( cbmask == 0x3 ) {
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*stride;
|
||||
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||
}}
|
||||
} else {
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*stride;
|
||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||
if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||
}}
|
||||
}
|
||||
|
||||
auto rhs_v = rhs.View();
|
||||
auto lhs_v = lhs.View();
|
||||
thread_for(i,ent,{
|
||||
{
|
||||
autoView( rhs_v, rhs, AcceleratorRead);
|
||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||
auto table = &Cshift_table[0];
|
||||
accelerator_for(i,ent,1,{
|
||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Local to node Cshift
|
||||
|
4
Grid/cshift/Cshift_table.cc
Normal file
4
Grid/cshift/Cshift_table.cc
Normal file
@ -0,0 +1,4 @@
|
||||
#include <Grid/GridCore.h>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
Vector<std::pair<int,int> > Cshift_table;
|
||||
NAMESPACE_END(Grid);
|
@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
#include <Grid/lattice/Lattice_view.h>
|
||||
#include <Grid/lattice/Lattice_base.h>
|
||||
#include <Grid/lattice/Lattice_conformable.h>
|
||||
#include <Grid/lattice/Lattice_ET.h>
|
||||
@ -36,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/lattice/Lattice_reduction.h>
|
||||
#include <Grid/lattice/Lattice_peekpoke.h>
|
||||
//#include <Grid/lattice/Lattice_reality.h>
|
||||
#include <Grid/lattice/Lattice_real_imag.h>
|
||||
#include <Grid/lattice/Lattice_comparison_utils.h>
|
||||
#include <Grid/lattice/Lattice_comparison.h>
|
||||
#include <Grid/lattice/Lattice_coordinate.h>
|
||||
|
@ -42,9 +42,24 @@ NAMESPACE_BEGIN(Grid);
|
||||
////////////////////////////////////////////////////
|
||||
// Predicated where support
|
||||
////////////////////////////////////////////////////
|
||||
#ifdef GRID_SIMT
|
||||
// drop to scalar in SIMT; cleaner in fact
|
||||
template <class iobj, class vobj, class robj>
|
||||
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
|
||||
const robj &iffalse) {
|
||||
accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
||||
const vobj &iftrue,
|
||||
const robj &iffalse)
|
||||
{
|
||||
Integer mask = TensorRemove(predicate);
|
||||
typename std::remove_const<vobj>::type ret= iffalse;
|
||||
if (mask) ret=iftrue;
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
template <class iobj, class vobj, class robj>
|
||||
accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
||||
const vobj &iftrue,
|
||||
const robj &iffalse)
|
||||
{
|
||||
typename std::remove_const<vobj>::type ret;
|
||||
|
||||
typedef typename vobj::scalar_object scalar_object;
|
||||
@ -68,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
|
||||
merge(ret, falsevals);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
//Specialization of getVectorType for lattices
|
||||
@ -81,26 +97,62 @@ struct getVectorType<Lattice<T> >{
|
||||
//-- recursive evaluation of expressions; --
|
||||
// handle leaves of syntax tree
|
||||
///////////////////////////////////////////////////
|
||||
template<class sobj> accelerator_inline
|
||||
template<class sobj,
|
||||
typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
|
||||
accelerator_inline
|
||||
sobj eval(const uint64_t ss, const sobj &arg)
|
||||
{
|
||||
return arg;
|
||||
}
|
||||
|
||||
template <class lobj> accelerator_inline
|
||||
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
|
||||
auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
|
||||
{
|
||||
return arg[ss];
|
||||
return arg(ss);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////
|
||||
//-- recursive evaluation of expressions; --
|
||||
// whole vector return, used only for expression return type inference
|
||||
///////////////////////////////////////////////////
|
||||
template<class sobj> accelerator_inline
|
||||
sobj vecEval(const uint64_t ss, const sobj &arg)
|
||||
{
|
||||
return arg;
|
||||
}
|
||||
template <class lobj> accelerator_inline
|
||||
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
||||
const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
|
||||
{
|
||||
auto view = arg.AcceleratorView(ViewRead);
|
||||
return view[ss];
|
||||
return arg[ss];
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// handle nodes in syntax tree- eval one operand
|
||||
// vecEval needed (but never called as all expressions offloaded) to infer the return type
|
||||
// in SIMT contexts of closure.
|
||||
///////////////////////////////////////////////////
|
||||
template <typename Op, typename T1> accelerator_inline
|
||||
auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
||||
-> decltype(expr.op.func( vecEval(ss, expr.arg1)))
|
||||
{
|
||||
return expr.op.func( vecEval(ss, expr.arg1) );
|
||||
}
|
||||
// vecEval two operands
|
||||
template <typename Op, typename T1, typename T2> accelerator_inline
|
||||
auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||
-> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
|
||||
{
|
||||
return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
|
||||
}
|
||||
// vecEval three operands
|
||||
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
||||
auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||
-> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
|
||||
{
|
||||
return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// handle nodes in syntax tree- eval one operand coalesced
|
||||
///////////////////////////////////////////////////
|
||||
template <typename Op, typename T1> accelerator_inline
|
||||
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
||||
@ -108,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
||||
{
|
||||
return expr.op.func( eval(ss, expr.arg1) );
|
||||
}
|
||||
///////////////////////
|
||||
// eval two operands
|
||||
///////////////////////
|
||||
template <typename Op, typename T1, typename T2> accelerator_inline
|
||||
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
|
||||
{
|
||||
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
|
||||
}
|
||||
///////////////////////
|
||||
// eval three operands
|
||||
///////////////////////
|
||||
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
||||
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
|
||||
-> decltype(expr.op.func(eval(ss, expr.arg1),
|
||||
eval(ss, expr.arg2),
|
||||
eval(ss, expr.arg3)))
|
||||
{
|
||||
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
|
||||
#ifdef GRID_SIMT
|
||||
// Handles Nsimd (vInteger) != Nsimd(ComplexD)
|
||||
typedef decltype(vecEval(ss, expr.arg2)) rvobj;
|
||||
typedef typename std::remove_reference<rvobj>::type vobj;
|
||||
|
||||
const int Nsimd = vobj::vector_type::Nsimd();
|
||||
|
||||
auto vpred = vecEval(ss,expr.arg1);
|
||||
|
||||
ExtractBuffer<Integer> mask(Nsimd);
|
||||
extract<vInteger, Integer>(TensorRemove(vpred), mask);
|
||||
|
||||
int s = acceleratorSIMTlane(Nsimd);
|
||||
return expr.op.func(mask[s],
|
||||
eval(ss, expr.arg2),
|
||||
eval(ss, expr.arg3));
|
||||
#else
|
||||
return expr.op.func(eval(ss, expr.arg1),
|
||||
eval(ss, expr.arg2),
|
||||
eval(ss, expr.arg3));
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@ -180,16 +250,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
|
||||
cb = lat.Checkerboard();
|
||||
}
|
||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||
inline void CBFromExpression(int &cb, const T1 ¬lat) // non-lattice leaf
|
||||
{
|
||||
}
|
||||
|
||||
inline void CBFromExpression(int &cb, const T1 ¬lat) {} // non-lattice leaf
|
||||
template <typename Op, typename T1> inline
|
||||
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
|
||||
{
|
||||
CBFromExpression(cb, expr.arg1); // recurse AST
|
||||
}
|
||||
|
||||
template <typename Op, typename T1, typename T2> inline
|
||||
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||
{
|
||||
@ -204,13 +270,74 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
|
||||
CBFromExpression(cb, expr.arg3); // recurse AST
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// ViewOpen
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
|
||||
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
|
||||
{
|
||||
lat.ViewOpen(AcceleratorRead);
|
||||
}
|
||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||
inline void ExpressionViewOpen(T1 ¬lat) {}
|
||||
|
||||
template <typename Op, typename T1> inline
|
||||
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
|
||||
{
|
||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||
}
|
||||
|
||||
template <typename Op, typename T1, typename T2> inline
|
||||
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||
{
|
||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||
ExpressionViewOpen(expr.arg2); // rrecurse AST
|
||||
}
|
||||
template <typename Op, typename T1, typename T2, typename T3>
|
||||
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||
{
|
||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||
ExpressionViewOpen(expr.arg2); // recurse AST
|
||||
ExpressionViewOpen(expr.arg3); // recurse AST
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// ViewClose
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
|
||||
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
|
||||
{
|
||||
lat.ViewClose();
|
||||
}
|
||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||
inline void ExpressionViewClose(T1 ¬lat) {}
|
||||
|
||||
template <typename Op, typename T1> inline
|
||||
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
|
||||
{
|
||||
ExpressionViewClose(expr.arg1); // recurse AST
|
||||
}
|
||||
template <typename Op, typename T1, typename T2> inline
|
||||
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||
{
|
||||
ExpressionViewClose(expr.arg1); // recurse AST
|
||||
ExpressionViewClose(expr.arg2); // recurse AST
|
||||
}
|
||||
template <typename Op, typename T1, typename T2, typename T3>
|
||||
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||
{
|
||||
ExpressionViewClose(expr.arg1); // recurse AST
|
||||
ExpressionViewClose(expr.arg2); // recurse AST
|
||||
ExpressionViewClose(expr.arg3); // recurse AST
|
||||
}
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Unary operators and funcs
|
||||
////////////////////////////////////////////
|
||||
#define GridUnopClass(name, ret) \
|
||||
template <class arg> \
|
||||
struct name { \
|
||||
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
|
||||
template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
|
||||
};
|
||||
|
||||
GridUnopClass(UnarySub, -a);
|
||||
@ -221,8 +348,6 @@ GridUnopClass(UnaryTrace, trace(a));
|
||||
GridUnopClass(UnaryTranspose, transpose(a));
|
||||
GridUnopClass(UnaryTa, Ta(a));
|
||||
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
||||
GridUnopClass(UnaryReal, real(a));
|
||||
GridUnopClass(UnaryImag, imag(a));
|
||||
GridUnopClass(UnaryToReal, toReal(a));
|
||||
GridUnopClass(UnaryToComplex, toComplex(a));
|
||||
GridUnopClass(UnaryTimesI, timesI(a));
|
||||
@ -241,10 +366,10 @@ GridUnopClass(UnaryExp, exp(a));
|
||||
// Binary operators
|
||||
////////////////////////////////////////////
|
||||
#define GridBinOpClass(name, combination) \
|
||||
template <class left, class right> \
|
||||
struct name { \
|
||||
template <class _left, class _right> \
|
||||
static auto accelerator_inline \
|
||||
func(const left &lhs, const right &rhs) \
|
||||
func(const _left &lhs, const _right &rhs) \
|
||||
-> decltype(combination) const \
|
||||
{ \
|
||||
return combination; \
|
||||
@ -264,10 +389,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
|
||||
// Trinary conditional op
|
||||
////////////////////////////////////////////////////
|
||||
#define GridTrinOpClass(name, combination) \
|
||||
template <class predicate, class left, class right> \
|
||||
struct name { \
|
||||
template <class _predicate,class _left, class _right> \
|
||||
static auto accelerator_inline \
|
||||
func(const predicate &pred, const left &lhs, const right &rhs) \
|
||||
func(const _predicate &pred, const _left &lhs, const _right &rhs) \
|
||||
-> decltype(combination) const \
|
||||
{ \
|
||||
return combination; \
|
||||
@ -275,17 +400,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
|
||||
};
|
||||
|
||||
GridTrinOpClass(TrinaryWhere,
|
||||
(predicatedWhere<predicate,
|
||||
typename std::remove_reference<left>::type,
|
||||
typename std::remove_reference<right>::type>(pred, lhs,rhs)));
|
||||
(predicatedWhere<
|
||||
typename std::remove_reference<_predicate>::type,
|
||||
typename std::remove_reference<_left>::type,
|
||||
typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Operator syntactical glue
|
||||
////////////////////////////////////////////
|
||||
|
||||
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
|
||||
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
|
||||
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
|
||||
#define GRID_UNOP(name) name
|
||||
#define GRID_BINOP(name) name
|
||||
#define GRID_TRINOP(name) name
|
||||
|
||||
#define GRID_DEF_UNOP(op, name) \
|
||||
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
|
||||
@ -337,8 +462,6 @@ GRID_DEF_UNOP(trace, UnaryTrace);
|
||||
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
||||
GRID_DEF_UNOP(Ta, UnaryTa);
|
||||
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
||||
GRID_DEF_UNOP(real, UnaryReal);
|
||||
GRID_DEF_UNOP(imag, UnaryImag);
|
||||
GRID_DEF_UNOP(toReal, UnaryToReal);
|
||||
GRID_DEF_UNOP(toComplex, UnaryToComplex);
|
||||
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
||||
@ -371,29 +494,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
|
||||
/////////////////////////////////////////////////////////////
|
||||
template <class Op, class T1>
|
||||
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
|
||||
{
|
||||
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
|
||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
|
||||
return ret;
|
||||
}
|
||||
template <class Op, class T1, class T2>
|
||||
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
|
||||
{
|
||||
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
|
||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
|
||||
return ret;
|
||||
}
|
||||
template <class Op, class T1, class T2, class T3>
|
||||
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||
eval(0, expr.arg2),
|
||||
eval(0, expr.arg3)))>
|
||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
|
||||
vecEval(0, expr.arg2),
|
||||
vecEval(0, expr.arg3)))>
|
||||
{
|
||||
Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||
eval(0, expr.arg2),
|
||||
eval(0, expr.arg3)))> ret(expr);
|
||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
|
||||
vecEval(0, expr.arg2),
|
||||
vecEval(0, expr.arg3)))> ret(expr);
|
||||
return ret;
|
||||
}
|
||||
#define EXPRESSION_CLOSURE(function) \
|
||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
|
||||
auto function(Expression &expr) -> decltype(function(closure(expr))) \
|
||||
{ \
|
||||
return function(closure(expr)); \
|
||||
}
|
||||
|
||||
|
||||
#undef GRID_UNOP
|
||||
#undef GRID_BINOP
|
||||
|
@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
|
||||
template<class obj1,class obj2,class obj3> inline
|
||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
conformable(ret,rhs);
|
||||
conformable(lhs,rhs);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
@ -56,13 +56,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
conformable(lhs,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto lhs_t=lhs_v(ss);
|
||||
auto rhs_t=rhs_v(ss);
|
||||
auto tmp =ret_v(ss);
|
||||
mac(&tmp,&lhs_t,&rhs_t);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
});
|
||||
@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
conformable(lhs,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto lhs_t=lhs_v(ss);
|
||||
@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
conformable(lhs,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto lhs_t=lhs_v(ss);
|
||||
@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(lhs,ret);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
mult(&tmp,&lhs_v(ss),&rhs);
|
||||
@ -121,10 +121,10 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(ret,lhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto tmp =ret_v(ss);
|
||||
auto lhs_t=lhs_v(ss);
|
||||
mac(&tmp,&lhs_t,&rhs);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(ret,lhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto lhs_t=lhs_v(ss);
|
||||
@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
conformable(lhs,ret);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto lhs_t=lhs_v(ss);
|
||||
@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( rhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto rhs_t=rhs_v(ss);
|
||||
@ -179,10 +179,10 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( rhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto tmp =ret_v(ss);
|
||||
auto rhs_t=rhs_v(ss);
|
||||
mac(&tmp,&lhs,&rhs_t);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( rhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto rhs_t=rhs_v(ss);
|
||||
@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
|
||||
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
conformable(ret,rhs);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto rhs_v = lhs.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( rhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto rhs_t=rhs_v(ss);
|
||||
@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
|
||||
ret.Checkerboard() = x.Checkerboard();
|
||||
conformable(ret,x);
|
||||
conformable(x,y);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto x_v = x.AcceleratorView(ViewRead);
|
||||
auto y_v = y.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( x_v , x, AcceleratorRead);
|
||||
autoView( y_v , y, AcceleratorRead);
|
||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||
auto tmp = a*x_v(ss)+y_v(ss);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
||||
ret.Checkerboard() = x.Checkerboard();
|
||||
conformable(ret,x);
|
||||
conformable(x,y);
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
auto x_v = x.AcceleratorView(ViewRead);
|
||||
auto y_v = y.AcceleratorView(ViewRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( x_v , x, AcceleratorRead);
|
||||
autoView( y_v , y, AcceleratorRead);
|
||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
|
@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#pragma once
|
||||
|
||||
#define STREAMING_STORES
|
||||
@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern int GridCshiftPermuteMap[4][16];
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Base class which can be used by traits to pick up behaviour
|
||||
///////////////////////////////////////////////////////////////////
|
||||
class LatticeBase {};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Conformable checks; same instance of Grid required
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
||||
{
|
||||
assert(lhs == rhs);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Advise the LatticeAccelerator class
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum LatticeAcceleratorAdvise {
|
||||
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
|
||||
// significantly influence performance of bulk storage.
|
||||
AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures
|
||||
// enables read-only copies of memory to be kept on
|
||||
// host and device.
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// View Access Mode
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum ViewMode {
|
||||
ViewRead = 0x1,
|
||||
ViewWrite = 0x2,
|
||||
ViewReadWrite = 0x3
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Minimal base class containing only data valid to access from accelerator
|
||||
// _odata will be a managed pointer in CUDA
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Force access to lattice through a view object.
|
||||
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
|
||||
// strict since host could could in principle direct access through the lattice object
|
||||
// Need to decide programming model.
|
||||
#define LATTICE_VIEW_STRICT
|
||||
template<class vobj> class LatticeAccelerator : public LatticeBase
|
||||
{
|
||||
protected:
|
||||
GridBase *_grid;
|
||||
int checkerboard;
|
||||
vobj *_odata; // A managed pointer
|
||||
uint64_t _odata_size;
|
||||
public:
|
||||
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
|
||||
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
|
||||
accelerator_inline int Checkerboard(void) const { return checkerboard; };
|
||||
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
|
||||
accelerator_inline void Conformable(GridBase * &grid) const
|
||||
{
|
||||
if (grid) conformable(grid, _grid);
|
||||
else grid = _grid;
|
||||
};
|
||||
|
||||
accelerator_inline void Advise(int advise) {
|
||||
#ifdef GRID_NVCC
|
||||
#ifndef __CUDA_ARCH__ // only on host
|
||||
if (advise & AdviseInfrequentUse) {
|
||||
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
|
||||
}
|
||||
if (advise & AdviseReadMostly) {
|
||||
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
|
||||
#ifdef GRID_NVCC
|
||||
#ifndef __CUDA_ARCH__ // only on host
|
||||
int target;
|
||||
cudaGetDevice(&target);
|
||||
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
|
||||
#ifdef GRID_NVCC
|
||||
#ifndef __CUDA_ARCH__ // only on host
|
||||
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// A View class which provides accessor to the data.
|
||||
// This will be safe to call from accelerator_for and is trivially copy constructible
|
||||
// The copy constructor for this will need to be used by device lambda functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj>
|
||||
class LatticeView : public LatticeAccelerator<vobj>
|
||||
{
|
||||
public:
|
||||
|
||||
|
||||
// Rvalue
|
||||
#ifdef __CUDA_ARCH__
|
||||
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
|
||||
#else
|
||||
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
||||
#endif
|
||||
|
||||
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
||||
|
||||
accelerator_inline uint64_t begin(void) const { return 0;};
|
||||
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
||||
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
||||
|
||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Lattice expression types used by ET to assemble the AST
|
||||
//
|
||||
// Need to be able to detect code paths according to the whether a lattice object or not
|
||||
// so introduce some trait type things
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class LatticeExpressionBase {};
|
||||
|
||||
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
||||
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
||||
|
||||
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
||||
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
||||
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
||||
|
||||
template <typename Op, typename _T1>
|
||||
class LatticeUnaryExpression : public LatticeExpressionBase
|
||||
{
|
||||
public:
|
||||
typedef typename ViewMap<_T1>::Type T1;
|
||||
Op op;
|
||||
T1 arg1;
|
||||
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
|
||||
};
|
||||
|
||||
template <typename Op, typename _T1, typename _T2>
|
||||
class LatticeBinaryExpression : public LatticeExpressionBase
|
||||
{
|
||||
public:
|
||||
typedef typename ViewMap<_T1>::Type T1;
|
||||
typedef typename ViewMap<_T2>::Type T2;
|
||||
Op op;
|
||||
T1 arg1;
|
||||
T2 arg2;
|
||||
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
|
||||
};
|
||||
|
||||
template <typename Op, typename _T1, typename _T2, typename _T3>
|
||||
class LatticeTrinaryExpression : public LatticeExpressionBase
|
||||
{
|
||||
public:
|
||||
typedef typename ViewMap<_T1>::Type T1;
|
||||
typedef typename ViewMap<_T2>::Type T2;
|
||||
typedef typename ViewMap<_T3>::Type T3;
|
||||
Op op;
|
||||
T1 arg1;
|
||||
T2 arg2;
|
||||
T3 arg3;
|
||||
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// The real lattice class, with normal copy and assignment semantics.
|
||||
// This contains extra (host resident) grid pointer data that may be accessed by host code
|
||||
@ -253,28 +80,23 @@ private:
|
||||
}
|
||||
}
|
||||
public:
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
void SetViewMode(ViewMode mode) {
|
||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
||||
accessor.ViewClose();
|
||||
}
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// Return a view object that may be dereferenced in site loops.
|
||||
// The view is trivially copy constructible and may be copied to an accelerator device
|
||||
// in device lambdas
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
|
||||
{ // and HostView for thread_for
|
||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
||||
return accessor;
|
||||
}
|
||||
|
||||
LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const
|
||||
LatticeView<vobj> View (ViewMode mode) const
|
||||
{
|
||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
||||
accessor.AcceleratorPrefetch(mode);
|
||||
return accessor;
|
||||
}
|
||||
|
||||
LatticeView<vobj> HostView(int mode = ViewReadWrite) const
|
||||
{
|
||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
||||
accessor.HostPrefetch(mode);
|
||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
||||
return accessor;
|
||||
}
|
||||
|
||||
@ -298,11 +120,15 @@ public:
|
||||
assert( (cb==Odd) || (cb==Even));
|
||||
this->checkerboard=cb;
|
||||
|
||||
auto me = AcceleratorView(ViewWrite);
|
||||
accelerator_for(ss,me.size(),1,{
|
||||
auto tmp = eval(ss,expr);
|
||||
vstream(me[ss],tmp);
|
||||
auto exprCopy = expr;
|
||||
ExpressionViewOpen(exprCopy);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
auto tmp = eval(ss,exprCopy);
|
||||
coalescedWrite(me[ss],tmp);
|
||||
});
|
||||
me.ViewClose();
|
||||
ExpressionViewClose(exprCopy);
|
||||
return *this;
|
||||
}
|
||||
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
|
||||
@ -317,11 +143,15 @@ public:
|
||||
assert( (cb==Odd) || (cb==Even));
|
||||
this->checkerboard=cb;
|
||||
|
||||
auto me = AcceleratorView(ViewWrite);
|
||||
accelerator_for(ss,me.size(),1,{
|
||||
auto tmp = eval(ss,expr);
|
||||
vstream(me[ss],tmp);
|
||||
auto exprCopy = expr;
|
||||
ExpressionViewOpen(exprCopy);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
auto tmp = eval(ss,exprCopy);
|
||||
coalescedWrite(me[ss],tmp);
|
||||
});
|
||||
me.ViewClose();
|
||||
ExpressionViewClose(exprCopy);
|
||||
return *this;
|
||||
}
|
||||
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
|
||||
@ -335,11 +165,15 @@ public:
|
||||
CBFromExpression(cb,expr);
|
||||
assert( (cb==Odd) || (cb==Even));
|
||||
this->checkerboard=cb;
|
||||
auto me = AcceleratorView(ViewWrite);
|
||||
accelerator_for(ss,me.size(),1,{
|
||||
auto tmp = eval(ss,expr);
|
||||
vstream(me[ss],tmp);
|
||||
auto exprCopy = expr;
|
||||
ExpressionViewOpen(exprCopy);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
auto tmp = eval(ss,exprCopy);
|
||||
coalescedWrite(me[ss],tmp);
|
||||
});
|
||||
me.ViewClose();
|
||||
ExpressionViewClose(exprCopy);
|
||||
return *this;
|
||||
}
|
||||
//GridFromExpression is tricky to do
|
||||
@ -390,10 +224,11 @@ public:
|
||||
}
|
||||
|
||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||
auto me = View();
|
||||
auto me = View(CpuWrite);
|
||||
thread_for(ss,me.size(),{
|
||||
me[ss]= r;
|
||||
});
|
||||
me.ViewClose();
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -403,11 +238,12 @@ public:
|
||||
///////////////////////////////////////////
|
||||
// user defined constructor
|
||||
///////////////////////////////////////////
|
||||
Lattice(GridBase *grid) {
|
||||
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
|
||||
this->_grid = grid;
|
||||
resize(this->_grid->oSites());
|
||||
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
|
||||
this->checkerboard=0;
|
||||
SetViewMode(mode);
|
||||
}
|
||||
|
||||
// virtual ~Lattice(void) = default;
|
||||
@ -445,11 +281,12 @@ public:
|
||||
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
||||
conformable(*this,r);
|
||||
this->checkerboard = r.Checkerboard();
|
||||
auto me = AcceleratorView(ViewWrite);
|
||||
auto him= r.AcceleratorView(ViewRead);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
auto him= r.View(AcceleratorRead);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(me[ss],him(ss));
|
||||
});
|
||||
me.ViewClose(); him.ViewClose();
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -459,11 +296,12 @@ public:
|
||||
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
||||
this->checkerboard = r.Checkerboard();
|
||||
conformable(*this,r);
|
||||
auto me = AcceleratorView(ViewWrite);
|
||||
auto him= r.AcceleratorView(ViewRead);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
auto him= r.View(AcceleratorRead);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(me[ss],him(ss));
|
||||
});
|
||||
me.ViewClose(); him.ViewClose();
|
||||
return *this;
|
||||
}
|
||||
///////////////////////////////////////////
|
||||
|
@ -51,20 +51,23 @@ template<class VField, class Matrix>
|
||||
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
{
|
||||
typedef decltype(basis[0]) Field;
|
||||
typedef decltype(basis[0].View()) View;
|
||||
auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
|
||||
Vector<View> basis_v(basis.size(),tmp_v);
|
||||
typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
|
||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||
|
||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||
GridBase* grid = basis[0].Grid();
|
||||
|
||||
for(int k=0;k<basis.size();k++){
|
||||
basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
|
||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||
}
|
||||
|
||||
#ifndef GRID_NVCC
|
||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
|
||||
int max_threads = thread_max();
|
||||
Vector < vobj > Bt(Nm * max_threads);
|
||||
thread_region
|
||||
{
|
||||
std::vector < vobj > B(Nm); // Thread private
|
||||
vobj* B = &Bt[Nm * thread_num()];
|
||||
thread_for_in_region(ss, grid->oSites(),{
|
||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||
|
||||
@ -79,6 +82,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
});
|
||||
}
|
||||
#else
|
||||
View *basis_vp = &basis_v[0];
|
||||
|
||||
int nrot = j1-j0;
|
||||
if (!nrot) // edge case not handled gracefully by Cuda
|
||||
return;
|
||||
@ -90,8 +95,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
auto Bp=&Bt[0];
|
||||
|
||||
// GPU readable copy of matrix
|
||||
Vector<double> Qt_jv(Nm*Nm);
|
||||
double *Qt_p = & Qt_jv[0];
|
||||
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
||||
Coeff_t *Qt_p = & Qt_jv[0];
|
||||
thread_for(i,Nm*Nm,{
|
||||
int j = i/Nm;
|
||||
int k = i%Nm;
|
||||
@ -133,26 +138,30 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||
}
|
||||
|
||||
// Extract a single rotated vector
|
||||
template<class Field>
|
||||
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
||||
{
|
||||
typedef decltype(basis[0].AcceleratorView()) View;
|
||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||
typedef typename Field::vector_object vobj;
|
||||
GridBase* grid = basis[0].Grid();
|
||||
|
||||
result.Checkerboard() = basis[0].Checkerboard();
|
||||
auto result_v=result.AcceleratorView(ViewWrite);
|
||||
Vector<View> basis_v(basis.size(),result_v);
|
||||
|
||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||
for(int k=0;k<basis.size();k++){
|
||||
basis_v[k] = basis[k].AcceleratorView(ViewRead);
|
||||
basis_v.push_back(basis[k].View(AcceleratorRead));
|
||||
}
|
||||
vobj zz=Zero();
|
||||
Vector<double> Qt_jv(Nm);
|
||||
double * Qt_j = & Qt_jv[0];
|
||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||
|
||||
autoView(result_v,result,AcceleratorWrite);
|
||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||
auto B=coalescedRead(zz);
|
||||
for(int k=k0; k<k1; ++k){
|
||||
@ -160,6 +169,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
||||
}
|
||||
coalescedWrite(result_v[ss], B);
|
||||
});
|
||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
|
@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
typedef iScalar<vInteger> vPredicate ;
|
||||
|
||||
/*
|
||||
template <class iobj, class vobj, class robj> accelerator_inline
|
||||
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
|
||||
{
|
||||
typename std::remove_const<vobj>::type ret;
|
||||
|
||||
typedef typename vobj::scalar_object scalar_object;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
const int Nsimd = vobj::vector_type::Nsimd();
|
||||
|
||||
ExtractBuffer<Integer> mask(Nsimd);
|
||||
ExtractBuffer<scalar_object> truevals(Nsimd);
|
||||
ExtractBuffer<scalar_object> falsevals(Nsimd);
|
||||
|
||||
extract(iftrue, truevals);
|
||||
extract(iffalse, falsevals);
|
||||
extract<vInteger, Integer>(TensorRemove(predicate), mask);
|
||||
|
||||
for (int s = 0; s < Nsimd; s++) {
|
||||
if (mask[s]) falsevals[s] = truevals[s];
|
||||
}
|
||||
|
||||
merge(ret, falsevals);
|
||||
return ret;
|
||||
}
|
||||
*/
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// compare lattice to lattice
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@ -78,9 +50,9 @@ template<class vfunctor,class lobj,class robj>
|
||||
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
||||
{
|
||||
Lattice<vPredicate> ret(rhs.Grid());
|
||||
auto lhs_v = lhs.View();
|
||||
auto rhs_v = rhs.View();
|
||||
auto ret_v = ret.View();
|
||||
autoView( lhs_v, lhs, CpuRead);
|
||||
autoView( rhs_v, rhs, CpuRead);
|
||||
autoView( ret_v, ret, CpuWrite);
|
||||
thread_for( ss, rhs_v.size(), {
|
||||
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
||||
});
|
||||
@ -93,8 +65,8 @@ template<class vfunctor,class lobj,class robj>
|
||||
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
||||
{
|
||||
Lattice<vPredicate> ret(lhs.Grid());
|
||||
auto lhs_v = lhs.View();
|
||||
auto ret_v = ret.View();
|
||||
autoView( lhs_v, lhs, CpuRead);
|
||||
autoView( ret_v, ret, CpuWrite);
|
||||
thread_for( ss, lhs_v.size(), {
|
||||
ret_v[ss]=op(lhs_v[ss],rhs);
|
||||
});
|
||||
@ -107,8 +79,8 @@ template<class vfunctor,class lobj,class robj>
|
||||
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
||||
{
|
||||
Lattice<vPredicate> ret(rhs.Grid());
|
||||
auto rhs_v = rhs.View();
|
||||
auto ret_v = ret.View();
|
||||
autoView( rhs_v, rhs, CpuRead);
|
||||
autoView( ret_v, ret, CpuWrite);
|
||||
thread_for( ss, rhs_v.size(), {
|
||||
ret_v[ss]=op(lhs,rhs_v[ss]);
|
||||
});
|
||||
|
@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
||||
GridBase *grid = l.Grid();
|
||||
int Nsimd = grid->iSites();
|
||||
|
||||
auto l_v = l.View();
|
||||
autoView(l_v, l, CpuWrite);
|
||||
thread_for( o, grid->oSites(), {
|
||||
vector_type vI;
|
||||
Coordinate gcoor;
|
||||
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
||||
});
|
||||
};
|
||||
|
||||
// LatticeCoordinate();
|
||||
// FIXME for debug; deprecate this; made obscelete by
|
||||
template<class vobj> void lex_sites(Lattice<vobj> &l){
|
||||
auto l_v = l.View();
|
||||
Real *v_ptr = (Real *)&l_v[0];
|
||||
size_t o_len = l.Grid()->oSites();
|
||||
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
||||
size_t vec_len = vRealF::Nsimd();
|
||||
|
||||
for(int i=0;i<o_len;i++){
|
||||
for(int j=0;j<v_len;j++){
|
||||
for(int vv=0;vv<vec_len;vv+=2){
|
||||
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
|
||||
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
|
||||
}
|
||||
}}
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -43,8 +43,8 @@ template<class vobj>
|
||||
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
||||
{
|
||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||
auto rhs_v = rhs.View();
|
||||
auto ret_v = ret.View();
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
||||
});
|
||||
@ -56,9 +56,9 @@ template<class vobj>
|
||||
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
||||
{
|
||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||
auto lhs_v = lhs.View();
|
||||
auto rhs_v = rhs.View();
|
||||
auto ret_v = ret.View();
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
||||
});
|
||||
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
|
||||
typedef decltype(coalescedRead(ll())) sll;
|
||||
typedef decltype(coalescedRead(rr())) srr;
|
||||
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
||||
auto lhs_v = lhs.View();
|
||||
auto rhs_v = rhs.View();
|
||||
auto ret_v = ret.View();
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
accelerator_for(ss,rhs_v.size(),1,{
|
||||
// FIXME had issues with scalar version of outer
|
||||
// Use vector [] operator and don't read coalesce this loop
|
||||
|
@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
||||
int block =FullGrid->_slice_block [Orthog];
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
auto X_v = X.View();
|
||||
auto Y_v = Y.View();
|
||||
auto R_v = R.View();
|
||||
autoView( X_v , X, CpuRead);
|
||||
autoView( Y_v , Y, CpuRead);
|
||||
autoView( R_v , R, CpuWrite);
|
||||
thread_region
|
||||
{
|
||||
std::vector<vobj> s_x(Nblock);
|
||||
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
|
||||
auto X_v = X.View();
|
||||
auto R_v = R.View();
|
||||
autoView( X_v , X, CpuRead);
|
||||
autoView( R_v , R, CpuWrite);
|
||||
|
||||
thread_region
|
||||
{
|
||||
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
|
||||
typedef typename vobj::vector_typeD vector_typeD;
|
||||
auto lhs_v = lhs.View();
|
||||
auto rhs_v = rhs.View();
|
||||
autoView( lhs_v , lhs, CpuRead);
|
||||
autoView( rhs_v , rhs, CpuRead);
|
||||
thread_region {
|
||||
std::vector<vobj> Left(Nblock);
|
||||
std::vector<vobj> Right(Nblock);
|
||||
|
@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
|
||||
{
|
||||
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
||||
ret.Checkerboard()=lhs.Checkerboard();
|
||||
auto ret_v = ret.View();
|
||||
auto lhs_v = lhs.View();
|
||||
thread_for( ss, lhs_v.size(), {
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
accelerator_for( ss, lhs_v.size(), 1, {
|
||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
||||
});
|
||||
return ret;
|
||||
@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
||||
{
|
||||
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
||||
ret.Checkerboard()=lhs.Checkerboard();
|
||||
auto ret_v = ret.View();
|
||||
auto lhs_v = lhs.View();
|
||||
thread_for( ss, lhs_v.size(), {
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
accelerator_for( ss, lhs_v.size(), 1, {
|
||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
||||
});
|
||||
return ret;
|
||||
@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
||||
template<int Index,class vobj>
|
||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
||||
{
|
||||
auto rhs_v = rhs.View();
|
||||
auto lhs_v = lhs.View();
|
||||
thread_for( ss, lhs_v.size(), {
|
||||
autoView( rhs_v, rhs, AcceleratorRead);
|
||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||
accelerator_for( ss, lhs_v.size(), 1, {
|
||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
||||
});
|
||||
}
|
||||
template<int Index,class vobj>
|
||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
||||
{
|
||||
auto rhs_v = rhs.View();
|
||||
auto lhs_v = lhs.View();
|
||||
thread_for( ss, lhs_v.size(), {
|
||||
autoView( rhs_v, rhs, AcceleratorRead);
|
||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||
accelerator_for( ss, lhs_v.size(), 1, {
|
||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
||||
});
|
||||
}
|
||||
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
|
||||
|
||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
auto l_v = l.View();
|
||||
autoView( l_v , l, CpuWrite);
|
||||
if ( rank == grid->ThisRank() ) {
|
||||
extract(l_v[odx],buf);
|
||||
buf[idx] = s;
|
||||
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
auto l_v = l.View();
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[odx],buf);
|
||||
|
||||
s = buf[idx];
|
||||
@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||
return;
|
||||
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Peek a scalar object from the SIMD array
|
||||
//////////////////////////////////////////////////////////
|
||||
// Must be CPU read view
|
||||
template<class vobj,class sobj>
|
||||
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
||||
|
||||
GridBase *grid = l.Grid();
|
||||
|
||||
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
||||
{
|
||||
GridBase *grid = l.getGrid();
|
||||
assert(l.mode==CpuRead);
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
|
||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
||||
idx= grid->iIndex(site);
|
||||
odx= grid->oIndex(site);
|
||||
|
||||
auto l_v = l.View();
|
||||
scalar_type * vp = (scalar_type *)&l_v[odx];
|
||||
scalar_type * vp = (scalar_type *)&l[odx];
|
||||
scalar_type * pt = (scalar_type *)&s;
|
||||
|
||||
for(int w=0;w<words;w++){
|
||||
@ -183,18 +182,27 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
||||
|
||||
return;
|
||||
};
|
||||
|
||||
template<class vobj,class sobj>
|
||||
inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
|
||||
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
|
||||
{
|
||||
autoView(lv,l,CpuRead);
|
||||
peekLocalSite(s,lv,site);
|
||||
return;
|
||||
};
|
||||
|
||||
GridBase *grid=l.Grid();
|
||||
// Must be CPU write view
|
||||
template<class vobj,class sobj>
|
||||
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
||||
{
|
||||
GridBase *grid=l.getGrid();
|
||||
assert(l.mode==CpuWrite);
|
||||
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
|
||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
@ -202,13 +210,19 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
|
||||
idx= grid->iIndex(site);
|
||||
odx= grid->oIndex(site);
|
||||
|
||||
auto l_v = l.View();
|
||||
scalar_type * vp = (scalar_type *)&l_v[odx];
|
||||
scalar_type * vp = (scalar_type *)&l[odx];
|
||||
scalar_type * pt = (scalar_type *)&s;
|
||||
for(int w=0;w<words;w++){
|
||||
vp[idx+w*Nsimd] = pt[w];
|
||||
}
|
||||
return;
|
||||
};
|
||||
|
||||
template<class vobj,class sobj>
|
||||
inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
|
||||
{
|
||||
autoView(lv,l,CpuWrite);
|
||||
pokeLocalSite(s,lv,site);
|
||||
return;
|
||||
};
|
||||
|
||||
|
79
Grid/lattice/Lattice_real_imag.h
Normal file
79
Grid/lattice/Lattice_real_imag.h
Normal file
@ -0,0 +1,79 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/lattice/Lattice_reality.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: neo <cossu@post.kek.jp>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_LATTICE_REAL_IMAG_H
|
||||
#define GRID_LATTICE_REAL_IMAG_H
|
||||
|
||||
|
||||
// FIXME .. this is the sector of the code
|
||||
// I am most worried about the directions
|
||||
// The choice of burying complex in the SIMD
|
||||
// is making the use of "real" and "imag" very cumbersome
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
|
||||
Lattice<vobj> ret(lhs.Grid());
|
||||
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
|
||||
ret.Checkerboard()=lhs.Checkerboard();
|
||||
accelerator_for( ss, lhs_v.size(), 1, {
|
||||
ret_v[ss] =real(lhs_v[ss]);
|
||||
});
|
||||
return ret;
|
||||
};
|
||||
template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
|
||||
Lattice<vobj> ret(lhs.Grid());
|
||||
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
|
||||
ret.Checkerboard()=lhs.Checkerboard();
|
||||
accelerator_for( ss, lhs_v.size(), 1, {
|
||||
ret_v[ss] =imag(lhs_v[ss]);
|
||||
});
|
||||
return ret;
|
||||
};
|
||||
|
||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||
auto real(const Expression &expr) -> decltype(real(closure(expr)))
|
||||
{
|
||||
return real(closure(expr));
|
||||
}
|
||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||
auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
|
||||
{
|
||||
return imag(closure(expr));
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
@ -40,9 +40,11 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
||||
Lattice<vobj> ret(lhs.Grid());
|
||||
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
|
||||
ret.Checkerboard()=lhs.Checkerboard();
|
||||
auto lhs_v = lhs.View();
|
||||
auto ret_v = ret.View();
|
||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
||||
});
|
||||
@ -51,9 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
||||
|
||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||
Lattice<vobj> ret(lhs.Grid());
|
||||
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
|
||||
ret.Checkerboard() = lhs.Checkerboard();
|
||||
auto lhs_v = lhs.View();
|
||||
auto ret_v = ret.View();
|
||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
||||
});
|
||||
|
@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
||||
#endif
|
||||
|
||||
@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
|
||||
const int Nsimd = vobj::Nsimd();
|
||||
// const int Nsimd = vobj::Nsimd();
|
||||
const int nthread = GridThread::GetThreads();
|
||||
|
||||
Vector<sobj> sumarray(nthread);
|
||||
for(int i=0;i<nthread;i++){
|
||||
sumarray[i]=Zero();
|
||||
}
|
||||
|
||||
thread_for(thr,nthread, {
|
||||
int nwork, mywork, myoff;
|
||||
nwork = osites;
|
||||
GridThread::GetWork(nwork,thr,mywork,myoff);
|
||||
vobj vvsum=Zero();
|
||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||
vvsum = vvsum + arg[ss];
|
||||
}
|
||||
sumarray[thr]=Reduce(vvsum);
|
||||
});
|
||||
|
||||
sobj ssum=Zero(); // sum across threads
|
||||
for(int i=0;i<nthread;i++){
|
||||
ssum = ssum+sumarray[i];
|
||||
}
|
||||
return ssum;
|
||||
}
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_objectD sobj;
|
||||
|
||||
const int nthread = GridThread::GetThreads();
|
||||
|
||||
Vector<sobj> sumarray(nthread);
|
||||
@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
||||
ssum = ssum+sumarray[i];
|
||||
}
|
||||
|
||||
return ssum;
|
||||
typedef typename vobj::scalar_object ssobj;
|
||||
ssobj ret = ssum;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||
{
|
||||
#ifdef GRID_NVCC
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
return sum_gpu(arg,osites);
|
||||
#else
|
||||
return sum_cpu(arg,osites);
|
||||
#endif
|
||||
}
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
return sumD_gpu(arg,osites);
|
||||
#else
|
||||
return sumD_cpu(arg,osites);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||
{
|
||||
auto arg_v = arg.View();
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
autoView( arg_v, arg, AcceleratorRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum(&arg_v[0],osites);
|
||||
auto ssum= sum_gpu(&arg_v[0],osites);
|
||||
#else
|
||||
autoView(arg_v, arg, CpuRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||
#endif
|
||||
arg.Grid()->GlobalSum(ssum);
|
||||
return ssum;
|
||||
}
|
||||
@ -102,42 +151,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
||||
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
// Might make all code paths go this way.
|
||||
auto left_v = left.AcceleratorView(ViewRead);
|
||||
auto right_v=right.AcceleratorView(ViewRead);
|
||||
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites = grid->oSites();
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
// Might make all code paths go this way.
|
||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
{
|
||||
autoView( left_v , left, AcceleratorRead);
|
||||
autoView( right_v,right, AcceleratorRead);
|
||||
|
||||
// GPU - SIMT lane compliance...
|
||||
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto x_l = left_v(ss);
|
||||
auto y_l = right_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||
})
|
||||
|
||||
// This is in single precision and fails some tests
|
||||
// Need a sumD that sums in double
|
||||
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
||||
#else
|
||||
// CPU
|
||||
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
accelerator_for( ss, sites, 1,{
|
||||
auto x_l = left_v[ss];
|
||||
auto y_l = right_v[ss];
|
||||
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
||||
})
|
||||
nrm = TensorRemove(sum(inner_tmp_v,sites));
|
||||
#endif
|
||||
});
|
||||
}
|
||||
|
||||
// This is in single precision and fails some tests
|
||||
auto anrm = sum(inner_tmp_v,sites);
|
||||
nrm = anrm;
|
||||
return nrm;
|
||||
}
|
||||
|
||||
@ -175,40 +211,24 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
|
||||
GridBase *grid = x.Grid();
|
||||
|
||||
auto x_v=x.AcceleratorView(ViewRead);
|
||||
auto y_v=y.AcceleratorView(ViewRead);
|
||||
auto z_v=z.AcceleratorView(ViewWrite);
|
||||
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites = grid->oSites();
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
// GPU
|
||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
autoView( x_v, x, AcceleratorRead);
|
||||
autoView( y_v, y, AcceleratorRead);
|
||||
autoView( z_v, z, AcceleratorWrite);
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
});
|
||||
|
||||
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
|
||||
#else
|
||||
// CPU
|
||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||
accelerator_for( ss, sites, 1,{
|
||||
auto tmp = a*x_v[ss]+b*y_v[ss];
|
||||
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
||||
z_v[ss]=tmp;
|
||||
});
|
||||
// Already promoted to double
|
||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||
#endif
|
||||
grid->GlobalSum(nrm);
|
||||
return nrm;
|
||||
}
|
||||
@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
||||
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
auto left_v=left.AcceleratorView(ViewRead);
|
||||
auto right_v=right.AcceleratorView(ViewRead);
|
||||
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites = grid->oSites();
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
// GPU
|
||||
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
||||
typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
|
||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
Vector<norm_t> norm_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
auto norm_tmp_v = &norm_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto left_tmp = left_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
|
||||
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
|
||||
});
|
||||
|
||||
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
||||
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
|
||||
#else
|
||||
// CPU
|
||||
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
|
||||
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
Vector<norm_t> norm_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
auto norm_tmp_v = &norm_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto left_tmp = left_v(ss);
|
||||
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
|
||||
{
|
||||
autoView(left_v,left, AcceleratorRead);
|
||||
autoView(right_v,right,AcceleratorRead);
|
||||
accelerator_for( ss, sites, 1,{
|
||||
auto left_tmp = left_v[ss];
|
||||
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
|
||||
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
|
||||
});
|
||||
// Already promoted to double
|
||||
}
|
||||
|
||||
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
|
||||
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
|
||||
#endif
|
||||
|
||||
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
||||
ip = tmp[0];
|
||||
nrm = real(tmp[1]);
|
||||
@ -335,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
|
||||
// sum over reduced dimension planes, breaking out orthog dir
|
||||
// Parallel over orthog direction
|
||||
auto Data_v=Data.View();
|
||||
autoView( Data_v, Data, CpuRead);
|
||||
thread_for( r,rd, {
|
||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||
for(int n=0;n<e1;n++){
|
||||
@ -413,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
||||
int e2= grid->_slice_block [orthogdim];
|
||||
int stride=grid->_slice_stride[orthogdim];
|
||||
|
||||
auto lhv=lhs.View();
|
||||
auto rhv=rhs.View();
|
||||
autoView( lhv, lhs, CpuRead);
|
||||
autoView( rhv, rhs, CpuRead);
|
||||
thread_for( r,rd,{
|
||||
|
||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||
@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
||||
|
||||
tensor_reduced at; at=av;
|
||||
|
||||
auto Rv=R.View();
|
||||
auto Xv=X.View();
|
||||
auto Yv=Y.View();
|
||||
thread_for_collapse(2, n, e1, {
|
||||
for(int b=0;b<e2;b++){
|
||||
autoView( Rv, R, CpuWrite);
|
||||
autoView( Xv, X, CpuRead);
|
||||
autoView( Yv, Y, CpuRead);
|
||||
thread_for2d( n, e1, b,e2, {
|
||||
int ss= so+n*stride+b;
|
||||
Rv[ss] = at*Xv[ss]+Yv[ss];
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
@ -581,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
|
||||
auto X_v=X.View();
|
||||
auto Y_v=Y.View();
|
||||
auto R_v=R.View();
|
||||
autoView( X_v, X, CpuRead);
|
||||
autoView( Y_v, Y, CpuRead);
|
||||
autoView( R_v, R, CpuWrite);
|
||||
thread_region
|
||||
{
|
||||
Vector<vobj> s_x(Nblock);
|
||||
@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
||||
// int nl=1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
// thread_for2d_in_region
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
int stride=FullGrid->_slice_stride[Orthog];
|
||||
int block =FullGrid->_slice_block [Orthog];
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
auto R_v = R.View();
|
||||
auto X_v = X.View();
|
||||
autoView( R_v, R, CpuWrite);
|
||||
autoView( X_v, X, CpuRead);
|
||||
thread_region
|
||||
{
|
||||
std::vector<vobj> s_x(Nblock);
|
||||
@ -692,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
||||
|
||||
typedef typename vobj::vector_typeD vector_typeD;
|
||||
|
||||
auto lhs_v=lhs.View();
|
||||
auto rhs_v=rhs.View();
|
||||
autoView( lhs_v, lhs, CpuRead);
|
||||
autoView( rhs_v, rhs, CpuRead);
|
||||
thread_region
|
||||
{
|
||||
std::vector<vobj> Left(Nblock);
|
||||
|
@ -1,7 +1,14 @@
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#ifdef GRID_HIP
|
||||
extern hipDeviceProp_t *gpu_props;
|
||||
#define WARP_SIZE 64
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
extern cudaDeviceProp *gpu_props;
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
|
||||
__device__ unsigned int retirementCount = 0;
|
||||
|
||||
template <class Iterator>
|
||||
@ -19,7 +26,12 @@ template <class Iterator>
|
||||
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
||||
|
||||
int device;
|
||||
#ifdef GRID_CUDA
|
||||
cudaGetDevice(&device);
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
hipGetDevice(&device);
|
||||
#endif
|
||||
|
||||
Iterator warpSize = gpu_props[device].warpSize;
|
||||
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
|
||||
@ -53,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
||||
|
||||
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
||||
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
||||
__syncwarp();
|
||||
acceleratorSynchronise();
|
||||
|
||||
const Iterator VEC = WARP_SIZE;
|
||||
const Iterator vid = tid & (VEC-1);
|
||||
@ -67,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
||||
beta += temp;
|
||||
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
||||
}
|
||||
__syncwarp();
|
||||
acceleratorSynchronise();
|
||||
}
|
||||
__syncthreads();
|
||||
acceleratorSynchroniseAll();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
beta = Zero();
|
||||
@ -79,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
||||
}
|
||||
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
||||
}
|
||||
__syncthreads();
|
||||
acceleratorSynchroniseAll();
|
||||
}
|
||||
|
||||
|
||||
@ -147,7 +159,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
||||
sobj *smem = (sobj *)shmem_pointer;
|
||||
|
||||
// wait until all outstanding memory instructions in this thread are finished
|
||||
__threadfence();
|
||||
acceleratorFence();
|
||||
|
||||
if (tid==0) {
|
||||
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
|
||||
@ -156,7 +168,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
||||
}
|
||||
|
||||
// each thread must read the correct value of amLast
|
||||
__syncthreads();
|
||||
acceleratorSynchroniseAll();
|
||||
|
||||
if (amLast) {
|
||||
// reduce buffer[0], ..., buffer[gridDim.x-1]
|
||||
@ -199,13 +211,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||
sobj *buffer_v = &buffer[0];
|
||||
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
cudaError err = cudaGetLastError();
|
||||
if ( cudaSuccess != err ) {
|
||||
printf("Cuda error %s\n",cudaGetErrorString( err ));
|
||||
exit(0);
|
||||
}
|
||||
accelerator_barrier();
|
||||
auto result = buffer_v[0];
|
||||
return result;
|
||||
}
|
||||
|
@ -375,7 +375,7 @@ public:
|
||||
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
||||
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
||||
|
||||
auto l_v = l.View();
|
||||
autoView(l_v, l, CpuWrite);
|
||||
thread_for( ss, osites, {
|
||||
ExtractBuffer<scalar_object> buf(Nsimd);
|
||||
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
||||
@ -462,7 +462,7 @@ public:
|
||||
|
||||
{
|
||||
// Obtain one reseeded generator per thread
|
||||
int Nthread = GridThread::GetThreads();
|
||||
int Nthread = 32; // Hardwire a good level or parallelism
|
||||
std::vector<RngEngine> seeders(Nthread);
|
||||
for(int t=0;t<Nthread;t++){
|
||||
seeders[t] = Reseed(master_engine);
|
||||
|
@ -42,8 +42,8 @@ template<class vobj>
|
||||
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
||||
{
|
||||
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
||||
auto ret_v = ret.View();
|
||||
auto lhs_v = lhs.View();
|
||||
autoView(ret_v , ret, AcceleratorWrite);
|
||||
autoView(lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
||||
});
|
||||
@ -58,8 +58,8 @@ template<int Index,class vobj>
|
||||
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
||||
{
|
||||
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||
auto ret_v = ret.View();
|
||||
auto lhs_v = lhs.View();
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
||||
});
|
||||
|
@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// remove and insert a half checkerboard
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
||||
{
|
||||
half.Checkerboard() = cb;
|
||||
|
||||
auto half_v = half.View();
|
||||
auto full_v = full.View();
|
||||
autoView( half_v, half, CpuWrite);
|
||||
autoView( full_v, full, CpuRead);
|
||||
thread_for(ss, full.Grid()->oSites(),{
|
||||
int cbos;
|
||||
Coordinate coor;
|
||||
@ -64,11 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
||||
{
|
||||
int cb = half.Checkerboard();
|
||||
auto half_v = half.View();
|
||||
auto full_v = full.View();
|
||||
autoView( half_v , half, CpuRead);
|
||||
autoView( full_v , full, CpuWrite);
|
||||
thread_for(ss,full.Grid()->oSites(),{
|
||||
|
||||
Coordinate coor;
|
||||
@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
|
||||
out = in;
|
||||
}
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef GRID_SIMT
|
||||
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
||||
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
|
||||
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
|
||||
}
|
||||
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
|
||||
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
|
||||
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
|
||||
}
|
||||
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
|
||||
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
|
||||
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -151,9 +152,8 @@ accelerator_inline void convertType(T & out, const T & in) {
|
||||
|
||||
template<typename T1,typename T2>
|
||||
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
||||
auto out_v = out.AcceleratorView(ViewWrite);
|
||||
auto in_v = in.AcceleratorView(ViewRead);
|
||||
|
||||
autoView( out_v , out,AcceleratorWrite);
|
||||
autoView( in_v , in ,AcceleratorRead);
|
||||
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
|
||||
convertType(out_v[ss],in_v(ss));
|
||||
});
|
||||
@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj>
|
||||
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
|
||||
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
|
||||
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
|
||||
{
|
||||
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
|
||||
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
|
||||
Lattice<iScalar<t_inner>> ret(lhs.Grid());
|
||||
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||
|
||||
{
|
||||
autoView(ret_v, ret,AcceleratorWrite);
|
||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
|
||||
});
|
||||
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -194,9 +195,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
Lattice<iScalar<CComplex>> ip(coarse);
|
||||
Lattice<vobj> fineDataRed = fineData;
|
||||
|
||||
// auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
|
||||
auto ip_ = ip.AcceleratorView(ViewReadWrite);
|
||||
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||
autoView( ip_ , ip, AcceleratorWrite);
|
||||
for(int v=0;v<nbasis;v++) {
|
||||
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
|
||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||
@ -210,68 +210,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj,class CComplex,int nbasis>
|
||||
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
const Lattice<vobj> &fineData,
|
||||
const std::vector<Lattice<vobj> > &Basis)
|
||||
{
|
||||
typedef iVector<CComplex,nbasis > coarseSiteData;
|
||||
coarseSiteData elide;
|
||||
typedef decltype(coalescedRead(elide)) ScalarComplex;
|
||||
GridBase * fine = fineData.Grid();
|
||||
GridBase * coarse= coarseData.Grid();
|
||||
int _ndimension = coarse->_ndimension;
|
||||
|
||||
// checks
|
||||
assert( nbasis == Basis.size() );
|
||||
subdivides(coarse,fine);
|
||||
for(int i=0;i<nbasis;i++){
|
||||
conformable(Basis[i],fineData);
|
||||
}
|
||||
|
||||
Coordinate block_r (_ndimension);
|
||||
|
||||
for(int d=0 ; d<_ndimension;d++){
|
||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
|
||||
}
|
||||
int blockVol = fine->oSites()/coarse->oSites();
|
||||
|
||||
coarseData=Zero();
|
||||
|
||||
auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.View();
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
|
||||
// Otherwise do fine inner product per site, and make the update atomic
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
|
||||
|
||||
auto sc=sci/nbasis;
|
||||
auto i=sci%nbasis;
|
||||
auto Basis_ = Basis[i].View();
|
||||
|
||||
Coordinate coor_c(_ndimension);
|
||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
||||
|
||||
int sf;
|
||||
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
|
||||
|
||||
for(int sb=0;sb<blockVol;sb++){
|
||||
|
||||
Coordinate coor_b(_ndimension);
|
||||
Coordinate coor_f(_ndimension);
|
||||
|
||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
|
||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
|
||||
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
||||
|
||||
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
|
||||
}
|
||||
coalescedWrite(coarseData_[sc](i),reduce);
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
template<class vobj,class vobj2,class CComplex>
|
||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||
@ -298,10 +236,12 @@ template<class vobj,class vobj2,class CComplex>
|
||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||
}
|
||||
|
||||
auto fineZ_ = fineZ.AcceleratorView(ViewWrite);
|
||||
auto fineX_ = fineX.AcceleratorView(ViewRead);
|
||||
auto fineY_ = fineY.AcceleratorView(ViewRead);
|
||||
auto coarseA_= coarseA.AcceleratorView(ViewRead);
|
||||
autoView( fineZ_ , fineZ, AcceleratorWrite);
|
||||
autoView( fineX_ , fineX, AcceleratorRead);
|
||||
autoView( fineY_ , fineY, AcceleratorRead);
|
||||
autoView( coarseA_, coarseA, AcceleratorRead);
|
||||
Coordinate fine_rdimensions = fine->_rdimensions;
|
||||
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
||||
|
||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||
|
||||
@ -309,12 +249,12 @@ template<class vobj,class vobj2,class CComplex>
|
||||
Coordinate coor_c(_ndimension);
|
||||
Coordinate coor_f(_ndimension);
|
||||
|
||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
|
||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
|
||||
|
||||
// z = A x + y
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef GRID_SIMT
|
||||
typename vobj2::tensor_reduced::scalar_object cA;
|
||||
typename vobj::scalar_object cAx;
|
||||
#else
|
||||
@ -344,15 +284,16 @@ template<class vobj,class CComplex>
|
||||
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
||||
Lattice<dotp> coarse_inner(coarse);
|
||||
|
||||
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
|
||||
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
|
||||
|
||||
// Precision promotion
|
||||
fine_inner = localInnerProductD(fineX,fineY);
|
||||
fine_inner = localInnerProductD<vobj>(fineX,fineY);
|
||||
blockSum(coarse_inner,fine_inner);
|
||||
{
|
||||
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
|
||||
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
|
||||
accelerator_for(ss, coarse->oSites(), 1, {
|
||||
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -370,15 +311,16 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
||||
Lattice<dotp> coarse_inner(coarse);
|
||||
|
||||
// Precision promotion?
|
||||
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
|
||||
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
|
||||
|
||||
fine_inner = localInnerProduct(fineX,fineY);
|
||||
blockSum(coarse_inner,fine_inner);
|
||||
{
|
||||
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
|
||||
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
|
||||
accelerator_for(ss, coarse->oSites(), 1, {
|
||||
CoarseInner_[ss] = coarse_inner_[ss];
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj,class CComplex>
|
||||
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
||||
@ -408,14 +350,19 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
||||
}
|
||||
int blockVol = fine->oSites()/coarse->oSites();
|
||||
|
||||
auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
|
||||
auto fineData_ = fineData.AcceleratorView(ViewRead);
|
||||
// Turn this around to loop threaded over sc and interior loop
|
||||
// over sf would thread better
|
||||
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||
autoView( fineData_ , fineData, AcceleratorRead);
|
||||
|
||||
Coordinate fine_rdimensions = fine->_rdimensions;
|
||||
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
||||
|
||||
accelerator_for(sc,coarse->oSites(),1,{
|
||||
|
||||
// One thread per sub block
|
||||
Coordinate coor_c(_ndimension);
|
||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
|
||||
coarseData_[sc]=Zero();
|
||||
|
||||
for(int sb=0;sb<blockVol;sb++){
|
||||
@ -425,7 +372,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
||||
Coordinate coor_f(_ndimension);
|
||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
||||
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
||||
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
|
||||
|
||||
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
||||
}
|
||||
@ -510,8 +457,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
for(int d=0 ; d<_ndimension;d++){
|
||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||
}
|
||||
auto fineData_ = fineData.View();
|
||||
auto coarseData_ = coarseData.View();
|
||||
autoView( fineData_ , fineData, AcceleratorWrite);
|
||||
autoView( coarseData_ , coarseData, AcceleratorRead);
|
||||
|
||||
// Loop with a cache friendly loop ordering
|
||||
accelerator_for(sf,fine->oSites(),1,{
|
||||
@ -524,7 +471,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||
|
||||
for(int i=0;i<nbasis;i++) {
|
||||
auto basis_ = Basis[i].View();
|
||||
/* auto basis_ = Basis[i], );*/
|
||||
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
||||
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
||||
}
|
||||
@ -543,7 +490,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
fineData=Zero();
|
||||
for(int i=0;i<nbasis;i++) {
|
||||
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
||||
auto ip_ = ip.AcceleratorView(ViewRead);
|
||||
|
||||
//Lattice<CComplex> cip(coarse);
|
||||
//autoView( cip_ , cip, AcceleratorWrite);
|
||||
//autoView( ip_ , ip, AcceleratorRead);
|
||||
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
||||
// coalescedWrite(cip_[sc], ip_(sc)());
|
||||
// });
|
||||
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
|
||||
blockZAXPY(fineData,ip,Basis[i],fineData);
|
||||
}
|
||||
}
|
||||
@ -571,15 +525,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
|
||||
assert(ig->lSites() == og->lSites());
|
||||
}
|
||||
|
||||
autoView(in_v,in,CpuRead);
|
||||
autoView(out_v,out,CpuWrite);
|
||||
thread_for(idx, ig->lSites(),{
|
||||
sobj s;
|
||||
ssobj ss;
|
||||
|
||||
Coordinate lcoor(ni);
|
||||
ig->LocalIndexToLocalCoor(idx,lcoor);
|
||||
peekLocalSite(s,in,lcoor);
|
||||
peekLocalSite(s,in_v,lcoor);
|
||||
ss=s;
|
||||
pokeLocalSite(ss,out,lcoor);
|
||||
pokeLocalSite(ss,out_v,lcoor);
|
||||
});
|
||||
}
|
||||
|
||||
@ -614,8 +570,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
Coordinate rdt = Tg->_rdimensions;
|
||||
Coordinate ist = Tg->_istride;
|
||||
Coordinate ost = Tg->_ostride;
|
||||
auto t_v = To.AcceleratorView(ViewWrite);
|
||||
auto f_v = From.AcceleratorView(ViewRead);
|
||||
|
||||
autoView( t_v , To, AcceleratorWrite);
|
||||
autoView( f_v , From, AcceleratorRead);
|
||||
accelerator_for(idx,Fg->lSites(),1,{
|
||||
sobj s;
|
||||
Coordinate Fcoor(nd);
|
||||
@ -638,8 +595,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
for(int w=0;w<words;w++){
|
||||
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
|
||||
}
|
||||
// peekLocalSite(s,From,Fcoor);
|
||||
// pokeLocalSite(s,To ,Tcoor);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -670,6 +625,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
||||
}
|
||||
|
||||
// the above should guarantee that the operations are local
|
||||
autoView(lowDimv,lowDim,CpuRead);
|
||||
autoView(higherDimv,higherDim,CpuWrite);
|
||||
thread_for(idx,lg->lSites(),{
|
||||
sobj s;
|
||||
Coordinate lcoor(nl);
|
||||
@ -682,8 +639,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
||||
hcoor[d]=lcoor[ddl++];
|
||||
}
|
||||
}
|
||||
peekLocalSite(s,lowDim,lcoor);
|
||||
pokeLocalSite(s,higherDim,hcoor);
|
||||
peekLocalSite(s,lowDimv,lcoor);
|
||||
pokeLocalSite(s,higherDimv,hcoor);
|
||||
});
|
||||
}
|
||||
|
||||
@ -711,6 +668,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
||||
}
|
||||
}
|
||||
// the above should guarantee that the operations are local
|
||||
autoView(lowDimv,lowDim,CpuWrite);
|
||||
autoView(higherDimv,higherDim,CpuRead);
|
||||
thread_for(idx,lg->lSites(),{
|
||||
sobj s;
|
||||
Coordinate lcoor(nl);
|
||||
@ -723,8 +682,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
||||
hcoor[d]=lcoor[ddl++];
|
||||
}
|
||||
}
|
||||
peekLocalSite(s,higherDim,hcoor);
|
||||
pokeLocalSite(s,lowDim,lcoor);
|
||||
peekLocalSite(s,higherDimv,hcoor);
|
||||
pokeLocalSite(s,lowDimv,lcoor);
|
||||
});
|
||||
|
||||
}
|
||||
@ -752,6 +711,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
||||
}
|
||||
|
||||
// the above should guarantee that the operations are local
|
||||
autoView(lowDimv,lowDim,CpuRead);
|
||||
autoView(higherDimv,higherDim,CpuWrite);
|
||||
thread_for(idx,lg->lSites(),{
|
||||
sobj s;
|
||||
Coordinate lcoor(nl);
|
||||
@ -760,8 +721,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
||||
if( lcoor[orthog] == slice_lo ) {
|
||||
hcoor=lcoor;
|
||||
hcoor[orthog] = slice_hi;
|
||||
peekLocalSite(s,lowDim,lcoor);
|
||||
pokeLocalSite(s,higherDim,hcoor);
|
||||
peekLocalSite(s,lowDimv,lcoor);
|
||||
pokeLocalSite(s,higherDimv,hcoor);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -789,6 +750,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
|
||||
}
|
||||
|
||||
// the above should guarantee that the operations are local
|
||||
autoView(lowDimv,lowDim,CpuWrite);
|
||||
autoView(higherDimv,higherDim,CpuRead);
|
||||
thread_for(idx,lg->lSites(),{
|
||||
sobj s;
|
||||
Coordinate lcoor(nl);
|
||||
@ -797,8 +760,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
|
||||
if( lcoor[orthog] == slice_lo ) {
|
||||
hcoor=lcoor;
|
||||
hcoor[orthog] = slice_hi;
|
||||
peekLocalSite(s,higherDim,hcoor);
|
||||
pokeLocalSite(s,lowDim,lcoor);
|
||||
peekLocalSite(s,higherDimv,hcoor);
|
||||
pokeLocalSite(s,lowDimv,lcoor);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -862,7 +825,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
||||
}
|
||||
|
||||
//loop over outer index
|
||||
auto in_v = in.View();
|
||||
autoView( in_v , in, CpuRead);
|
||||
thread_for(in_oidx,in_grid->oSites(),{
|
||||
//Assemble vector of pointers to output elements
|
||||
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
||||
@ -955,7 +918,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
||||
icoor[lane].resize(ndim);
|
||||
grid->iCoorFromIindex(icoor[lane],lane);
|
||||
}
|
||||
auto out_v = out.View();
|
||||
autoView( out_v , out, CpuWrite);
|
||||
thread_for(oidx, grid->oSites(),{
|
||||
//Assemble vector of pointers to output elements
|
||||
ExtractPointerArray<sobj> ptrs(nsimd);
|
||||
@ -1058,7 +1021,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||
|
||||
auto out_v = out.View();
|
||||
autoView( out_v , out, CpuWrite);
|
||||
thread_for(out_oidx,out_grid->oSites(),{
|
||||
Coordinate out_ocoor(ndim);
|
||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||
|
@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
|
||||
template<class vobj>
|
||||
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
||||
Lattice<vobj> ret(lhs.Grid());
|
||||
auto ret_v = ret.View();
|
||||
auto lhs_v = lhs.View();
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
||||
});
|
||||
@ -58,8 +58,8 @@ template<int Index,class vobj>
|
||||
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
||||
{
|
||||
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||
auto ret_v = ret.View();
|
||||
auto lhs_v = lhs.View();
|
||||
autoView( ret_v, ret, AcceleratorWrite);
|
||||
autoView( lhs_v, lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
||||
});
|
||||
|
@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
||||
Lattice<obj> ret_i(rhs_i.Grid());
|
||||
auto rhs = rhs_i.View();
|
||||
auto ret = ret_i.View();
|
||||
autoView( rhs, rhs_i, AcceleratorRead);
|
||||
autoView( ret, ret_i, AcceleratorWrite);
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
accelerator_for(ss,rhs.size(),1,{
|
||||
ret[ss]=pow(rhs[ss],y);
|
||||
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
||||
}
|
||||
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
||||
Lattice<obj> ret_i(rhs_i.Grid());
|
||||
auto rhs = rhs_i.View();
|
||||
auto ret = ret_i.View();
|
||||
autoView( rhs , rhs_i, AcceleratorRead);
|
||||
autoView( ret , ret_i, AcceleratorWrite);
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
||||
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
||||
|
||||
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
||||
Lattice<obj> ret_i(rhs_i.Grid());
|
||||
auto ret = ret_i.View();
|
||||
auto rhs = rhs_i.View();
|
||||
autoView( ret , ret_i, AcceleratorWrite);
|
||||
autoView( rhs , rhs_i, AcceleratorRead);
|
||||
ret.Checkerboard() = rhs_i.Checkerboard();
|
||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||
coalescedWrite(ret[ss],div(rhs(ss),y));
|
||||
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
||||
|
||||
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
||||
Lattice<obj> ret_i(rhs_i.Grid());
|
||||
auto rhs = rhs_i.View();
|
||||
auto ret = ret_i.View();
|
||||
autoView( rhs , rhs_i, AcceleratorRead);
|
||||
autoView( ret , ret_i, AcceleratorWrite);
|
||||
ret.Checkerboard() = rhs.Checkerboard();
|
||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
||||
|
168
Grid/lattice/Lattice_view.h
Normal file
168
Grid/lattice/Lattice_view.h
Normal file
@ -0,0 +1,168 @@
|
||||
#pragma once
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Base class which can be used by traits to pick up behaviour
|
||||
///////////////////////////////////////////////////////////////////
|
||||
class LatticeBase {};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Conformable checks; same instance of Grid required
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
||||
{
|
||||
assert(lhs == rhs);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Minimal base class containing only data valid to access from accelerator
|
||||
// _odata will be a managed pointer in CUDA
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Force access to lattice through a view object.
|
||||
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
|
||||
// strict since host could could in principle direct access through the lattice object
|
||||
// Need to decide programming model.
|
||||
#define LATTICE_VIEW_STRICT
|
||||
template<class vobj> class LatticeAccelerator : public LatticeBase
|
||||
{
|
||||
protected:
|
||||
//public:
|
||||
GridBase *_grid;
|
||||
int checkerboard;
|
||||
vobj *_odata; // A managed pointer
|
||||
uint64_t _odata_size;
|
||||
ViewAdvise advise;
|
||||
public:
|
||||
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
|
||||
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
|
||||
accelerator_inline int Checkerboard(void) const { return checkerboard; };
|
||||
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
|
||||
accelerator_inline ViewAdvise Advise(void) const { return advise; };
|
||||
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
|
||||
accelerator_inline void Conformable(GridBase * &grid) const
|
||||
{
|
||||
if (grid) conformable(grid, _grid);
|
||||
else grid = _grid;
|
||||
};
|
||||
// Host only
|
||||
GridBase * getGrid(void) const { return _grid; };
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// A View class which provides accessor to the data.
|
||||
// This will be safe to call from accelerator_for and is trivially copy constructible
|
||||
// The copy constructor for this will need to be used by device lambda functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj>
|
||||
class LatticeView : public LatticeAccelerator<vobj>
|
||||
{
|
||||
public:
|
||||
// Rvalue
|
||||
ViewMode mode;
|
||||
void * cpu_ptr;
|
||||
#ifdef GRID_SIMT
|
||||
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
|
||||
return coalescedRead(this->_odata[i]);
|
||||
}
|
||||
#else
|
||||
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
||||
#endif
|
||||
|
||||
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
||||
|
||||
accelerator_inline uint64_t begin(void) const { return 0;};
|
||||
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
||||
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
||||
|
||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
|
||||
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
|
||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
|
||||
{
|
||||
this->ViewOpen(mode);
|
||||
}
|
||||
|
||||
// Host functions
|
||||
void ViewOpen(ViewMode mode)
|
||||
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
|
||||
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
|
||||
this->cpu_ptr = (void *)this->_odata;
|
||||
this->mode = mode;
|
||||
this->_odata =(vobj *)
|
||||
MemoryManager::ViewOpen(this->cpu_ptr,
|
||||
this->_odata_size*sizeof(vobj),
|
||||
mode,
|
||||
this->advise);
|
||||
}
|
||||
void ViewClose(void)
|
||||
{ // Inform the manager
|
||||
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
|
||||
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
|
||||
}
|
||||
|
||||
};
|
||||
// Little autoscope assister
|
||||
template<class View>
|
||||
class ViewCloser
|
||||
{
|
||||
View v; // Take a copy of view and call view close when I go out of scope automatically
|
||||
public:
|
||||
ViewCloser(View &_v) : v(_v) {};
|
||||
~ViewCloser() { v.ViewClose(); }
|
||||
};
|
||||
|
||||
#define autoView(l_v,l,mode) \
|
||||
auto l_v = l.View(mode); \
|
||||
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Lattice expression types used by ET to assemble the AST
|
||||
//
|
||||
// Need to be able to detect code paths according to the whether a lattice object or not
|
||||
// so introduce some trait type things
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class LatticeExpressionBase {};
|
||||
|
||||
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
||||
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
||||
|
||||
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
||||
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
||||
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
||||
|
||||
template <typename Op, typename _T1>
|
||||
class LatticeUnaryExpression : public LatticeExpressionBase
|
||||
{
|
||||
public:
|
||||
typedef typename ViewMap<_T1>::Type T1;
|
||||
Op op;
|
||||
T1 arg1;
|
||||
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
|
||||
};
|
||||
|
||||
template <typename Op, typename _T1, typename _T2>
|
||||
class LatticeBinaryExpression : public LatticeExpressionBase
|
||||
{
|
||||
public:
|
||||
typedef typename ViewMap<_T1>::Type T1;
|
||||
typedef typename ViewMap<_T2>::Type T2;
|
||||
Op op;
|
||||
T1 arg1;
|
||||
T2 arg2;
|
||||
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
|
||||
};
|
||||
|
||||
template <typename Op, typename _T1, typename _T2, typename _T3>
|
||||
class LatticeTrinaryExpression : public LatticeExpressionBase
|
||||
{
|
||||
public:
|
||||
typedef typename ViewMap<_T1>::Type T1;
|
||||
typedef typename ViewMap<_T2>::Type T2;
|
||||
typedef typename ViewMap<_T3>::Type T3;
|
||||
Op op;
|
||||
T1 arg1;
|
||||
T2 arg2;
|
||||
T3 arg3;
|
||||
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
|
||||
};
|
||||
NAMESPACE_END(Grid);
|
@ -130,6 +130,8 @@ public:
|
||||
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
||||
|
||||
if ( log.active ) {
|
||||
std::ios_base::fmtflags f(stream.flags());
|
||||
|
||||
stream << log.background()<< std::left;
|
||||
if (log.topWidth > 0)
|
||||
{
|
||||
@ -152,6 +154,8 @@ public:
|
||||
<< now << log.background() << " : " ;
|
||||
}
|
||||
stream << log.colour();
|
||||
stream.flags(f);
|
||||
|
||||
return stream;
|
||||
} else {
|
||||
return devnull;
|
||||
|
@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <sys/syscall.h>
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
#ifdef GRID_NVCC
|
||||
#ifdef GRID_CUDA
|
||||
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
||||
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
||||
#else
|
||||
@ -112,7 +112,6 @@ class PerformanceCounter {
|
||||
private:
|
||||
|
||||
typedef struct {
|
||||
public:
|
||||
uint32_t type;
|
||||
uint64_t config;
|
||||
const char *name;
|
||||
|
@ -12773,7 +12773,7 @@ namespace pugi
|
||||
#undef PUGI__THROW_ERROR
|
||||
#undef PUGI__CHECK_ERROR
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
#ifdef GRID_CUDA
|
||||
#pragma pop
|
||||
#endif
|
||||
|
||||
|
@ -115,18 +115,21 @@ public:
|
||||
PokeIndex<LorentzIndex>(Uadj, U, mu);
|
||||
}
|
||||
|
||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||
autoView(Umu_v,Umu,CpuRead);
|
||||
autoView(Uadj_v,Uadj,CpuRead);
|
||||
autoView(Uds_v,Uds,CpuWrite);
|
||||
thread_for( lidx, GaugeGrid->lSites(), {
|
||||
Coordinate lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
|
||||
peekLocalSite(ScalarUmu, Umu, lcoor);
|
||||
peekLocalSite(ScalarUmu, Umu_v, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||
|
||||
peekLocalSite(ScalarUmu, Uadj, lcoor);
|
||||
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
||||
|
||||
pokeLocalSite(ScalarUds, Uds, lcoor);
|
||||
}
|
||||
pokeLocalSite(ScalarUds, Uds_v, lcoor);
|
||||
});
|
||||
}
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
||||
|
@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||
NAMESPACE_CHECK(Wilson5D);
|
||||
|
||||
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
|
||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
|
||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||
NAMESPACE_CHECK(Staggered);
|
||||
@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
|
||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
||||
|
||||
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
|
||||
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
|
||||
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
|
||||
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
||||
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
||||
|
@ -96,11 +96,11 @@ public:
|
||||
int sl = St._simd_layout[direction];
|
||||
Coordinate icoor;
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef GRID_SIMT
|
||||
_Spinor tmp;
|
||||
|
||||
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
||||
int s = SIMTlane(Nsimd);
|
||||
int s = acceleratorSIMTlane(Nsimd);
|
||||
St.iCoorFromIindex(icoor,s);
|
||||
|
||||
int mmu = mu % Nd;
|
||||
@ -233,14 +233,16 @@ public:
|
||||
Uconj = where(coor==neglink,-Uconj,Uconj);
|
||||
}
|
||||
|
||||
auto U_v = U.View();
|
||||
auto Uds_v = Uds.View();
|
||||
auto Uconj_v = Uconj.View();
|
||||
auto Utmp_v= Utmp.View();
|
||||
{
|
||||
autoView( U_v , U, CpuRead);
|
||||
autoView( Uconj_v , Uconj, CpuRead);
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,U_v,{
|
||||
Uds_v[ss](0)(mu) = U_v[ss]();
|
||||
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
||||
});
|
||||
}
|
||||
|
||||
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
||||
Uconj = adj(Cshift(Uconj,mu,-1));
|
||||
@ -250,19 +252,25 @@ public:
|
||||
Utmp = where(coor==0,Uconj,Utmp);
|
||||
}
|
||||
|
||||
{
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,Utmp_v,{
|
||||
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
||||
});
|
||||
|
||||
}
|
||||
Utmp = Uconj;
|
||||
if ( Params.twists[mu] ) {
|
||||
Utmp = where(coor==0,U,Utmp);
|
||||
}
|
||||
|
||||
{
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,Utmp_v,{
|
||||
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -272,11 +280,14 @@ public:
|
||||
GaugeLinkField link(mat.Grid());
|
||||
// use lorentz for flavour as hack.
|
||||
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
||||
auto link_v = link.View();
|
||||
auto tmp_v = tmp.View();
|
||||
|
||||
{
|
||||
autoView( link_v , link, CpuWrite);
|
||||
autoView( tmp_v , tmp, CpuRead);
|
||||
thread_foreach(ss,tmp_v,{
|
||||
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
||||
});
|
||||
}
|
||||
PokeIndex<LorentzIndex>(mat, link, mu);
|
||||
return;
|
||||
}
|
||||
@ -306,9 +317,10 @@ public:
|
||||
|
||||
GaugeLinkField tmp(mat.Grid());
|
||||
tmp = Zero();
|
||||
auto tmp_v = tmp.View();
|
||||
auto Atilde_v = Atilde.View();
|
||||
auto Btilde_v = Btilde.View();
|
||||
{
|
||||
autoView( tmp_v , tmp, CpuWrite);
|
||||
autoView( Atilde_v , Atilde, CpuRead);
|
||||
autoView( Btilde_v , Btilde, CpuRead);
|
||||
thread_for(ss,tmp.Grid()->oSites(),{
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
int sF = s + Ls * ss;
|
||||
@ -316,6 +328,7 @@ public:
|
||||
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
||||
}
|
||||
});
|
||||
}
|
||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||
return;
|
||||
}
|
||||
|
@ -208,7 +208,7 @@ public:
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
|
194
Grid/qcd/action/fermion/NaiveStaggeredFermion.h
Normal file
194
Grid/qcd/action/fermion/NaiveStaggeredFermion.h
Normal file
@ -0,0 +1,194 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
|
||||
#define GRID_QCD_NAIVE_STAG_FERMION_H
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
class NaiveStaggeredFermionStatic {
|
||||
public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static const int npoint = 8;
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef StaggeredKernels<Impl> Kernels;
|
||||
|
||||
FermionField _tmp;
|
||||
FermionField &tmp(void) { return _tmp; }
|
||||
|
||||
////////////////////////////////////////
|
||||
// Performance monitoring
|
||||
////////////////////////////////////////
|
||||
void Report(void);
|
||||
void ZeroCounters(void);
|
||||
double DhopTotalTime;
|
||||
double DhopCalls;
|
||||
double DhopCommTime;
|
||||
double DhopComputeTime;
|
||||
double DhopComputeTime2;
|
||||
double DhopFaceTime;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
GridBase *GaugeGrid(void) { return _grid; }
|
||||
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
|
||||
GridBase *FermionGrid(void) { return _grid; }
|
||||
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// override multiply; cut number routines if pass dagger argument
|
||||
// and also make interface more uniformly consistent
|
||||
//////////////////////////////////////////////////////////////////
|
||||
void M(const FermionField &in, FermionField &out);
|
||||
void Mdag(const FermionField &in, FermionField &out);
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// half checkerboard operations
|
||||
/////////////////////////////////////////////////////////
|
||||
void Meooe(const FermionField &in, FermionField &out);
|
||||
void MeooeDag(const FermionField &in, FermionField &out);
|
||||
void Mooee(const FermionField &in, FermionField &out);
|
||||
void MooeeDag(const FermionField &in, FermionField &out);
|
||||
void MooeeInv(const FermionField &in, FermionField &out);
|
||||
void MooeeInvDag(const FermionField &in, FermionField &out);
|
||||
|
||||
////////////////////////
|
||||
// Derivative interface
|
||||
////////////////////////
|
||||
// Interface calls an internal routine
|
||||
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// non-hermitian hopping term; half cb or both
|
||||
///////////////////////////////////////////////////////////////
|
||||
void Dhop (const FermionField &in, FermionField &out, int dag);
|
||||
void DhopOE(const FermionField &in, FermionField &out, int dag);
|
||||
void DhopEO(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Multigrid assistance; force term uses too
|
||||
///////////////////////////////////////////////////////////////
|
||||
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
|
||||
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
|
||||
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Extra methods added by derived
|
||||
///////////////////////////////////////////////////////////////
|
||||
void DerivInternal(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Grid own interface Constructor
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1, RealD _u0,
|
||||
const ImplParams &p = ImplParams());
|
||||
NaiveStaggeredFermion(GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1, RealD _u0,
|
||||
const ImplParams &p = ImplParams());
|
||||
|
||||
// DoubleStore impl dependent
|
||||
void ImportGauge (const GaugeField &_U );
|
||||
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||
void CopyGaugeCheckerboards(void);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
// protected:
|
||||
public:
|
||||
// any other parameters of action ???
|
||||
virtual int isTrivialEE(void) { return 1; };
|
||||
virtual RealD Mass(void) { return mass; }
|
||||
RealD mass;
|
||||
RealD u0;
|
||||
RealD c1;
|
||||
|
||||
GridBase *_grid;
|
||||
GridBase *_cbgrid;
|
||||
|
||||
// Defines the stencils for even and odd
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
PropagatorField &src,
|
||||
Current curr_type,
|
||||
unsigned int mu);
|
||||
void SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
PropagatorField &srct,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx);
|
||||
};
|
||||
|
||||
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
|
||||
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
@ -49,21 +49,35 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
||||
|
||||
public:
|
||||
|
||||
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
|
||||
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
|
||||
protected:
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic Nc kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
template<int Naik>
|
||||
static accelerator_inline
|
||||
void DhopSiteGeneric(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteGenericInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteGenericExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
@ -71,15 +85,21 @@ public:
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Nc=3 specific kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteHand(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteHandInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteHandExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
@ -87,27 +107,11 @@ public:
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Asm Nc=3 specific kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
void DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic interface; fan out to right routine
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
|
||||
|
||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
|
||||
|
||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
|
||||
|
||||
public:
|
||||
|
||||
|
@ -113,20 +113,7 @@ public:
|
||||
|
||||
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
||||
{
|
||||
GridBase *GaugeGrid = U_ds.Grid();
|
||||
thread_for(lidx, GaugeGrid->lSites(),{
|
||||
|
||||
SiteScalarGaugeLink ScalarU;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
Coordinate lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
peekLocalSite(ScalarUds, U_ds, lcoor);
|
||||
|
||||
peekLocalSite(ScalarU, U, lcoor);
|
||||
ScalarUds(mu) = ScalarU();
|
||||
|
||||
});
|
||||
assert(0);
|
||||
}
|
||||
inline void DoubleStore(GridBase *GaugeGrid,
|
||||
DoubledGaugeField &UUUds, // for Naik term
|
||||
|
@ -257,15 +257,16 @@ private:
|
||||
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
||||
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
||||
|
||||
public:
|
||||
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
||||
// using the DeGrand-Rossi basis for the gamma matrices
|
||||
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F.Grid());
|
||||
T = Zero();
|
||||
auto T_v = T.View();
|
||||
auto F_v = F.View();
|
||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||
autoView(T_v,T,AcceleratorWrite);
|
||||
autoView(F_v,F,AcceleratorRead);
|
||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||
{
|
||||
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
||||
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
||||
@ -281,9 +282,9 @@ private:
|
||||
CloverFieldType T(F.Grid());
|
||||
T = Zero();
|
||||
|
||||
auto T_v = T.View();
|
||||
auto F_v = F.View();
|
||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||
autoView(T_v, T,AcceleratorWrite);
|
||||
autoView(F_v, F,AcceleratorRead);
|
||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||
{
|
||||
T_v[i]()(0, 1) = -F_v[i]()();
|
||||
T_v[i]()(1, 0) = F_v[i]()();
|
||||
@ -299,9 +300,9 @@ private:
|
||||
CloverFieldType T(F.Grid());
|
||||
T = Zero();
|
||||
|
||||
auto T_v = T.View();
|
||||
auto F_v = F.View();
|
||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||
autoView(T_v,T,AcceleratorWrite);
|
||||
autoView(F_v,F,AcceleratorRead);
|
||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||
{
|
||||
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
||||
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
||||
@ -317,9 +318,9 @@ private:
|
||||
CloverFieldType T(F.Grid());
|
||||
T = Zero();
|
||||
|
||||
auto T_v = T.View();
|
||||
auto F_v = F.View();
|
||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||
autoView( T_v , T, AcceleratorWrite);
|
||||
autoView( F_v , F, AcceleratorRead);
|
||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||
{
|
||||
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
||||
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
||||
@ -335,9 +336,9 @@ private:
|
||||
CloverFieldType T(F.Grid());
|
||||
T = Zero();
|
||||
|
||||
auto T_v = T.View();
|
||||
auto F_v = F.View();
|
||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||
autoView( T_v ,T,AcceleratorWrite);
|
||||
autoView( F_v ,F,AcceleratorRead);
|
||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||
{
|
||||
T_v[i]()(0, 1) = -(F_v[i]()());
|
||||
T_v[i]()(1, 0) = (F_v[i]()());
|
||||
@ -354,9 +355,9 @@ private:
|
||||
|
||||
T = Zero();
|
||||
|
||||
auto T_v = T.View();
|
||||
auto F_v = F.View();
|
||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||
autoView( T_v , T,AcceleratorWrite);
|
||||
autoView( F_v , F,AcceleratorRead);
|
||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||
{
|
||||
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
||||
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
||||
|
@ -74,6 +74,20 @@ public:
|
||||
FermionField _tmp;
|
||||
FermionField &tmp(void) { return _tmp; }
|
||||
|
||||
void Report(void);
|
||||
void ZeroCounters(void);
|
||||
double DhopCalls;
|
||||
double DhopCommTime;
|
||||
double DhopComputeTime;
|
||||
double DhopComputeTime2;
|
||||
double DhopFaceTime;
|
||||
double DhopTotalTime;
|
||||
|
||||
double DerivCalls;
|
||||
double DerivCommTime;
|
||||
double DerivComputeTime;
|
||||
double DerivDhopComputeTime;
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// override multiply; cut number routines if pass dagger argument
|
||||
// and also make interface more uniformly consistent
|
||||
@ -196,5 +210,3 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
|
||||
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
|
@ -215,7 +215,7 @@ public:
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
|
||||
};
|
||||
|
@ -106,10 +106,10 @@ public:
|
||||
const _SpinorField & phi,
|
||||
int mu)
|
||||
{
|
||||
auto out_v= out.View();
|
||||
auto phi_v= phi.View();
|
||||
auto Umu_v= Umu.View();
|
||||
thread_for(sss,out.Grid()->oSites(),{
|
||||
autoView( out_v, out, AcceleratorWrite);
|
||||
autoView( phi_v, phi, AcceleratorRead);
|
||||
autoView( Umu_v, Umu, AcceleratorRead);
|
||||
accelerator_for(sss,out.Grid()->oSites(),1,{
|
||||
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
||||
});
|
||||
}
|
||||
@ -191,18 +191,19 @@ public:
|
||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||
GaugeLinkField tmp(mat.Grid());
|
||||
tmp = Zero();
|
||||
auto tmp_v = tmp.View();
|
||||
auto Btilde_v = Btilde.View();
|
||||
auto Atilde_v = Atilde.View();
|
||||
thread_for(sss,tmp.Grid()->oSites(),{
|
||||
{
|
||||
autoView( tmp_v , tmp, AcceleratorWrite);
|
||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||
accelerator_for(sss,tmp.Grid()->oSites(),1,{
|
||||
int sU=sss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
||||
}
|
||||
});
|
||||
}
|
||||
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
||||
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||
#ifdef GRID_NVCC
|
||||
#ifdef GRID_CUDA
|
||||
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
|
||||
PropagatorField tmp(UGrid);
|
||||
PropagatorField Utmp(UGrid);
|
||||
LatticeInteger zz (UGrid); zz=0.0;
|
||||
PropagatorField zz (UGrid); zz=0.0;
|
||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||
for (int s=0;s<Ls;s++) {
|
||||
|
||||
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef GRID_NVCC
|
||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||
////////////////////////////////////////////////
|
||||
// GENERAL CAYLEY CASE
|
||||
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField tmp(UGrid);
|
||||
PropagatorField Utmp(UGrid);
|
||||
|
||||
LatticeInteger zz (UGrid); zz=0.0;
|
||||
PropagatorField zz (UGrid); zz=0.0;
|
||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
|
@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,AcceleratorRead);
|
||||
autoView(phi , phi_i,AcceleratorRead);
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,AcceleratorRead);
|
||||
autoView(phi , phi_i,AcceleratorRead);
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,AcceleratorRead);
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
|
||||
int Ls=this->Ls;
|
||||
|
||||
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
||||
GridBase *grid=psi_i.Grid();
|
||||
int Ls=this->Ls;
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,AcceleratorRead);
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
|
||||
auto plee = & lee [0];
|
||||
auto pdee = & dee [0];
|
||||
|
@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi, psi_i,CpuRead);
|
||||
autoView(phi, phi_i,CpuRead);
|
||||
autoView(chi, chi_i,CpuWrite);
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd= Simd::Nsimd();
|
||||
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi=psi_i.View();
|
||||
auto phi=phi_i.View();
|
||||
auto chi=chi_i.View();
|
||||
autoView(psi,psi_i,CpuRead);
|
||||
autoView(phi,phi_i,CpuRead);
|
||||
autoView(chi,chi_i,CpuWrite);
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd= Simd::Nsimd();
|
||||
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
|
||||
Vector<iSinglet<Simd> > &Matm)
|
||||
{
|
||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,CpuRead);
|
||||
autoView(chi , chi_i,CpuWrite);
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
||||
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
||||
#ifndef AVX512
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,CpuRead);
|
||||
autoView(chi , chi_i,CpuWrite);
|
||||
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
||||
}
|
||||
#else
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i,CpuRead);
|
||||
autoView(chi , chi_i,CpuWrite);
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %zmm0
|
||||
|
@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
int Ls = this->Ls;
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto phi = phi_i.View();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView( phi , phi_i, AcceleratorRead);
|
||||
autoView( psi , psi_i, AcceleratorRead);
|
||||
autoView( chi , chi_i, AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
||||
GridBase* grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView( psi , psi_i, AcceleratorRead);
|
||||
autoView( phi , phi_i, AcceleratorRead);
|
||||
autoView( chi , chi_i, AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi=psi_i.View();
|
||||
auto chi=chi_i.View();
|
||||
autoView( psi, psi_i, AcceleratorRead);
|
||||
autoView( chi, chi_i, AcceleratorWrite);
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto plee = & this->lee[0];
|
||||
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView( psi, psi_i, AcceleratorRead);
|
||||
autoView( chi, chi_i, AcceleratorWrite);
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto plee = & this->lee[0];
|
||||
|
@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
|
||||
|
||||
Compressor compressor;
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
auto Umu_v = Umu.View();
|
||||
auto UUUmu_v = UUUmu.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
autoView( Umu_v , Umu, CpuRead);
|
||||
autoView( UUUmu_v , UUUmu, CpuRead);
|
||||
autoView( in_v , in, CpuRead);
|
||||
autoView( out_v , out, CpuWrite);
|
||||
thread_for( ss,Umu.Grid()->oSites(),{
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sU=ss;
|
||||
@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
}
|
||||
|
||||
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
Compressor compressor;
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
||||
DhopFaceTime-=usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
DhopCommTime -=usecond();
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
double ctime=0;
|
||||
double ptime=0;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
// Remove explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||
DhopComputeTime-=usecond();
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
double start = usecond();
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = U.Grid()->oSites(); // 4d vol
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
int interior=1;
|
||||
int exterior=0;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
|
||||
// do the compute
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
// Interior = 1; Exterior = 0; must implement for staggered
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
// Interior = 1; Exterior = 0;
|
||||
int sU = ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
|
||||
}
|
||||
}
|
||||
ptime = usecond() - start;
|
||||
} else {
|
||||
double start = usecond();
|
||||
st.CommunicateThreaded();
|
||||
ctime = usecond() - start;
|
||||
}
|
||||
}
|
||||
DhopCommTime += ctime;
|
||||
DhopComputeTime+=ptime;
|
||||
|
||||
// First to enter, last to leave timing
|
||||
st.CollateThreads();
|
||||
DhopComputeTime+=usecond();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
DhopComputeTime2-=usecond();
|
||||
st.CommunicateComplete(requests);
|
||||
DhopCommTime +=usecond();
|
||||
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
thread_for( ss,sz,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
|
||||
});
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
thread_for( ss,sz,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
|
||||
});
|
||||
DhopComputeTime2-=usecond();
|
||||
{
|
||||
int interior=0;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime2+=usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||
Compressor compressor;
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
|
||||
|
||||
|
||||
//double t1=usecond();
|
||||
DhopTotalTime -= usecond();
|
||||
DhopCommTime -= usecond();
|
||||
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
thread_for( ss,U.Grid()->oSites(),{
|
||||
int sU=ss;
|
||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
|
||||
});
|
||||
} else {
|
||||
thread_for( ss,U.Grid()->oSites(),{
|
||||
int sU=ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
|
||||
});
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
//double t2=usecond();
|
||||
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
||||
|
||||
}
|
||||
/*CHANGE END*/
|
||||
|
@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
|
||||
////////////////////////
|
||||
// Call the single hop
|
||||
////////////////////////
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto B_v = B.View();
|
||||
auto Btilde_v = Btilde.View();
|
||||
autoView( U_v , U, CpuRead);
|
||||
autoView( UUU_v , UUU, CpuRead);
|
||||
autoView( B_v , B, CpuWrite);
|
||||
autoView( Btilde_v , Btilde, CpuWrite);
|
||||
thread_for(sss,B.Grid()->oSites(),{
|
||||
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||
});
|
||||
@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
||||
|
||||
Compressor compressor;
|
||||
Stencil.HaloExchange(in, compressor);
|
||||
auto Umu_v = Umu.View();
|
||||
auto UUUmu_v = UUUmu.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
autoView( Umu_v , Umu, CpuRead);
|
||||
autoView( UUUmu_v , UUUmu, CpuRead);
|
||||
autoView( in_v , in, CpuRead);
|
||||
autoView( out_v , out, CpuWrite);
|
||||
thread_for( sss, in.Grid()->oSites(),{
|
||||
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||
});
|
||||
@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
}
|
||||
template <class Impl>
|
||||
@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
Compressor compressor;
|
||||
int len = U.Grid()->oSites();
|
||||
|
||||
@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
||||
DhopFaceTime -= usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
DhopFaceTime += usecond();
|
||||
|
||||
DhopCommTime -=usecond();
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMergeSHM(compressor);
|
||||
DhopFaceTime+= usecond();
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
// Removed explicit thread comms
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
DhopComputeTime -= usecond();
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
|
||||
if (tid >= ncomms) {
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = len;
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
|
||||
// do the compute
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
// Interior = 1; Exterior = 0; must implement for staggered
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
// Interior = 1; Exterior = 0;
|
||||
int sU = ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
st.CommunicateThreaded();
|
||||
}
|
||||
int interior=1;
|
||||
int exterior=0;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
|
||||
st.CommunicateComplete(requests);
|
||||
DhopCommTime +=usecond();
|
||||
|
||||
// First to enter, last to leave timing
|
||||
DhopFaceTime -= usecond();
|
||||
st.CommsMerge(compressor);
|
||||
@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
||||
|
||||
DhopComputeTime2 -= usecond();
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
thread_for(ss,sz,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
||||
});
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
thread_for(ss,sz,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
||||
});
|
||||
}
|
||||
int interior=0;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime2 += usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
|
||||
st.HaloExchange(in, compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
DhopComputeTime -= usecond();
|
||||
if (dag == DaggerYes) {
|
||||
thread_for(sss, in.Grid()->oSites(),{
|
||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
||||
});
|
||||
} else {
|
||||
thread_for(sss, in.Grid()->oSites(),{
|
||||
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
||||
});
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
|
@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(phi , phi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(phi , phi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
auto pm = this->pm;
|
||||
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(phi , phi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(phi , phi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
auto plee = & this->lee [0];
|
||||
auto pdee = & this->dee [0];
|
||||
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
auto pm = this->pm;
|
||||
auto plee = & this->lee [0];
|
||||
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
auto plee = & this->lee [0];
|
||||
auto pdee = & this->dee [0];
|
||||
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto pm = this->pm;
|
||||
|
@ -0,0 +1,499 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/////////////////////////////////
|
||||
// Constructor and gauge import
|
||||
/////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
||||
RealD _mass,
|
||||
RealD _c1, RealD _u0,
|
||||
const ImplParams &p)
|
||||
: Kernels(p),
|
||||
_grid(&Fgrid),
|
||||
_cbgrid(&Hgrid),
|
||||
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
|
||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
_tmp(&Hgrid)
|
||||
{
|
||||
int vol4;
|
||||
int LLs=1;
|
||||
c1=_c1;
|
||||
u0=_u0;
|
||||
vol4= _grid->oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol4);
|
||||
vol4= _cbgrid->oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1, RealD _u0,
|
||||
const ImplParams &p)
|
||||
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
|
||||
{
|
||||
ImportGauge(_U);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Momentum space propagator should be
|
||||
// https://arxiv.org/pdf/hep-lat/9712010.pdf
|
||||
//
|
||||
// mom space action.
|
||||
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
|
||||
//
|
||||
// must track through staggered flavour/spin reduction in literature to
|
||||
// turn to free propagator for the one component chi field, a la page 4/5
|
||||
// of above link to implmement fourier based solver.
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
||||
{
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
}
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
|
||||
{
|
||||
GaugeLinkField U(GaugeGrid());
|
||||
DoubledGaugeField _UUU(GaugeGrid());
|
||||
////////////////////////////////////////////////////////
|
||||
// Double Store should take two fields for Naik and one hop separately.
|
||||
// Discard teh Naik as Naive
|
||||
////////////////////////////////////////////////////////
|
||||
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Apply scale factors to get the right fermion Kinetic term
|
||||
// Could pass coeffs into the double store to save work.
|
||||
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
|
||||
////////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu+4);
|
||||
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
|
||||
|
||||
}
|
||||
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Implement the interface
|
||||
/////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerNo);
|
||||
axpy(out, mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerYes);
|
||||
axpy(out, mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(mass);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0 / (mass)) * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in, out);
|
||||
}
|
||||
|
||||
///////////////////////////////////
|
||||
// Internal
|
||||
///////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
GaugeField & mat,
|
||||
const FermionField &A, const FermionField &B, int dag)
|
||||
{
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
Compressor compressor;
|
||||
|
||||
FermionField Btilde(B.Grid());
|
||||
FermionField Atilde(B.Grid());
|
||||
Atilde = A;
|
||||
|
||||
st.HaloExchange(B, compressor);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
////////////////////////
|
||||
// Call the single hop
|
||||
////////////////////////
|
||||
autoView( U_v , U, CpuRead);
|
||||
autoView( B_v , B, CpuWrite);
|
||||
autoView( Btilde_v , Btilde, CpuWrite);
|
||||
thread_for(sss,B.Grid()->oSites(),{
|
||||
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||
});
|
||||
|
||||
assert(0);// need to figure out the force interface with a blasted three link term.
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
|
||||
conformable(U.Grid(), _grid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
mat.Checkerboard() = U.Checkerboard();
|
||||
|
||||
DerivInternal(Stencil, Umu, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
|
||||
conformable(U.Grid(), _cbgrid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
assert(V.Checkerboard() == Even);
|
||||
assert(U.Checkerboard() == Odd);
|
||||
mat.Checkerboard() = Odd;
|
||||
|
||||
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
|
||||
conformable(U.Grid(), _cbgrid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
assert(V.Checkerboard() == Odd);
|
||||
assert(U.Checkerboard() == Even);
|
||||
mat.Checkerboard() = Even;
|
||||
|
||||
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
DhopCalls+=2;
|
||||
conformable(in.Grid(), _grid); // verifies full grid
|
||||
conformable(in.Grid(), out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard() == Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard() == Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
|
||||
{
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||
{
|
||||
assert(0); // Not implemented yet
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
|
||||
{
|
||||
|
||||
Compressor compressor;
|
||||
Stencil.HaloExchange(in, compressor);
|
||||
autoView( Umu_v , Umu, CpuRead);
|
||||
autoView( in_v , in, CpuRead);
|
||||
autoView( out_v , out, CpuWrite);
|
||||
// thread_for( sss, in.Grid()->oSites(),{
|
||||
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||
// });
|
||||
assert(0);
|
||||
};
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
else
|
||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||
}
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
Compressor compressor;
|
||||
int len = U.Grid()->oSites();
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
|
||||
DhopFaceTime -= usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
DhopFaceTime += usecond();
|
||||
|
||||
DhopCommTime -=usecond();
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMergeSHM(compressor);
|
||||
DhopFaceTime+= usecond();
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Removed explicit thread comms
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
DhopComputeTime -= usecond();
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=0;
|
||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
|
||||
st.CommunicateComplete(requests);
|
||||
DhopCommTime +=usecond();
|
||||
|
||||
// First to enter, last to leave timing
|
||||
DhopFaceTime -= usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime -= usecond();
|
||||
|
||||
DhopComputeTime2 -= usecond();
|
||||
{
|
||||
int interior=0;
|
||||
int exterior=1;
|
||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime2 += usecond();
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
|
||||
DhopCommTime -= usecond();
|
||||
Compressor compressor;
|
||||
st.HaloExchange(in, compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=1;
|
||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Reporting
|
||||
////////////////////////////////////////////////////////////////
|
||||
template<class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::Report(void)
|
||||
{
|
||||
Coordinate latt = _grid->GlobalDimensions();
|
||||
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _grid->_Nprocessors;
|
||||
RealD NN = _grid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
|
||||
<< DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
|
||||
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
// Average the compute time
|
||||
_grid->GlobalSum(DhopComputeTime);
|
||||
DhopComputeTime/=NP;
|
||||
|
||||
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||
|
||||
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
template<class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
|
||||
{
|
||||
DhopCalls = 0;
|
||||
DhopTotalTime = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
DhopFaceTime = 0;
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Conserved current - not yet implemented.
|
||||
////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
PropagatorField &src,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
PropagatorField &src,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx)
|
||||
{
|
||||
assert(0);
|
||||
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
assert(0);
|
||||
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
||||
gauge3 =(uint64_t)&UU[sU]( T );
|
||||
|
||||
|
||||
// This is the single precision 5th direction vectorised kernel
|
||||
#include <Grid/simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
// for(int s=0;s<LLs;s++){
|
||||
|
||||
int sF=s+LLs*sU;
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
||||
}
|
||||
|
||||
#include <Grid/simd/Intel512double.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
||||
// This is the single precision 5th direction vectorised kernel
|
||||
|
||||
#include <Grid/simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
|
||||
int sF=s+LLs*sU;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
||||
}
|
||||
|
||||
#include <Grid/simd/Intel512double.h>
|
||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
|
||||
int sF=s+LLs*sU;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
|
@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||
if (Naik) {
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||
|
||||
}
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
|
||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||
if (Naik) {
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||
|
||||
}
|
||||
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
|
||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||
if (Naik) {
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||
|
||||
}
|
||||
// Add sum of all exterior connected stencil legs
|
||||
if ( nmu ) {
|
||||
if ( dag ) {
|
||||
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
|
||||
*/
|
||||
#undef LOAD_CHI
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||
// Int, Ext, Int+Ext cases for comms overlap
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag) {
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||
{
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
int ptype;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
//
|
||||
// int sF=LLs*sU+s;
|
||||
{
|
||||
skew = 0;
|
||||
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||
if ( Naik ) {
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
}
|
||||
if ( dag ) {
|
||||
Uchi = - Uchi;
|
||||
}
|
||||
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
// Only contributions from interior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
||||
int ptype;
|
||||
int skew ;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=LLs*sU+s;
|
||||
{
|
||||
skew = 0;
|
||||
Uchi=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
||||
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||
if ( Naik ) {
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
}
|
||||
if ( dag ) {
|
||||
Uchi = - Uchi;
|
||||
}
|
||||
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
||||
// Only contributions from exterior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
// SiteSpinor chi;
|
||||
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
||||
int nmu=0;
|
||||
int skew ;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
// for(int s=0;s<LLs;s++){
|
||||
// int sF=LLs*sU+s;
|
||||
{
|
||||
skew = 0;
|
||||
Uchi=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
||||
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||
if ( Naik ) {
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
|
||||
}
|
||||
if ( nmu ) {
|
||||
if ( dag ) {
|
||||
out[sF] = out[sF] - Uchi;
|
||||
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Driving / wrapping routine to select right kernel
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
int dag=1;
|
||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
int dag=0;
|
||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,
|
||||
int dag,int interior,int exterior)
|
||||
{
|
||||
switch(Opt) {
|
||||
#ifdef AVX512
|
||||
case OptInlineAsm:
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else {
|
||||
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
||||
assert(0);
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
case OptHandUnroll:
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( interior ) {
|
||||
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( exterior ) {
|
||||
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
}
|
||||
break;
|
||||
case OptGeneric:
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( interior ) {
|
||||
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( exterior ) {
|
||||
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
|
||||
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
|
||||
{
|
||||
// Disp should be either +1,-1,+3,-3
|
||||
// What about "dag" ?
|
||||
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
|
||||
assert(0);
|
||||
}
|
||||
|
||||
#define KERNEL_CALLNB(A,improved) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
||||
});
|
||||
|
||||
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
|
||||
|
||||
#define ASM_CALL(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
thread_for( ss, NN, { \
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
||||
});
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||
{
|
||||
GridBase *FGrid=in.Grid();
|
||||
GridBase *UGrid=U.Grid();
|
||||
typedef StaggeredKernels<Impl> ThisKernel;
|
||||
autoView( UUU_v , UUU, AcceleratorRead);
|
||||
autoView( U_v , U, AcceleratorRead);
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
autoView( st_v , st, AcceleratorRead);
|
||||
SiteSpinor * buf = st.CommBuf();
|
||||
|
||||
int Ls=1;
|
||||
if(FGrid->Nd()==UGrid->Nd()+1){
|
||||
Ls = FGrid->_rdimensions[0];
|
||||
}
|
||||
int Nsite = UGrid->oSites();
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
|
||||
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
|
||||
#endif
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||
{
|
||||
GridBase *FGrid=in.Grid();
|
||||
GridBase *UGrid=U.Grid();
|
||||
typedef StaggeredKernels<Impl> ThisKernel;
|
||||
autoView( UUU_v , U, AcceleratorRead);
|
||||
autoView( U_v , U, AcceleratorRead);
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
autoView( st_v , st, AcceleratorRead);
|
||||
SiteSpinor * buf = st.CommBuf();
|
||||
|
||||
int Ls=1;
|
||||
if(FGrid->Nd()==UGrid->Nd()+1){
|
||||
Ls = FGrid->_rdimensions[0];
|
||||
}
|
||||
int Nsite = UGrid->oSites();
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#undef KERNEL_CALLNB
|
||||
#undef KERNEL_CALL
|
||||
#undef ASM_CALL
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
|
@ -98,11 +98,13 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
Coordinate lcoor;
|
||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||
|
||||
for (int site = 0; site < lvol; site++)
|
||||
{
|
||||
autoView(CTv,CloverTerm,CpuRead);
|
||||
autoView(CTIv,CloverTermInv,CpuWrite);
|
||||
for (int site = 0; site < lvol; site++) {
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
peekLocalSite(Qx, CloverTerm, lcoor);
|
||||
peekLocalSite(Qx, CTv, lcoor);
|
||||
Qxinv = Zero();
|
||||
//if (csw!=0){
|
||||
for (int j = 0; j < Ns; j++)
|
||||
@ -123,7 +125,8 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||
// }
|
||||
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
|
||||
pokeLocalSite(Qxinv, CTIv, lcoor);
|
||||
}
|
||||
}
|
||||
|
||||
// Separate the even and odd parts
|
||||
|
@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
|
||||
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
||||
|
||||
// FIXME Need a Lattice acosh
|
||||
|
||||
{
|
||||
autoView(cosha_v,cosha,CpuRead);
|
||||
autoView(a_v,a,CpuWrite);
|
||||
for(int idx=0;idx<_grid->lSites();idx++){
|
||||
Coordinate lcoor(Nd);
|
||||
Tcomplex cc;
|
||||
// RealD sgn;
|
||||
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
||||
peekLocalSite(cc,cosha,lcoor);
|
||||
peekLocalSite(cc,cosha_v,lcoor);
|
||||
assert((double)real(cc)>=1.0);
|
||||
assert(fabs((double)imag(cc))<=1.0e-15);
|
||||
cc = ScalComplex(::acosh(real(cc)),0.0);
|
||||
pokeLocalSite(cc,a,lcoor);
|
||||
pokeLocalSite(cc,a_v,lcoor);
|
||||
}
|
||||
}
|
||||
|
||||
Wea = ( exp( a) * abs(W) );
|
||||
@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
|
||||
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
||||
|
||||
// FIXME Need a Lattice acosh
|
||||
{
|
||||
autoView(cosha_v,cosha,CpuRead);
|
||||
autoView(a_v,a,CpuWrite);
|
||||
for(int idx=0;idx<_grid->lSites();idx++){
|
||||
Coordinate lcoor(Nd);
|
||||
Tcomplex cc;
|
||||
// RealD sgn;
|
||||
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
||||
peekLocalSite(cc,cosha,lcoor);
|
||||
peekLocalSite(cc,cosha_v,lcoor);
|
||||
assert((double)real(cc)>=1.0);
|
||||
assert(fabs((double)imag(cc))<=1.0e-15);
|
||||
cc = ScalComplex(::acosh(real(cc)),0.0);
|
||||
pokeLocalSite(cc,a,lcoor);
|
||||
}
|
||||
pokeLocalSite(cc,a_v,lcoor);
|
||||
}}
|
||||
|
||||
Wea = ( exp( a) * abs(W) );
|
||||
Wema= ( exp(-a) * abs(W) );
|
||||
|
@ -67,9 +67,99 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
diag_mass = 4.0 + mass;
|
||||
}
|
||||
|
||||
int vol4;
|
||||
vol4=Fgrid.oSites();
|
||||
Stencil.BuildSurfaceList(1,vol4);
|
||||
vol4=Hgrid.oSites();
|
||||
StencilEven.BuildSurfaceList(1,vol4);
|
||||
StencilOdd.BuildSurfaceList(1,vol4);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::Report(void)
|
||||
{
|
||||
RealD NP = _grid->_Nprocessors;
|
||||
RealD NN = _grid->NodeCount();
|
||||
RealD volume = 1;
|
||||
Coordinate latt = _grid->GlobalDimensions();
|
||||
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
|
||||
if ( DhopCalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
|
||||
|
||||
// Average the compute time
|
||||
_grid->GlobalSum(DhopComputeTime);
|
||||
DhopComputeTime/=NP;
|
||||
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||
|
||||
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||
|
||||
}
|
||||
|
||||
if ( DerivCalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
|
||||
|
||||
// how to count flops here?
|
||||
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
|
||||
|
||||
// how to count flops here?
|
||||
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
|
||||
|
||||
if (DerivCalls > 0 || DhopCalls > 0){
|
||||
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
if ( DhopCalls > 0){
|
||||
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
|
||||
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
|
||||
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::ZeroCounters(void) {
|
||||
DhopCalls = 0; // ok
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
DhopComputeTime2= 0;
|
||||
DhopFaceTime = 0;
|
||||
DhopTotalTime = 0;
|
||||
|
||||
DerivCalls = 0; // ok
|
||||
DerivCommTime = 0;
|
||||
DerivComputeTime = 0;
|
||||
DerivDhopComputeTime = 0;
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
Stencil.ZeroCountersi();
|
||||
StencilEven.ZeroCountersi();
|
||||
StencilOdd.ZeroCountersi();
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
@ -229,6 +319,7 @@ template <class Impl>
|
||||
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
GaugeField &mat, const FermionField &A,
|
||||
const FermionField &B, int dag) {
|
||||
DerivCalls++;
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
Compressor compressor(dag);
|
||||
@ -237,8 +328,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
FermionField Atilde(B.Grid());
|
||||
Atilde = A;
|
||||
|
||||
DerivCommTime-=usecond();
|
||||
st.HaloExchange(B, compressor);
|
||||
DerivCommTime+=usecond();
|
||||
|
||||
DerivComputeTime-=usecond();
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Flip gamma (1+g)<->(1-g) if dag
|
||||
@ -246,6 +340,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
int gamma = mu;
|
||||
if (!dag) gamma += Nd;
|
||||
|
||||
DerivDhopComputeTime -= usecond();
|
||||
int Ls=1;
|
||||
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
|
||||
|
||||
@ -253,7 +348,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
// spin trace outer product
|
||||
//////////////////////////////////////////////////
|
||||
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
||||
DerivDhopComputeTime += usecond();
|
||||
}
|
||||
DerivComputeTime += usecond();
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -387,13 +484,14 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
DhopTotalTime-=usecond();
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||
|
||||
DhopTotalTime+=usecond();
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -412,38 +510,53 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
|
||||
/////////////////////////////
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
st.Prepare();
|
||||
DhopFaceTime-=usecond();
|
||||
st.HaloGather(in,compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
DhopCommTime -=usecond();
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
/////////////////////////////
|
||||
// Overlap with comms
|
||||
/////////////////////////////
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMergeSHM(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute interior
|
||||
/////////////////////////////
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
DhopComputeTime-=usecond();
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||
} else {
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||
}
|
||||
DhopComputeTime+=usecond();
|
||||
|
||||
/////////////////////////////
|
||||
// Complete comms
|
||||
/////////////////////////////
|
||||
st.CommunicateComplete(requests);
|
||||
DhopCommTime +=usecond();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute exterior
|
||||
/////////////////////////////
|
||||
|
||||
DhopComputeTime2-=usecond();
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||
} else {
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||
}
|
||||
DhopComputeTime2+=usecond();
|
||||
};
|
||||
|
||||
|
||||
@ -455,14 +568,18 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||
{
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
Compressor compressor(dag);
|
||||
DhopCommTime-=usecond();
|
||||
st.HaloExchange(in, compressor);
|
||||
DhopCommTime+=usecond();
|
||||
|
||||
DhopComputeTime-=usecond();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
||||
} else {
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
||||
}
|
||||
DhopComputeTime+=usecond();
|
||||
};
|
||||
/*Change ends */
|
||||
|
||||
@ -483,32 +600,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
conformable(_grid, q_in_1.Grid());
|
||||
conformable(_grid, q_in_2.Grid());
|
||||
conformable(_grid, q_out.Grid());
|
||||
#if 0
|
||||
PropagatorField tmp1(_grid), tmp2(_grid);
|
||||
q_out = Zero();
|
||||
|
||||
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
|
||||
// Inefficient comms method but not performance critical.
|
||||
tmp1 = Cshift(q_in_1, mu, 1);
|
||||
tmp2 = Cshift(q_in_2, mu, 1);
|
||||
auto tmp1_v = tmp1.View();
|
||||
auto tmp2_v = tmp2.View();
|
||||
auto q_in_1_v=q_in_1.View();
|
||||
auto q_in_2_v=q_in_2.View();
|
||||
auto q_out_v = q_out.View();
|
||||
auto Umu_v = Umu.View();
|
||||
thread_for(sU, Umu.Grid()->oSites(),{
|
||||
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
||||
q_in_2_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu);
|
||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
|
||||
tmp2_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu);
|
||||
});
|
||||
#else
|
||||
#endif
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
@ -524,62 +616,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
{
|
||||
conformable(_grid, q_in.Grid());
|
||||
conformable(_grid, q_out.Grid());
|
||||
#if 0
|
||||
|
||||
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
|
||||
Complex i(0.0,1.0);
|
||||
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
|
||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
||||
|
||||
q_out = Zero();
|
||||
LatticeInteger coords(_grid);
|
||||
LatticeCoordinate(coords, Tp);
|
||||
|
||||
// Need q(x + mu) and q(x - mu).
|
||||
tmp = Cshift(q_in, mu, 1);
|
||||
tmpFwd = tmp*lattice_cmplx;
|
||||
tmp = lattice_cmplx*q_in;
|
||||
tmpBwd = Cshift(tmp, mu, -1);
|
||||
|
||||
auto coords_v = coords.View();
|
||||
auto tmpFwd_v = tmpFwd.View();
|
||||
auto tmpBwd_v = tmpBwd.View();
|
||||
auto Umu_v = Umu.View();
|
||||
auto q_out_v = q_out.View();
|
||||
|
||||
thread_for(sU, Umu.Grid()->oSites(), {
|
||||
|
||||
// Compute the sequential conserved current insertion only if our simd
|
||||
// object contains a timeslice we need.
|
||||
vPredicate t_mask;
|
||||
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
|
||||
Integer timeSlices = Reduce(t_mask());
|
||||
|
||||
if (timeSlices > 0) {
|
||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu, t_mask);
|
||||
}
|
||||
|
||||
// Repeat for backward direction.
|
||||
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
|
||||
(coords_v[sU] <= (tmax + tshift)));
|
||||
|
||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||
unsigned int t0 = 0;
|
||||
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
|
||||
|
||||
timeSlices = Reduce(t_mask());
|
||||
|
||||
if (timeSlices > 0) {
|
||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu, t_mask);
|
||||
}
|
||||
});
|
||||
#else
|
||||
#endif
|
||||
assert(0);
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
574
Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
Normal file
574
Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
Normal file
@ -0,0 +1,574 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
|
||||
|
||||
Copyright (C) 2020
|
||||
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
//#if defined(A64FXASM)
|
||||
#if defined(A64FX)
|
||||
|
||||
// safety include
|
||||
#include <arm_sve.h>
|
||||
|
||||
// undefine everything related to kernels
|
||||
#include <simd/Fujitsu_A64FX_undef.h>
|
||||
|
||||
// enable A64FX body
|
||||
#define WILSONKERNELSASMBODYA64FX
|
||||
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are A64FX specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
#if defined(DSLASHINTRIN)
|
||||
//#pragma message ("A64FX Dslash: intrin")
|
||||
#include <simd/Fujitsu_A64FX_intrin_single.h>
|
||||
#else
|
||||
#pragma message ("A64FX Dslash: asm")
|
||||
#include <simd/Fujitsu_A64FX_asm_single.h>
|
||||
#endif
|
||||
|
||||
/// Switch off the 5d vectorised code optimisations
|
||||
#undef DWFVEC5D
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
// undefine
|
||||
#include <simd/Fujitsu_A64FX_undef.h>
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are A64FX specialise the double precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(DSLASHINTRIN)
|
||||
#include <simd/Fujitsu_A64FX_intrin_double.h>
|
||||
#else
|
||||
#include <simd/Fujitsu_A64FX_asm_double.h>
|
||||
#endif
|
||||
|
||||
// former KNL
|
||||
//#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
|
||||
|
||||
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||
#else
|
||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// undefs
|
||||
#undef WILSONKERNELSASMBODYA64FX
|
||||
#include <simd/Fujitsu_A64FX_undef.h>
|
||||
|
||||
#endif //A64FXASM
|
@ -0,0 +1,380 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: WilsonKernelsAsmBodyA64FX.h
|
||||
|
||||
Copyright (C) 2020
|
||||
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifdef KERNEL_DAG
|
||||
#define DIR0_PROJ XP_PROJ
|
||||
#define DIR1_PROJ YP_PROJ
|
||||
#define DIR2_PROJ ZP_PROJ
|
||||
#define DIR3_PROJ TP_PROJ
|
||||
#define DIR4_PROJ XM_PROJ
|
||||
#define DIR5_PROJ YM_PROJ
|
||||
#define DIR6_PROJ ZM_PROJ
|
||||
#define DIR7_PROJ TM_PROJ
|
||||
#define DIR0_RECON XP_RECON
|
||||
#define DIR1_RECON YP_RECON_ACCUM
|
||||
#define DIR2_RECON ZP_RECON_ACCUM
|
||||
#define DIR3_RECON TP_RECON_ACCUM
|
||||
#define DIR4_RECON XM_RECON_ACCUM
|
||||
#define DIR5_RECON YM_RECON_ACCUM
|
||||
#define DIR6_RECON ZM_RECON_ACCUM
|
||||
#define DIR7_RECON TM_RECON_ACCUM
|
||||
#else
|
||||
#define DIR0_PROJ XM_PROJ
|
||||
#define DIR1_PROJ YM_PROJ
|
||||
#define DIR2_PROJ ZM_PROJ
|
||||
#define DIR3_PROJ TM_PROJ
|
||||
#define DIR4_PROJ XP_PROJ
|
||||
#define DIR5_PROJ YP_PROJ
|
||||
#define DIR6_PROJ ZP_PROJ
|
||||
#define DIR7_PROJ TP_PROJ
|
||||
#define DIR0_RECON XM_RECON
|
||||
#define DIR1_RECON YM_RECON_ACCUM
|
||||
#define DIR2_RECON ZM_RECON_ACCUM
|
||||
#define DIR3_RECON TM_RECON_ACCUM
|
||||
#define DIR4_RECON XP_RECON_ACCUM
|
||||
#define DIR5_RECON YP_RECON_ACCUM
|
||||
#define DIR6_RECON ZP_RECON_ACCUM
|
||||
#define DIR7_RECON TP_RECON_ACCUM
|
||||
#endif
|
||||
|
||||
//using namespace std;
|
||||
|
||||
#undef SHOW
|
||||
//#define SHOW
|
||||
|
||||
#undef WHERE
|
||||
|
||||
#ifdef INTERIOR_AND_EXTERIOR
|
||||
#define WHERE "INT_AND_EXT"
|
||||
#endif
|
||||
|
||||
#ifdef INTERIOR
|
||||
#define WHERE "INT"
|
||||
#endif
|
||||
|
||||
#ifdef EXTERIOR
|
||||
#define WHERE "EXT"
|
||||
#endif
|
||||
|
||||
//#pragma message("here")
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Comms then compute kernel
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef INTERIOR_AND_EXTERIOR
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU(base); \
|
||||
LOAD_TABLE(PERMUTE_DIR); \
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
} else { \
|
||||
LOAD_CHI(base); \
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
PREFETCH_CHIMU_L2(basep); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Pre comms kernel -- prefetch like normal because it is mostly right
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef INTERIOR
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU(base); \
|
||||
LOAD_TABLE(PERMUTE_DIR); \
|
||||
PROJ; \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
PREFETCH_CHIMU_L2(basep); \
|
||||
} else { PREFETCH_CHIMU(base); } \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Post comms kernel
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef EXTERIOR
|
||||
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
nmu=0; \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_1(Dir); \
|
||||
PREFETCH_CHIMU(base); \
|
||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||
MULT_2SPIN_2; \
|
||||
if (s == 0) { \
|
||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||
} \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||
|
||||
#endif
|
||||
{
|
||||
int nmu;
|
||||
int local,perm, ptype;
|
||||
uint64_t base;
|
||||
uint64_t basep;
|
||||
const uint64_t plocal =(uint64_t) & in[0];
|
||||
|
||||
MASK_REGS;
|
||||
int nmax=U.oSites();
|
||||
for(int site=0;site<Ns;site++) {
|
||||
#ifndef EXTERIOR
|
||||
// int sU =lo.Reorder(ssU);
|
||||
int sU =ssU;
|
||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||
// int sUn=lo.Reorder(ssn);
|
||||
int sUn=ssn;
|
||||
LOCK_GAUGE(0);
|
||||
#else
|
||||
int sU =ssU;
|
||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||
int sUn=ssn;
|
||||
#endif
|
||||
for(int s=0;s<Ls;s++) {
|
||||
ss =sU*Ls+s;
|
||||
ssn=sUn*Ls+s;
|
||||
int ent=ss*8;// 2*Ndim
|
||||
int nent=ssn*8;
|
||||
|
||||
uint64_t delta_base, delta_base_p;
|
||||
|
||||
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
float rescale = 64. * 12.;
|
||||
std::cout << "=================================================================" << std::endl;
|
||||
std::cout << "ss = " << ss << " ssn = " << ssn << std::endl;
|
||||
std::cout << "sU = " << sU << " ssU = " << ssU << std::endl;
|
||||
std::cout << " " << std::endl;
|
||||
|
||||
|
||||
std::cout << "Dir = " << Xp << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
|
||||
|
||||
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||
std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
|
||||
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
#ifdef EXTERIOR
|
||||
if (nmu==0) break;
|
||||
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
|
||||
#endif
|
||||
base = (uint64_t) &out[ss];
|
||||
basep= st.GetPFInfo(nent,plocal); ent++;
|
||||
basep = (uint64_t) &out[ssn];
|
||||
RESULT(base,basep);
|
||||
|
||||
#ifdef SHOW
|
||||
std::cout << "Dir = FINAL " << WHERE<< std::endl;;
|
||||
|
||||
base_ss = base;
|
||||
std::cout << "base = " << (base - (uint64_t) &out[0])/rescale << std::endl;
|
||||
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
#endif
|
||||
|
||||
}
|
||||
ssU++;
|
||||
UNLOCK_GAUGE(0);
|
||||
}
|
||||
}
|
||||
|
||||
#undef DIR0_PROJ
|
||||
#undef DIR1_PROJ
|
||||
#undef DIR2_PROJ
|
||||
#undef DIR3_PROJ
|
||||
#undef DIR4_PROJ
|
||||
#undef DIR5_PROJ
|
||||
#undef DIR6_PROJ
|
||||
#undef DIR7_PROJ
|
||||
#undef DIR0_RECON
|
||||
#undef DIR1_RECON
|
||||
#undef DIR2_RECON
|
||||
#undef DIR3_RECON
|
||||
#undef DIR4_RECON
|
||||
#undef DIR5_RECON
|
||||
#undef DIR6_RECON
|
||||
#undef DIR7_RECON
|
||||
#undef ASM_LEG
|
||||
#undef ASM_LEG_XP
|
||||
#undef RESULT
|
@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_RESULT_EXT(ss,F)
|
||||
|
||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
nmu = 0; \
|
||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
|
@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl> void
|
||||
template<class Impl> accelerator_inline void
|
||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
template<class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
template<class Impl> accelerator_inline void
|
||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
template<class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
template<class Impl> accelerator_inline void
|
||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
|
||||
HAND_RESULT_EXT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
template<class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
|
@ -0,0 +1,943 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
|
||||
#undef LOAD_CHIMU
|
||||
#undef LOAD_CHI
|
||||
#undef MULT_2SPIN
|
||||
#undef PERMUTE_DIR
|
||||
#undef XP_PROJ
|
||||
#undef YP_PROJ
|
||||
#undef ZP_PROJ
|
||||
#undef TP_PROJ
|
||||
#undef XM_PROJ
|
||||
#undef YM_PROJ
|
||||
#undef ZM_PROJ
|
||||
#undef TM_PROJ
|
||||
#undef XP_RECON
|
||||
#undef XP_RECON_ACCUM
|
||||
#undef XM_RECON
|
||||
#undef XM_RECON_ACCUM
|
||||
#undef YP_RECON_ACCUM
|
||||
#undef YM_RECON_ACCUM
|
||||
#undef ZP_RECON_ACCUM
|
||||
#undef ZM_RECON_ACCUM
|
||||
#undef TP_RECON_ACCUM
|
||||
#undef TM_RECON_ACCUM
|
||||
#undef ZERO_RESULT
|
||||
#undef Chimu_00
|
||||
#undef Chimu_01
|
||||
#undef Chimu_02
|
||||
#undef Chimu_10
|
||||
#undef Chimu_11
|
||||
#undef Chimu_12
|
||||
#undef Chimu_20
|
||||
#undef Chimu_21
|
||||
#undef Chimu_22
|
||||
#undef Chimu_30
|
||||
#undef Chimu_31
|
||||
#undef Chimu_32
|
||||
#undef HAND_STENCIL_LEG
|
||||
#undef HAND_STENCIL_LEG_INT
|
||||
#undef HAND_STENCIL_LEG_EXT
|
||||
#undef HAND_RESULT
|
||||
#undef HAND_RESULT_INT
|
||||
#undef HAND_RESULT_EXT
|
||||
|
||||
#define REGISTER
|
||||
|
||||
#define LOAD_CHIMU \
|
||||
{const SiteSpinor & ref (in[offset]); \
|
||||
Chimu_00=ref()(0)(0);\
|
||||
Chimu_01=ref()(0)(1);\
|
||||
Chimu_02=ref()(0)(2);\
|
||||
Chimu_10=ref()(1)(0);\
|
||||
Chimu_11=ref()(1)(1);\
|
||||
Chimu_12=ref()(1)(2);\
|
||||
Chimu_20=ref()(2)(0);\
|
||||
Chimu_21=ref()(2)(1);\
|
||||
Chimu_22=ref()(2)(2);\
|
||||
Chimu_30=ref()(3)(0);\
|
||||
Chimu_31=ref()(3)(1);\
|
||||
Chimu_32=ref()(3)(2);\
|
||||
std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \
|
||||
std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \
|
||||
std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \
|
||||
std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \
|
||||
std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \
|
||||
std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \
|
||||
std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \
|
||||
std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \
|
||||
std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \
|
||||
std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \
|
||||
std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \
|
||||
std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \
|
||||
std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \
|
||||
}
|
||||
|
||||
#define LOAD_CHI\
|
||||
{const SiteHalfSpinor &ref(buf[offset]); \
|
||||
Chi_00 = ref()(0)(0);\
|
||||
Chi_01 = ref()(0)(1);\
|
||||
Chi_02 = ref()(0)(2);\
|
||||
Chi_10 = ref()(1)(0);\
|
||||
Chi_11 = ref()(1)(1);\
|
||||
Chi_12 = ref()(1)(2);\
|
||||
std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
|
||||
}
|
||||
|
||||
// To splat or not to splat depends on the implementation
|
||||
#define MULT_2SPIN(A)\
|
||||
{auto & ref(U[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
UChi_00 = U_00*Chi_00;\
|
||||
UChi_10 = U_00*Chi_10;\
|
||||
UChi_01 = U_10*Chi_00;\
|
||||
UChi_11 = U_10*Chi_10;\
|
||||
UChi_02 = U_20*Chi_00;\
|
||||
UChi_12 = U_20*Chi_10;\
|
||||
UChi_00+= U_01*Chi_01;\
|
||||
UChi_10+= U_01*Chi_11;\
|
||||
UChi_01+= U_11*Chi_01;\
|
||||
UChi_11+= U_11*Chi_11;\
|
||||
UChi_02+= U_21*Chi_01;\
|
||||
UChi_12+= U_21*Chi_11;\
|
||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||
UChi_00+= U_00*Chi_02;\
|
||||
UChi_10+= U_00*Chi_12;\
|
||||
UChi_01+= U_10*Chi_02;\
|
||||
UChi_11+= U_10*Chi_12;\
|
||||
UChi_02+= U_20*Chi_02;\
|
||||
UChi_12+= U_20*Chi_12;\
|
||||
std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \
|
||||
std::cout << "UChi_00 -- " << UChi_00 << std::endl; \
|
||||
std::cout << "UChi_01 -- " << UChi_01 << std::endl; \
|
||||
std::cout << "UChi_02 -- " << UChi_02 << std::endl; \
|
||||
std::cout << "UChi_10 -- " << UChi_10 << std::endl; \
|
||||
std::cout << "UChi_11 -- " << UChi_11 << std::endl; \
|
||||
std::cout << "UChi_12 -- " << UChi_12 << std::endl; \
|
||||
}
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
|
||||
permute##dir(Chi_00,Chi_00);\
|
||||
permute##dir(Chi_01,Chi_01);\
|
||||
permute##dir(Chi_02,Chi_02);\
|
||||
permute##dir(Chi_10,Chi_10);\
|
||||
permute##dir(Chi_11,Chi_11);\
|
||||
permute##dir(Chi_12,Chi_12);\
|
||||
std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
#define XP_PROJ \
|
||||
Chi_00 = Chimu_00+timesI(Chimu_30);\
|
||||
Chi_01 = Chimu_01+timesI(Chimu_31);\
|
||||
Chi_02 = Chimu_02+timesI(Chimu_32);\
|
||||
Chi_10 = Chimu_10+timesI(Chimu_20);\
|
||||
Chi_11 = Chimu_11+timesI(Chimu_21);\
|
||||
Chi_12 = Chimu_12+timesI(Chimu_22);\
|
||||
std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
#define YP_PROJ \
|
||||
Chi_00 = Chimu_00-Chimu_30;\
|
||||
Chi_01 = Chimu_01-Chimu_31;\
|
||||
Chi_02 = Chimu_02-Chimu_32;\
|
||||
Chi_10 = Chimu_10+Chimu_20;\
|
||||
Chi_11 = Chimu_11+Chimu_21;\
|
||||
Chi_12 = Chimu_12+Chimu_22;\
|
||||
std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
#define ZP_PROJ \
|
||||
Chi_00 = Chimu_00+timesI(Chimu_20); \
|
||||
Chi_01 = Chimu_01+timesI(Chimu_21); \
|
||||
Chi_02 = Chimu_02+timesI(Chimu_22); \
|
||||
Chi_10 = Chimu_10-timesI(Chimu_30); \
|
||||
Chi_11 = Chimu_11-timesI(Chimu_31); \
|
||||
Chi_12 = Chimu_12-timesI(Chimu_32);\
|
||||
std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
#define TP_PROJ \
|
||||
Chi_00 = Chimu_00+Chimu_20; \
|
||||
Chi_01 = Chimu_01+Chimu_21; \
|
||||
Chi_02 = Chimu_02+Chimu_22; \
|
||||
Chi_10 = Chimu_10+Chimu_30; \
|
||||
Chi_11 = Chimu_11+Chimu_31; \
|
||||
Chi_12 = Chimu_12+Chimu_32;\
|
||||
std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
|
||||
// hspin(0)=fspin(0)-timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)-timesI(fspin(2));
|
||||
#define XM_PROJ \
|
||||
Chi_00 = Chimu_00-timesI(Chimu_30);\
|
||||
Chi_01 = Chimu_01-timesI(Chimu_31);\
|
||||
Chi_02 = Chimu_02-timesI(Chimu_32);\
|
||||
Chi_10 = Chimu_10-timesI(Chimu_20);\
|
||||
Chi_11 = Chimu_11-timesI(Chimu_21);\
|
||||
Chi_12 = Chimu_12-timesI(Chimu_22);\
|
||||
std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
#define YM_PROJ \
|
||||
Chi_00 = Chimu_00+Chimu_30;\
|
||||
Chi_01 = Chimu_01+Chimu_31;\
|
||||
Chi_02 = Chimu_02+Chimu_32;\
|
||||
Chi_10 = Chimu_10-Chimu_20;\
|
||||
Chi_11 = Chimu_11-Chimu_21;\
|
||||
Chi_12 = Chimu_12-Chimu_22;\
|
||||
std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
#define ZM_PROJ \
|
||||
Chi_00 = Chimu_00-timesI(Chimu_20); \
|
||||
Chi_01 = Chimu_01-timesI(Chimu_21); \
|
||||
Chi_02 = Chimu_02-timesI(Chimu_22); \
|
||||
Chi_10 = Chimu_10+timesI(Chimu_30); \
|
||||
Chi_11 = Chimu_11+timesI(Chimu_31); \
|
||||
Chi_12 = Chimu_12+timesI(Chimu_32);\
|
||||
std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
#define TM_PROJ \
|
||||
Chi_00 = Chimu_00-Chimu_20; \
|
||||
Chi_01 = Chimu_01-Chimu_21; \
|
||||
Chi_02 = Chimu_02-Chimu_22; \
|
||||
Chi_10 = Chimu_10-Chimu_30; \
|
||||
Chi_11 = Chimu_11-Chimu_31; \
|
||||
Chi_12 = Chimu_12-Chimu_32;\
|
||||
std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \
|
||||
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||
|
||||
// fspin(0)=hspin(0);
|
||||
// fspin(1)=hspin(1);
|
||||
// fspin(2)=timesMinusI(hspin(1));
|
||||
// fspin(3)=timesMinusI(hspin(0));
|
||||
#define XP_RECON\
|
||||
result_00 = UChi_00;\
|
||||
result_01 = UChi_01;\
|
||||
result_02 = UChi_02;\
|
||||
result_10 = UChi_10;\
|
||||
result_11 = UChi_11;\
|
||||
result_12 = UChi_12;\
|
||||
result_20 = timesMinusI(UChi_10);\
|
||||
result_21 = timesMinusI(UChi_11);\
|
||||
result_22 = timesMinusI(UChi_12);\
|
||||
result_30 = timesMinusI(UChi_00);\
|
||||
result_31 = timesMinusI(UChi_01);\
|
||||
result_32 = timesMinusI(UChi_02);\
|
||||
std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define XP_RECON_ACCUM\
|
||||
result_00+=UChi_00;\
|
||||
result_01+=UChi_01;\
|
||||
result_02+=UChi_02;\
|
||||
result_10+=UChi_10;\
|
||||
result_11+=UChi_11;\
|
||||
result_12+=UChi_12;\
|
||||
result_20-=timesI(UChi_10);\
|
||||
result_21-=timesI(UChi_11);\
|
||||
result_22-=timesI(UChi_12);\
|
||||
result_30-=timesI(UChi_00);\
|
||||
result_31-=timesI(UChi_01);\
|
||||
result_32-=timesI(UChi_02);\
|
||||
std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define XM_RECON\
|
||||
result_00 = UChi_00;\
|
||||
result_01 = UChi_01;\
|
||||
result_02 = UChi_02;\
|
||||
result_10 = UChi_10;\
|
||||
result_11 = UChi_11;\
|
||||
result_12 = UChi_12;\
|
||||
result_20 = timesI(UChi_10);\
|
||||
result_21 = timesI(UChi_11);\
|
||||
result_22 = timesI(UChi_12);\
|
||||
result_30 = timesI(UChi_00);\
|
||||
result_31 = timesI(UChi_01);\
|
||||
result_32 = timesI(UChi_02);\
|
||||
std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define XM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= timesI(UChi_10);\
|
||||
result_21+= timesI(UChi_11);\
|
||||
result_22+= timesI(UChi_12);\
|
||||
result_30+= timesI(UChi_00);\
|
||||
result_31+= timesI(UChi_01);\
|
||||
result_32+= timesI(UChi_02);\
|
||||
std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define YP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= UChi_10;\
|
||||
result_21+= UChi_11;\
|
||||
result_22+= UChi_12;\
|
||||
result_30-= UChi_00;\
|
||||
result_31-= UChi_01;\
|
||||
result_32-= UChi_02;\
|
||||
std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define YM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= UChi_10;\
|
||||
result_21-= UChi_11;\
|
||||
result_22-= UChi_12;\
|
||||
result_30+= UChi_00;\
|
||||
result_31+= UChi_01;\
|
||||
result_32+= UChi_02;\
|
||||
std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define ZP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= timesI(UChi_00); \
|
||||
result_21-= timesI(UChi_01); \
|
||||
result_22-= timesI(UChi_02); \
|
||||
result_30+= timesI(UChi_10); \
|
||||
result_31+= timesI(UChi_11); \
|
||||
result_32+= timesI(UChi_12);\
|
||||
std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define ZM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= timesI(UChi_00); \
|
||||
result_21+= timesI(UChi_01); \
|
||||
result_22+= timesI(UChi_02); \
|
||||
result_30-= timesI(UChi_10); \
|
||||
result_31-= timesI(UChi_11); \
|
||||
result_32-= timesI(UChi_12);\
|
||||
std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define TP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= UChi_00; \
|
||||
result_21+= UChi_01; \
|
||||
result_22+= UChi_02; \
|
||||
result_30+= UChi_10; \
|
||||
result_31+= UChi_11; \
|
||||
result_32+= UChi_12;\
|
||||
std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define TM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= UChi_00; \
|
||||
result_21-= UChi_01; \
|
||||
result_22-= UChi_02; \
|
||||
result_30-= UChi_10; \
|
||||
result_31-= UChi_11; \
|
||||
result_32-= UChi_12;\
|
||||
std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||
|
||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU; \
|
||||
PROJ; \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(PERM); \
|
||||
} \
|
||||
} else { \
|
||||
LOAD_CHI; \
|
||||
} \
|
||||
MULT_2SPIN(DIR); \
|
||||
RECON;
|
||||
|
||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU; \
|
||||
PROJ; \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(PERM); \
|
||||
} \
|
||||
} else if ( st.same_node[DIR] ) { \
|
||||
LOAD_CHI; \
|
||||
} \
|
||||
if (local || st.same_node[DIR] ) { \
|
||||
MULT_2SPIN(DIR); \
|
||||
RECON; \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||
LOAD_CHI; \
|
||||
MULT_2SPIN(DIR); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define HAND_RESULT(ss) \
|
||||
{ \
|
||||
SiteSpinor & ref (out[ss]); \
|
||||
vstream(ref()(0)(0),result_00); \
|
||||
vstream(ref()(0)(1),result_01); \
|
||||
vstream(ref()(0)(2),result_02); \
|
||||
vstream(ref()(1)(0),result_10); \
|
||||
vstream(ref()(1)(1),result_11); \
|
||||
vstream(ref()(1)(2),result_12); \
|
||||
vstream(ref()(2)(0),result_20); \
|
||||
vstream(ref()(2)(1),result_21); \
|
||||
vstream(ref()(2)(2),result_22); \
|
||||
vstream(ref()(3)(0),result_30); \
|
||||
vstream(ref()(3)(1),result_31); \
|
||||
vstream(ref()(3)(2),result_32); \
|
||||
std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;\
|
||||
}
|
||||
|
||||
#define HAND_RESULT_EXT(ss) \
|
||||
if (nmu){ \
|
||||
SiteSpinor & ref (out[ss]); \
|
||||
ref()(0)(0)+=result_00; \
|
||||
ref()(0)(1)+=result_01; \
|
||||
ref()(0)(2)+=result_02; \
|
||||
ref()(1)(0)+=result_10; \
|
||||
ref()(1)(1)+=result_11; \
|
||||
ref()(1)(2)+=result_12; \
|
||||
ref()(2)(0)+=result_20; \
|
||||
ref()(2)(1)+=result_21; \
|
||||
ref()(2)(2)+=result_22; \
|
||||
ref()(3)(0)+=result_30; \
|
||||
ref()(3)(1)+=result_31; \
|
||||
ref()(3)(2)+=result_32; \
|
||||
std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \
|
||||
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||
std::cout << "result_32 -- " << result_32 << std::endl;\
|
||||
}
|
||||
|
||||
|
||||
#define HAND_DECLARATIONS(a) \
|
||||
Simd result_00; \
|
||||
Simd result_01; \
|
||||
Simd result_02; \
|
||||
Simd result_10; \
|
||||
Simd result_11; \
|
||||
Simd result_12; \
|
||||
Simd result_20; \
|
||||
Simd result_21; \
|
||||
Simd result_22; \
|
||||
Simd result_30; \
|
||||
Simd result_31; \
|
||||
Simd result_32; \
|
||||
Simd Chi_00; \
|
||||
Simd Chi_01; \
|
||||
Simd Chi_02; \
|
||||
Simd Chi_10; \
|
||||
Simd Chi_11; \
|
||||
Simd Chi_12; \
|
||||
Simd UChi_00; \
|
||||
Simd UChi_01; \
|
||||
Simd UChi_02; \
|
||||
Simd UChi_10; \
|
||||
Simd UChi_11; \
|
||||
Simd UChi_12; \
|
||||
Simd U_00; \
|
||||
Simd U_10; \
|
||||
Simd U_20; \
|
||||
Simd U_01; \
|
||||
Simd U_11; \
|
||||
Simd U_21;\
|
||||
Simd debugreg;\
|
||||
svbool_t pg1; \
|
||||
pg1 = svptrue_b64(); \
|
||||
|
||||
#define ZERO_RESULT \
|
||||
result_00=Zero(); \
|
||||
result_01=Zero(); \
|
||||
result_02=Zero(); \
|
||||
result_10=Zero(); \
|
||||
result_11=Zero(); \
|
||||
result_12=Zero(); \
|
||||
result_20=Zero(); \
|
||||
result_21=Zero(); \
|
||||
result_22=Zero(); \
|
||||
result_30=Zero(); \
|
||||
result_31=Zero(); \
|
||||
result_32=Zero();
|
||||
|
||||
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
||||
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
|
||||
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset, ptype;
|
||||
StencilEntry *SE;
|
||||
int nmu=0;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||
HAND_RESULT_EXT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset, ptype;
|
||||
int nmu=0;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||
HAND_RESULT_EXT(ss);
|
||||
}
|
||||
|
||||
////////////// Wilson ; uses this implementation /////////////////////
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
#undef LOAD_CHIMU
|
||||
#undef LOAD_CHI
|
||||
#undef MULT_2SPIN
|
||||
#undef PERMUTE_DIR
|
||||
#undef XP_PROJ
|
||||
#undef YP_PROJ
|
||||
#undef ZP_PROJ
|
||||
#undef TP_PROJ
|
||||
#undef XM_PROJ
|
||||
#undef YM_PROJ
|
||||
#undef ZM_PROJ
|
||||
#undef TM_PROJ
|
||||
#undef XP_RECON
|
||||
#undef XP_RECON_ACCUM
|
||||
#undef XM_RECON
|
||||
#undef XM_RECON_ACCUM
|
||||
#undef YP_RECON_ACCUM
|
||||
#undef YM_RECON_ACCUM
|
||||
#undef ZP_RECON_ACCUM
|
||||
#undef ZM_RECON_ACCUM
|
||||
#undef TP_RECON_ACCUM
|
||||
#undef TM_RECON_ACCUM
|
||||
#undef ZERO_RESULT
|
||||
#undef Chimu_00
|
||||
#undef Chimu_01
|
||||
#undef Chimu_02
|
||||
#undef Chimu_10
|
||||
#undef Chimu_11
|
||||
#undef Chimu_12
|
||||
#undef Chimu_20
|
||||
#undef Chimu_21
|
||||
#undef Chimu_22
|
||||
#undef Chimu_30
|
||||
#undef Chimu_31
|
||||
#undef Chimu_32
|
||||
#undef HAND_STENCIL_LEG
|
||||
#undef HAND_STENCIL_LEG_INT
|
||||
#undef HAND_STENCIL_LEG_EXT
|
||||
#undef HAND_RESULT
|
||||
#undef HAND_RESULT_INT
|
||||
#undef HAND_RESULT_EXT
|
@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Generic implementation; move to different file?
|
||||
////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef GRID_SIMT
|
||||
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
||||
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
||||
uint4 * chip_pun = (uint4 *)&chip;
|
||||
@ -51,6 +52,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
synchronise(); \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
synchronise(); \
|
||||
acceleratorSynchronise(); \
|
||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi); \
|
||||
} \
|
||||
synchronise();
|
||||
acceleratorSynchronise();
|
||||
|
||||
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
Recon(result, Uchi); \
|
||||
nmu++; \
|
||||
} \
|
||||
synchronise();
|
||||
acceleratorSynchronise();
|
||||
|
||||
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
|
||||
if (SE->_is_local ) { \
|
||||
@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
synchronise(); \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
@ -112,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// All legs kernels ; comms then compute
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -126,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
|
||||
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
|
||||
@ -138,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
||||
coalescedWrite(out[sF],result,lane);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -153,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
||||
int ptype;
|
||||
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
|
||||
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
|
||||
@ -167,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Interior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -181,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
|
||||
@ -195,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
||||
coalescedWrite(out[sF], result,lane);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -203,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
||||
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
|
||||
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
|
||||
calcHalfSpinor chi;
|
||||
// calcHalfSpinor *chi_p;
|
||||
@ -225,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Exterior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -239,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
|
||||
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
|
||||
@ -256,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -270,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
|
||||
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
|
||||
@ -288,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
||||
};
|
||||
|
||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||
template <class Impl> \
|
||||
template <class Impl> accelerator_inline \
|
||||
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||
{ \
|
||||
@ -300,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
||||
StencilEntry *SE; \
|
||||
int ptype; \
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd(); \
|
||||
const int lane=SIMTlane(Nsimd); \
|
||||
const int lane=acceleratorSIMTlane(Nsimd); \
|
||||
\
|
||||
SE = st.GetEntry(ptype, dir, sF); \
|
||||
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
|
||||
@ -316,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
|
||||
DhopDirMacro(Zm,spProjZm,spReconZm);
|
||||
DhopDirMacro(Tm,spProjTm,spReconTm);
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||
{
|
||||
@ -328,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=SIMTlane(Nsimd);
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
|
||||
SE = st.GetEntry(ptype, dir, sF);
|
||||
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
|
||||
@ -346,30 +348,30 @@ template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
||||
int Nsite, const FermionField &in, std::vector<FermionField> &out)
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto st_v = st.View();
|
||||
autoView(U_v ,U,AcceleratorRead);
|
||||
autoView(in_v ,in,AcceleratorRead);
|
||||
autoView(st_v ,st,AcceleratorRead);
|
||||
|
||||
auto out_Xm = out[0].View();
|
||||
auto out_Ym = out[1].View();
|
||||
auto out_Zm = out[2].View();
|
||||
auto out_Tm = out[3].View();
|
||||
auto out_Xp = out[4].View();
|
||||
auto out_Yp = out[5].View();
|
||||
auto out_Zp = out[6].View();
|
||||
auto out_Tp = out[7].View();
|
||||
|
||||
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
|
||||
autoView(out_Xm,out[0],AcceleratorWrite);
|
||||
autoView(out_Ym,out[1],AcceleratorWrite);
|
||||
autoView(out_Zm,out[2],AcceleratorWrite);
|
||||
autoView(out_Tm,out[3],AcceleratorWrite);
|
||||
autoView(out_Xp,out[4],AcceleratorWrite);
|
||||
autoView(out_Yp,out[5],AcceleratorWrite);
|
||||
autoView(out_Zp,out[6],AcceleratorWrite);
|
||||
autoView(out_Tp,out[7],AcceleratorWrite);
|
||||
auto CBp=st.CommBuf();
|
||||
accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
|
||||
int sU=sss/Ls;
|
||||
int sF =sss;
|
||||
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
|
||||
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
|
||||
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
|
||||
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
|
||||
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
|
||||
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
|
||||
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
|
||||
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
|
||||
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
|
||||
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
|
||||
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
|
||||
DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
|
||||
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
|
||||
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
|
||||
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
|
||||
DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
|
||||
});
|
||||
}
|
||||
|
||||
@ -381,17 +383,18 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
assert(dirdisp<=7);
|
||||
assert(dirdisp>=0);
|
||||
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
autoView(U_v ,U ,AcceleratorRead);
|
||||
autoView(in_v ,in ,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v ,st ,AcceleratorRead);
|
||||
auto CBp=st.CommBuf();
|
||||
#define LoopBody(Dir) \
|
||||
case Dir : \
|
||||
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
|
||||
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
|
||||
for(int s=0;s<Ls;s++){ \
|
||||
int sU=ss; \
|
||||
int sF = s+Ls*sU; \
|
||||
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
|
||||
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
|
||||
} \
|
||||
}); \
|
||||
break;
|
||||
@ -435,26 +438,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
autoView(U_v , U,AcceleratorRead);
|
||||
autoView(in_v , in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v , st,AcceleratorRead);
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||
#endif
|
||||
@ -466,26 +469,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
autoView(U_v ,U,AcceleratorRead);
|
||||
autoView(in_v ,in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v ,st,AcceleratorRead);
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
||||
#ifndef GRID_NVCC
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
||||
#endif
|
||||
@ -493,5 +496,8 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
#undef KERNEL_CALLNB
|
||||
#undef KERNEL_CALL
|
||||
#undef ASM_CALL
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,36 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1,37 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class NaiveStaggeredFermion<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1 @@
|
||||
../NaiveStaggeredFermionInstantiation.cc.master
|
@ -0,0 +1 @@
|
||||
../NaiveStaggeredFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
Copyright (C) 2015, 2020
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@ -34,9 +35,13 @@ directory
|
||||
|
||||
#ifndef AVX512
|
||||
#ifndef QPX
|
||||
#ifndef A64FX
|
||||
#ifndef A64FXFIXEDSIZE
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
@ -44,4 +49,3 @@ NAMESPACE_BEGIN(Grid);
|
||||
template class WilsonKernels<IMPLEMENTATION>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user