1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 07:17:06 +01:00

Compare commits

..

114 Commits

Author SHA1 Message Date
d8c0c0ba0a Fix and compiles 2020-08-12 14:35:08 -04:00
c6cf918d4c Typo 2020-08-12 14:24:39 -04:00
6d0a907c5c first try at A2A four quark offload 2020-08-12 14:17:46 -04:00
3276aa67dc Update 2020-08-12 14:15:53 -04:00
7cf7f11e1a Doc recompile 2020-07-22 14:44:11 -04:00
f1f655d92b Merge pull request #304 from Heinrich-BR/develop
ScalarImpl.h updates
2020-07-06 10:16:03 +01:00
43334e88c3 Tiny change in a comment for clarity 2020-07-04 16:11:16 +01:00
4f1e66b044 Fixed HMC SU(N) integrator which was causing fields to leave Lie Algebra manifold for N>2 2020-07-04 03:53:06 +01:00
64fe5b21b4 Merge pull request #298 from rrhodgson/feature/baryon
Update baryon 2pt and add 3pt function
2020-06-29 18:45:00 +01:00
ee9889821d Runs through to coarse space solve 2020-06-29 12:59:52 -04:00
eb470aa6dc Update to baryon and added comments/fix whitespace 2020-06-29 09:43:01 +01:00
77af9a3ddc Baryon revert sign 2020-06-26 10:08:42 +01:00
102089798c BaryonUtils: update to autoView 2020-06-25 16:41:58 +01:00
39cea8b5a7 Merge branch 'develop' into feature/baryon 2020-06-25 16:24:07 +01:00
a65f66d2db Merge branch 'feature/baryon3pt' into feature/baryon 2020-06-25 16:20:59 +01:00
936c5ecf69 Reduction GPU no compile fix 2020-06-24 17:28:31 -04:00
22cfbdbbb3 Boost precision in inner products in single 2020-06-24 12:52:31 -04:00
093d1ee21b Force initial values 2020-06-24 08:54:49 -04:00
d6ba2581ce Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-06-24 08:25:08 -04:00
577c064184 Memory manager initialise earlier 2020-06-24 08:24:38 -04:00
2ff1fa6fad UVM used shared for CPU alloccations andd ddont migrate 2020-06-23 22:14:56 -04:00
70be1bd8be Adding code under development 2020-06-23 10:24:21 -04:00
4ef50ba31f Baryon speedup 2020-06-23 11:44:20 +01:00
3e97a26f90 BaryonGamm3pt threads -> accelerator 2020-06-23 11:35:32 +01:00
599f28f6ef Baryon bug fixes 2020-06-23 11:10:26 +01:00
c48da35921 Memory Vector UVM and Lattice alignedAllocator separate 2020-06-22 20:21:53 -04:00
6c5fa8dcd8 Aligned allocate on CPU put through this interface 2020-06-20 14:34:29 -04:00
0d2f913a1a String.h for linux 2020-06-20 09:37:31 -04:00
1a74816c25 Hopeefully fixed 2020-06-19 17:50:52 -04:00
73de335256 Merge branch 'develop' into sycl 2020-06-19 17:44:16 -04:00
228fd450ce Typo fix (excusee - my keyboard is starting to break) 2020-06-19 17:36:05 -04:00
b949cf6b12 PeekLocal needs a view to keep thread safe.
ALLOCATION_CACHEE reenable
2020-06-19 17:13:27 -04:00
11bc1aeadc TThread count defaultt to fastest 2020-06-19 14:30:35 -04:00
66005929af Set up the cache size on all ranks 2020-06-19 12:50:54 -04:00
ff7c847735 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-19 01:22:16 -04:00
1aa988b2af Comms overlap fix UVM case 2020-06-19 01:21:14 -04:00
edf17708a8 Range improvement 2020-06-18 22:41:06 -04:00
f46f029dbb Merge pull request #292 from lehner/feature/gpt-sycl
Catch edge case in SharedMemoryMPI::GetShmDims; Change default units …
2020-06-14 13:43:27 -04:00
3dccd7aa2c Catch edge case in SharedMemoryMPI::GetShmDims; Change default units to consistent MB in init args; Want last element not past last element in MemoryManagerCache.cc 2020-06-14 13:26:01 -04:00
65e6e7da6f Merge pull request #291 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-06-12 20:42:32 -04:00
b5e87e8d97 summit compile fixes 2020-06-12 18:16:12 -04:00
5f5807d60a cleanup 2020-06-12 14:48:23 -04:00
7974acff54 merged sycl to feature-gpt 2020-06-12 06:49:38 -04:00
f0d17d2b49 Added Baryon3pt code 2020-06-12 11:35:52 +01:00
244c003a1b Updated Baryon code 2020-06-12 11:00:25 +01:00
0174f5f742 look for librt when using shm=shmopen 2020-06-11 16:50:43 +01:00
32b2b59be4 Offload 2020-06-10 20:36:26 -04:00
86bb0cc24b Keep on GPU 2020-06-10 20:00:00 -04:00
84c19587e7 Offload 2020-06-10 19:59:31 -04:00
237ce92540 Offload loops 2020-06-10 19:59:11 -04:00
a7ffc61e82 acceleratorSIMTlane() 2020-06-10 19:58:33 -04:00
fd97f64612 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-10 12:58:13 -04:00
8720aecb80 Offload more loops 2020-06-10 12:57:55 -04:00
cdf0a04fc5 Merge branch 'develop' into sycl 2020-06-09 04:00:12 -04:00
616d3dd737 CCommpile updates 2020-06-08 18:57:41 -04:00
8b066baca8 Implement transient mechanism 2020-06-08 18:28:53 -04:00
e97f3688db Fix the HMC issue - kernel was launchnig asynchronously 2020-06-08 17:01:15 -04:00
89a1e78390 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 23:20:37 -04:00
ffbb3fc02c Merge pull request #287 from felixerben/baryon-cleaner
slightly cleaner baryon 2pt code
2020-06-05 22:54:52 -04:00
5a73ef3647 Minor tweak to compile 2020-06-05 21:50:15 -04:00
87e5d2f4b7 Merge branch 'sycl' of https://www.github.com/paboyle/Grid into sycl 2020-06-05 17:32:21 -07:00
d720f10758 Liink error fix 2020-06-05 17:29:20 -07:00
14fcd0912a Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 19:14:17 -04:00
3111c0bd4f Single precisiono hardwire 2020-06-05 19:13:27 -04:00
e03064490e Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 18:53:39 -04:00
1a4c8c3387 Global edit with change to View usage. autoView() creates a wrapper object that closes the view when scope closes. 2020-06-05 18:52:35 -04:00
2b1e259441 Decode of SYCL devices fix 2020-06-04 17:16:55 -07:00
f39c2a240b Priintinig and device memory size detection 2020-06-04 14:58:03 -04:00
0d95805cde Print improvement 2020-06-03 22:50:32 -04:00
f67830587f Accelerator loop use 2020-06-03 22:50:09 -04:00
6bf7f839ff Better printing and logging 2020-06-03 09:28:57 -04:00
e3147881a9 Cache scheme 2020-06-03 09:23:48 -04:00
fb559614ad Initialise meemory manager 2020-06-03 09:12:47 -04:00
e93e12b6a4 More verbose SYCL setup 2020-06-03 09:12:11 -04:00
0c3112cd94 Use view mechanism 2020-06-03 09:11:51 -04:00
8cfd5d2639 Need lattice view 2020-06-03 09:11:28 -04:00
1c9f20b15e Views must be closed 2020-06-03 09:10:29 -04:00
32237895bd Reorg memory manager for O(1) hash table 2020-06-03 09:09:52 -04:00
9fcb47ee63 Explicit error message instead of infinite loop in GlobalSharedMemory::GetShmDims 2020-06-02 07:44:38 -04:00
1d252d0922 Accelerator inline 2020-05-28 11:45:25 -04:00
006cc8a8f1 Staggereed move to accelerator 2020-05-28 08:33:06 -04:00
cf2938688a Sycl unhappy fix 2020-05-25 08:36:53 -07:00
ee63721bad int unhappiness sycl fix 2020-05-25 08:36:24 -07:00
22c5168d70 Sycl happier 2020-05-25 08:35:56 -07:00
949ac3cd24 Must avoid non-trivial copy constructors 2020-05-25 08:35:28 -07:00
7bc0166c1c SYCLL maknig happy - must avoid non ttrivial copy constructors 2020-05-25 08:34:19 -07:00
cb0d1b3399 hopefullly fix buildd fail 2020-05-24 21:27:00 -04:00
d1f1ccc705 HIP changes 2020-05-24 21:18:49 -04:00
c7519a237a Assertions fail on HIP foor unknown reasons - dedbugging 2020-05-24 14:02:47 -04:00
32be2b13d3 Updates for HiP 2020-05-24 14:00:55 -04:00
92b342a477 Hip reduction too 2020-05-24 13:50:28 -04:00
556da86ac3 HIP fp16 2020-05-24 13:41:58 -04:00
8285e41574 View location / access mode 2020-05-21 16:14:41 -04:00
f999408e92 View locatoin and access mode 2020-05-21 16:14:20 -04:00
a7abda89e2 View location & access mode 2020-05-21 16:13:59 -04:00
7860a50f70 Make view specify where and drive data motion - first cut.
This is a compile tiime option --enable-unified=yes/no
2020-05-21 16:13:16 -04:00
6c6812a5ca GB/s output 2020-05-20 12:26:57 +01:00
8358ee38c4 pull develop 2020-05-19 08:56:18 -04:00
1f154fe652 some cleanup in BaryonUtils 2020-05-19 13:48:56 +01:00
d708c0258d some cleanup in BaryonUtils 2020-05-19 13:48:00 +01:00
a7635fd5ba summit mem 2020-05-18 17:52:26 -04:00
ebb60330c9 Automatic data motion options beginning 2020-05-17 16:34:25 -04:00
32fbdf4fb1 Merge pull request #5 from paboyle/develop
Sync upstream
2020-05-13 09:02:56 +02:00
a9847aa866 Dependence fix 2020-05-12 20:03:37 -04:00
d24d8e8398 Use X-direction as more bits meaningful on CUDA.
2^31-1 shoulddd always bee enough for SIMD and thread reduced local volume

e.g. 32*2^31 = 2^36 = (2^9)^4 or 512^4 ias big enough.

Where 32 is gpu_threads * Nsimd = 8*4
2020-05-12 10:35:49 -04:00
162e4bb567 no automatic prefetching for now 2020-05-12 07:01:23 -04:00
07c0c02f8c Speed up Cshift 2020-05-11 17:02:01 -04:00
8c31c065b5 Keep the Vector fixed to protect it from realloc 2020-05-11 17:00:30 -04:00
b1c86900b2 Merge pull request #4 from paboyle/develop
merge
2020-05-11 20:59:29 +02:00
bbbee5660d First compiile on HiP 2020-05-10 05:28:09 -04:00
52081acfa5 NVCC compile fixes 2020-05-08 13:14:12 -04:00
f8b8e00090 Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of cuda and other code variations floating around all over the place.

Will move GpuInit iinto Accelerator.cc from Init.cc
Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
2020-05-08 06:23:55 -07:00
28a1fcaaff First compile against SYCL 2020-05-05 11:13:27 -07:00
2c22db841a Added momentum scaling to scalar HMC theories in order to follow UKQCD/CPS conventions 2020-04-02 17:38:47 +01:00
143 changed files with 6756 additions and 2790 deletions

View File

@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/threads/Threads.h>
#include <Grid/threads/ThreadReduction.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h>

View File

@ -6,6 +6,7 @@
///////////////////
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <array>
#include <string>

View File

@ -18,19 +18,20 @@
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
/* SYCL save and restore compile environment*/
#ifdef __SYCL_DEVICE_ONLY__
#ifdef GRID_SYCL
#pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#undef EIGEN_USE_SYCL
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif
@ -41,7 +42,7 @@
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("GRID_SIMT")
#pragma pop
#endif

View File

@ -21,7 +21,7 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache
all: version-cache Version.h
version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@ -42,7 +42,7 @@ version-cache:
fi;\
rm -f vertmp
Version.h:
Version.h: version-cache
cp version-cache Version.h
.PHONY: version-cache

View File

@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h>
#endif

View File

@ -1,14 +1,3 @@
// blockZaxpy in bockPromote - 3s, 5%
// noncoalesced linalg in Preconditionoer ~ 3s 5%
// Lancos tuning or replace 10-20s ~ 25%, open ended
// setup tuning 5s ~ 8%
// -- e.g. ordermin, orderstep tunables.
// MdagM path without norm in LinOp code. few seconds
// Mdir calc blocking kernels
// Fuse kernels in blockMaskedInnerProduct
// preallocate Vectors in Cayley 5D ~ few percent few seconds
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -91,35 +80,8 @@ public:
}
directions [2*_d]=0;
displacements[2*_d]=0;
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout<<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
}
/*
// Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){
directions[2*d ] = d;
directions[2*d+1] = d;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*dimension]=0;
displacements[2*dimension]=0;
}
std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point];
return delta;
};
*/
};
template<class Fobj,class CComplex,int nbasis>
@ -149,24 +111,6 @@ public:
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
// blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=Zero();
accelerator_for(ss, CoarseGrid->oSites(),1,{
eProj[ss](i)=CComplex(1.0);
});
eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
@ -175,11 +119,6 @@ public:
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
@ -218,7 +157,7 @@ public:
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
#if 1
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
@ -280,10 +219,10 @@ public:
hermop.HermOp(*Tn,y);
auto y_v = y.View();
auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
@ -313,201 +252,6 @@ public:
}
assert(b==nn);
}
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
#define FILTERb(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
RealD alpha=-0.8;
RealD beta =-0.8;
#define FILTER(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
#define FILTERc(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,combined); \
}
double node = 0.000;
FILTERb(lo,hi,orderfilter);// 0
// FILTERc(node,hi,51);// 0
noise = Mn;
int base = 0;
int mult = 100;
FILTER(node,hi,base+1*mult);
FILTER(node,hi,base+2*mult);
FILTER(node,hi,base+3*mult);
FILTER(node,hi,base+4*mult);
FILTER(node,hi,base+5*mult);
FILTER(node,hi,base+6*mult);
FILTER(node,hi,base+7*mult);
FILTER(node,hi,base+8*mult);
FILTER(node,hi,base+9*mult);
FILTER(node,hi,base+10*mult);
FILTER(node,hi,base+11*mult);
FILTER(node,hi,base+12*mult);
FILTER(node,hi,base+13*mult);
FILTER(node,hi,base+14*mult);
FILTER(node,hi,base+15*mult);
assert(b==nn);
}
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
JacobiPoly(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
}
#define FILTER(lambda) \
{ \
hermop.HermOp(subspace[0],tmp); \
tmp = tmp - lambda *subspace[0]; \
scale = std::pow(norm2(tmp),-0.5); \
tmp=tmp*scale; \
subspace[b] = tmp; \
hermop.Op(subspace[b],tmp); \
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
FILTER(2.0e-5);
FILTER(2.0e-4);
FILTER(4.0e-4);
FILTER(8.0e-4);
FILTER(8.0e-4);
FILTER(2.0e-3);
FILTER(3.0e-3);
FILTER(4.0e-3);
FILTER(5.0e-3);
FILTER(6.0e-3);
FILTER(2.5e-3);
FILTER(3.5e-3);
FILTER(4.5e-3);
FILTER(5.5e-3);
FILTER(6.5e-3);
// FILTER(6.0e-5);//6
// FILTER(7.0e-5);//8
// FILTER(8.0e-5);//9
// FILTER(9.0e-5);//3
/*
// FILTER(1.0e-4);//10
FILTER(2.0e-4);//11
// FILTER(3.0e-4);//12
// FILTER(4.0e-4);//13
FILTER(5.0e-4);//14
FILTER(6.0e-3);//4
FILTER(7.0e-4);//1
FILTER(8.0e-4);//7
FILTER(9.0e-4);//15
FILTER(1.0e-3);//2
FILTER(2.0e-3);//2
FILTER(3.0e-3);//2
FILTER(4.0e-3);//2
FILTER(5.0e-3);//2
FILTER(6.0e-3);//2
FILTER(7.0e-3);//2
FILTER(8.0e-3);//2
FILTER(1.0e-2);//2
*/
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
assert(b==nn);
}
#endif
};
@ -549,13 +293,13 @@ public:
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
auto in_v = in.View();
auto out_v = out.View();
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -572,24 +316,25 @@ public:
int ptype;
StencilEntry *SE;
int lane=SIMTlane(Nsimd);
for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
}
synchronise();
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res,lane);
});
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
};
void Mdag (const CoarseVector &in, CoarseVector &out)
@ -617,11 +362,11 @@ public:
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
auto out_v = out.View();
auto in_v = in.View();
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
@ -635,45 +380,21 @@ public:
int ptype;
StencilEntry *SE;
int lane=SIMTlane(Nsimd);
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
}
synchronise();
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res,lane);
coalescedWrite(out_v[ss](b),res);
});
#if 0
accelerator_for(ss,Grid()->oSites(),1,{
siteVector res = Zero();
siteVector nbr;
int ptype;
StencilEntry *SE;
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in_v[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
synchronise();
res = res + Aview_p[point][ss]*nbr;
out_v[ss]=res;
});
#endif
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
@ -841,10 +562,10 @@ public:
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
auto iZProj_v = iZProj.View() ;
auto oZProj_v = oZProj.View() ;
auto A_p = A[p].View();
auto A_self = A[self_stencil].View();
autoView( iZProj_v , iZProj, AcceleratorRead) ;
autoView( oZProj_v , oZProj, AcceleratorRead) ;
autoView( A_p , A[p], AcceleratorWrite);
autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
@ -860,11 +581,11 @@ public:
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
{
auto tmp_ = tmp.View();
auto evenmask_ = evenmask.View();
auto oddmask_ = oddmask.View();
auto Mphie_ = Mphie.View();
auto Mphio_ = Mphio.View();
autoView( tmp_ , tmp, AcceleratorWrite);
autoView( evenmask_ , evenmask, AcceleratorRead);
autoView( oddmask_ , oddmask, AcceleratorRead);
autoView( Mphie_ , Mphie, AcceleratorRead);
autoView( Mphio_ , Mphio, AcceleratorRead);
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
});
@ -872,8 +593,8 @@ public:
blockProject(SelfProj,tmp,Subspace.subspace);
auto SelfProj_ = SelfProj.View();
auto A_self = A[self_stencil].View();
autoView( SelfProj_ , SelfProj, AcceleratorRead);
autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
for(int j=0;j<nbasis;j++){
@ -887,33 +608,8 @@ public:
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian();
}
// AssertHermitian();
// ForceDiagonal();
}
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl;
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
void ForceHermitian(void) {
CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){
@ -933,27 +629,6 @@ public:
}
}
}
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d];
Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@ -1,4 +1,3 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
#endif
NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { };
@ -191,7 +189,7 @@ public:
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View();
autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -232,15 +230,18 @@ public:
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
thread_for(idx, sgrid->lSites(),{
{
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
});
pokeLocalSite(s,p_v,cbuf);
});
}
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
@ -269,15 +270,19 @@ public:
flops+= flops_call*NN;
// writing out result
thread_for(idx,sgrid->lSites(),{
{
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
});
peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result_v,clbuf);
});
}
result = result*div;
// destroying plan

View File

@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
LinearCombTimer.Start();
bo = beta * omega;
auto p_v = p.View();
auto r_v = r.View();
auto v_v = v.View();
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
{
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
alpha = rho / Calpha.real();
LinearCombTimer.Start();
auto h_v = h.View();
auto psi_v = psi.View();
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
auto s_v = s.View();
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
{
autoView( p_v , p, AcceleratorRead);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
@ -166,11 +172,17 @@ class BiCGSTAB : public OperatorFunction<Field>
omega = Comega.real() / norm2(t);
LinearCombTimer.Start();
auto t_v = t.View();
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
{
autoView( psi_v,psi, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
autoView( h_v , h, AcceleratorRead);
autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
}
LinearCombTimer.Stop();
cp = norm2(r);

View File

@ -140,13 +140,15 @@ public:
b = cp / c;
LinearCombTimer.Start();
auto psi_v = psi.View();
auto p_v = p.View();
auto r_v = r.View();
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
{
autoView( psi_v , psi, AcceleratorWrite);
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();

View File

@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
RealD zAAz;
ComplexD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
#ifdef GRID_CUDA
int PointerCache::Ncache = 32;
#else
int PointerCache::Ncache = 8;
#endif
int PointerCache::Victim;
int PointerCache::VictimSmall;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
void PointerCache::Init(void)
{
char * str;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) Ncache = atoi(str);
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) NcacheSmall = atoi(str);
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
}
void *PointerCache::Insert(void *ptr,size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
return Insert(ptr,bytes,Entries,Ncache,Victim);
}
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Lookup(bytes,EntriesSmall,NcacheSmall);
return Lookup(bytes,Entries,Ncache);
}
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__

View File

@ -26,129 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
#pragma once
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
/* Could make these configurable, perhaps up to a max size*/
static const int NcacheSmallMax=128;
static const int NcacheMax=16;
static int NcacheSmall;
static int Ncache;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[NcacheMax];
static int Victim;
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
static int VictimSmall;
public:
static void Init(void);
static void *Insert(void *ptr,size_t bytes) ;
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes) ;
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#ifdef GRID_NVCC
#define profilerCudaMeminfo \
{ size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
#else
#define profilerCudaMeminfo
#endif
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
} \
profilerCudaMeminfo;
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
@ -172,70 +53,60 @@ public:
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n) {
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::CpuFree((void *)__p,bytes);
}
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
// FIXME: hack for the copy constructor, eventually it must be avoided
//void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
template<typename _Tp>
class uvmAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::SharedFree((void *)__p,bytes);
}
// FIXME: hack for the copy constructor, eventually it must be avoided
@ -244,17 +115,17 @@ public:
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
template<class T> using commAllocator = uvmAllocator<T>;
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
//template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@ -0,0 +1,244 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
}
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
}
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
}
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
}
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
}
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
}
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_host-=bytes;
}
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
}
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
total_host-=bytes;
}
}
#endif
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,181 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void PrintBytes(void);
public:
static void Init(void);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes;
static uint64_t DeviceLRUBytes;
static uint64_t DeviceMaxBytes;
static uint64_t HostToDeviceBytes;
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
typedef struct {
int LRU_valid;
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void Print(void);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,468 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
auto AccCache = EntryLookup(CpuPtr);
AccViewTable.erase(CpuPtr);
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
} else {
LRU.push_front(AccCache.CpuPtr);
AccCache.LRU_entry = LRU.begin();
}
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
// dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
// dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
// dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
// dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
// dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
// dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
AccCache.state=Consistent;
}
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
}
}
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
LRUinsert(AccCache);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
AccCache.cpuLock--;
}
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr dont care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
{
// Look up in ViewCache
uint64_t ptr = (uint64_t)_ptr;
if(EntryPresent(ptr)) {
auto e = EntryLookup(ptr);
AccDiscard(e->second);
}
}
void MemoryManager::Print(void)
{
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
return AccCache.cpuLock+AccCache.accLock;
} else {
return 0;
}
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,24 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@ -81,6 +81,7 @@ public:
bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
public:

View File

@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
public:
int dummy;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
@ -104,6 +105,7 @@ public:
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
@ -114,6 +116,8 @@ public:
for (int d = 0; d < _ndimension; d++)
{
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];

View File

@ -36,11 +36,27 @@ static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);
Lexicographic::CoorFromIndex(coor,oindex,rdim);
int linear=0;
for(int d=0;d<nd;d++){
if(chk_dim_msk[d])
linear=linear+coor[d];
}
return (linear&0x1);
}
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
Coordinate _checker_dim_mask;
// Coordinate _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <pwd.h>
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
for(int p=0;p<primes.size();p++) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
@ -413,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
#ifdef GRID_CUDA
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@ -433,13 +440,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank);
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -677,7 +677,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_NVCC
#ifdef GRID_CUDA
cudaMemset(dest,0,bytes);
#else
bzero(dest,bytes);
@ -685,7 +685,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
#ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else
bcopy(src,dest,bytes);

View File

@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0;
static Vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
}
}
} else {
@ -65,14 +67,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int o = n*stride;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
}
}
}
}
thread_for(i,ent,{
buffer[table[i].first]=rhs_v[table[i].second];
});
{
autoView(rhs_v , rhs, AcceleratorRead);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
}
}
///////////////////////////////////////////////////////////////////
@ -95,36 +102,38 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
});
} else {
autoView(rhs_v , rhs, AcceleratorRead);
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl;
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
accelerator_for2d(n,e1,b,e2,1,{
Coordinate coor;
int o=n*n1;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
});
});
}
}
@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent =0;
if ( cbmask ==0x3 ) {
@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
}
}
@ -165,16 +175,20 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++);
Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
}
}
}
}
auto rhs_v = rhs.View();
thread_for(i,ent,{
rhs_v[table[i].first]=buffer[table[i].second];
});
{
autoView( rhs_v, rhs, AcceleratorWrite);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
}
}
//////////////////////////////////////////////////////
@ -194,21 +208,19 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) {
auto rhs_v = rhs.View();
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for2d(n,e1,b,e2,1,{
int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs_v[so+o+b],pointers,offset);
}
});
});
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View();
autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension];
@ -225,6 +237,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@ -239,14 +252,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0;
if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
table[ent++] = std::pair<int,int>(lo+o,ro+o);
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
} else {
@ -255,23 +270,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o);
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
}
}
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
{
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
}
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@ -285,29 +301,33 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0;
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
{
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
}
}
//////////////////////////////////////////////////////

View File

@ -0,0 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

View File

@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>

View File

@ -92,12 +92,18 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{
return arg[ss];
}
// What needs this?
// Cannot be legal on accelerator
// Comparison must convert
#if 1
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.AcceleratorView(ViewRead);
auto view = arg.View(AcceleratorRead);
return view[ss];
}
#endif
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand
@ -180,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
cb = lat.Checkerboard();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{
}
inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
@ -204,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
CBFromExpression(cb, expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewOpen
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{
lat.ViewOpen(AcceleratorRead);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
ExpressionViewOpen(expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewClose
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{
lat.ViewClose();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
ExpressionViewClose(expr.arg3); // recurse AST
}
////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
auto rhs_v = rhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
conformable(ret,rhs);
conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@ -56,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
auto rhs_v = rhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
auto rhs_v = rhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
auto rhs_v = rhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs);
@ -121,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -179,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto x_v = x.AcceleratorView(ViewRead);
auto y_v = y.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp);
@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.AcceleratorView(ViewWrite);
auto x_v = x.AcceleratorView(ViewRead);
auto y_v = y.AcceleratorView(ViewRead);
autoView( ret_v , ret, AcceleratorWrite);
autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp);

View File

@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#define STREAMING_STORES
@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum LatticeAcceleratorAdvise {
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
ViewRead = 0x1,
ViewWrite = 0x2,
ViewReadWrite = 0x3
};
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
accelerator_inline void Advise(int advise) {
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
if (advise & AdviseInfrequentUse) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
}
if (advise & AdviseReadMostly) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
}
#endif
#endif
};
accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
int target;
cudaGetDevice(&target);
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
#endif
#endif
};
accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
#endif
#endif
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
@ -253,28 +80,23 @@ private:
}
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
{ // and HostView for thread_for
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const
LatticeView<vobj> View (ViewMode mode) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
accessor.AcceleratorPrefetch(mode);
return accessor;
}
LatticeView<vobj> HostView(int mode = ViewReadWrite) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
accessor.HostPrefetch(mode);
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor;
}
@ -298,11 +120,15 @@ public:
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite);
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp);
});
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this;
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@ -317,11 +143,15 @@ public:
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite);
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp);
});
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this;
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@ -335,11 +165,15 @@ public:
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite);
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp);
});
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this;
}
//GridFromExpression is tricky to do
@ -390,10 +224,11 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View();
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss] = r;
me[ss]= r;
});
me.ViewClose();
return *this;
}
@ -403,11 +238,12 @@ public:
///////////////////////////////////////////
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid) {
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
SetViewMode(mode);
}
// virtual ~Lattice(void) = default;
@ -445,11 +281,12 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = AcceleratorView(ViewWrite);
auto him= r.AcceleratorView(ViewRead);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
me.ViewClose(); him.ViewClose();
return *this;
}
@ -459,11 +296,12 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = AcceleratorView(ViewWrite);
auto him= r.AcceleratorView(ViewRead);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
me.ViewClose(); him.ViewClose();
return *this;
}
///////////////////////////////////////////

View File

@ -51,34 +51,18 @@ template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size());
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#ifndef GRID_NVCC
thread_region
{
std::vector < vobj > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
return;
@ -86,6 +70,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
@ -96,7 +82,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
});
});
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@ -132,27 +118,30 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].AcceleratorView()) View;
typedef decltype(basis[0].View(AcceleratorRead)) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.AcceleratorView(ViewWrite);
Vector<View> basis_v(basis.size(),result_v);
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewRead);
basis_v.push_back(basis[k].View(AcceleratorRead));
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
@ -160,6 +149,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>

View File

@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
autoView( lhs_v, lhs, CpuRead);
autoView( ret_v, ret, CpuWrite);
thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs);
});
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
autoView( rhs_v, rhs, CpuRead);
autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]);
});

View File

@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid();
int Nsimd = grid->iSites();
auto l_v = l.View();
autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), {
vector_type vI;
Coordinate gcoor;
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
});
};
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid);

View File

@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
autoView( rhs_v , rhs, AcceleratorRead);
autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
});
@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
});
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop

View File

@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto Y_v = Y.View();
auto R_v = R.View();
autoView( X_v , X, CpuRead);
autoView( Y_v , Y, CpuRead);
autoView( R_v , R, CpuWrite);
thread_region
{
std::vector<vobj> s_x(Nblock);
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto R_v = R.View();
autoView( X_v , X, CpuRead);
autoView( R_v , R, CpuWrite);
thread_region
{
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
autoView( lhs_v , lhs, CpuRead);
autoView( rhs_v , rhs, CpuRead);
thread_region {
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);

View File

@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
autoView( ret_v, ret, AcceleratorWrite);
autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
});
return ret;
@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
autoView( ret_v, ret, AcceleratorWrite);
autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
});
return ret;
@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
});
}
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
});
}
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
autoView( l_v , l, CpuWrite);
if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf);
buf[idx] = s;
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
autoView( l_v , l, CpuWrite);
extract(l_v[odx],buf);
s = buf[idx];
@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
// Must be CPU read view
template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
GridBase *grid = l.Grid();
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.getGrid();
assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
@ -183,18 +182,19 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
return;
};
// Must be CPU write view
template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
GridBase *grid=l.Grid();
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@ -202,13 +202,11 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
}
return;
};

View File

@ -40,9 +40,11 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
});
@ -51,9 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
});

View File

@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd();
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
ssum = ssum+sumarray[i];
}
return ssum;
typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
}
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifdef GRID_NVCC
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
auto ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
@ -102,42 +151,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
// Might make all code paths go this way.
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
}
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
auto anrm = sum(inner_tmp_v,sites);
nrm = anrm;
return nrm;
}
@ -175,40 +211,24 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid();
auto x_v=x.AcceleratorView(ViewRead);
auto y_v=y.AcceleratorView(ViewRead);
auto z_v=z.AcceleratorView(ViewWrite);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
accelerator_for( ss, sites, 1,{
auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
});
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm;
}
@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
GridBase *grid = left.Grid();
auto left_v=left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, 1,{
auto left_tmp = left_v[ss];
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
});
}
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
});
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
});
// Already promoted to double
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
#endif
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0];
nrm = real(tmp[1]);
@ -335,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
auto Data_v=Data.View();
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
@ -413,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View();
auto rhv=rhs.View();
autoView( lhv, lhs, CpuRead);
autoView( rhv, rhs, CpuRead);
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av;
auto Rv=R.View();
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){
autoView( Rv, R, CpuWrite);
autoView( Xv, X, CpuRead);
autoView( Yv, Y, CpuRead);
thread_for2d( n, e1, b,e2, {
int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss];
}
});
}
};
@ -581,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View();
auto Y_v=Y.View();
auto R_v=R.View();
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View();
auto X_v = X.View();
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
@ -692,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View();
auto rhs_v=rhs.View();
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);

View File

@ -1,7 +1,13 @@
NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32
#ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props;
#endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0;
template <class Iterator>
@ -19,7 +25,12 @@ template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
acceleratorFence();
if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@ -156,7 +167,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
}
// each thread must read the correct value of amLast
__syncthreads();
acceleratorSynchroniseAll();
if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1]
@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
accelerator_barrier();
auto result = buffer_v[0];
return result;
}

View File

@ -375,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View();
autoView(l_v, l, CpuWrite);
thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
@ -462,7 +462,7 @@ public:
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
int Nthread = 32; // Hardwire a good level or parallelism
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);

View File

@ -42,8 +42,8 @@ template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
autoView(ret_v , ret, AcceleratorWrite);
autoView(lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
});
@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
});

View File

@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{
half.Checkerboard() = cb;
auto half_v = half.View();
auto full_v = full.View();
autoView( half_v, half, CpuWrite);
autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{
int cbos;
Coordinate coor;
@ -64,11 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
}
});
}
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{
int cb = half.Checkerboard();
auto half_v = half.View();
auto full_v = full.View();
autoView( half_v , half, CpuRead);
autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
out = in;
}
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
}
#endif
@ -151,12 +152,11 @@ accelerator_inline void convertType(T & out, const T & in) {
template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
auto out_v = out.AcceleratorView(ViewWrite);
auto in_v = in.AcceleratorView(ViewRead);
autoView( out_v , out,AcceleratorWrite);
autoView( in_v , in ,AcceleratorRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss));
});
});
}
////////////////////////////////////////////////////////////////////////////////////////////
@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
{
auto lhs_v = lhs.AcceleratorView(ViewRead);
auto rhs_v = rhs.AcceleratorView(ViewRead);
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid());
auto ret_v = ret.AcceleratorView(ViewWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
{
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
});
}
return ret;
}
@ -194,14 +195,13 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData;
// auto fineData_ = fineData.View();
auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
auto ip_ = ip.AcceleratorView(ViewReadWrite);
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
@ -210,68 +210,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
}
}
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
typedef iVector<CComplex,nbasis > coarseSiteData;
coarseSiteData elide;
typedef decltype(coalescedRead(elide)) ScalarComplex;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
// checks
assert( nbasis == Basis.size() );
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i],fineData);
}
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
}
int blockVol = fine->oSites()/coarse->oSites();
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
// Otherwise do fine inner product per site, and make the update atomic
////////////////////////////////////////////////////////////////////////////////////////////////////////
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
int sf;
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
for(int sb=0;sb<blockVol;sb++){
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
}
coalescedWrite(coarseData_[sc](i),reduce);
});
return;
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -298,10 +236,10 @@ template<class vobj,class vobj2,class CComplex>
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
auto fineZ_ = fineZ.AcceleratorView(ViewWrite);
auto fineX_ = fineX.AcceleratorView(ViewRead);
auto fineY_ = fineY.AcceleratorView(ViewRead);
auto coarseA_= coarseA.AcceleratorView(ViewRead);
autoView( fineZ_ , fineZ, AcceleratorWrite);
autoView( fineX_ , fineX, AcceleratorRead);
autoView( fineY_ , fineY, AcceleratorRead);
autoView( coarseA_, coarseA, AcceleratorRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@ -314,7 +252,7 @@ template<class vobj,class vobj2,class CComplex>
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx;
#else
@ -344,15 +282,16 @@ template<class vobj,class CComplex>
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse);
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
// Precision promotion
fine_inner = localInnerProductD(fineX,fineY);
fine_inner = localInnerProductD<vobj>(fineX,fineY);
blockSum(coarse_inner,fine_inner);
accelerator_for(ss, coarse->oSites(), 1, {
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
}
@ -370,14 +309,15 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse);
// Precision promotion?
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
{
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
}
}
template<class vobj,class CComplex>
@ -408,8 +348,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
}
int blockVol = fine->oSites()/coarse->oSites();
auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
auto fineData_ = fineData.AcceleratorView(ViewRead);
// Turn this around to loop threaded over sc and interior loop
// over sf would thread better
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( fineData_ , fineData, AcceleratorRead);
accelerator_for(sc,coarse->oSites(),1,{
@ -510,8 +452,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
}
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
autoView( fineData_ , fineData, AcceleratorWrite);
autoView( coarseData_ , coarseData, AcceleratorRead);
// Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{
@ -524,7 +466,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View();
/* auto basis_ = Basis[i], );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
}
@ -543,7 +485,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
fineData=Zero();
for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
auto ip_ = ip.AcceleratorView(ViewRead);
//Lattice<CComplex> cip(coarse);
//autoView( cip_ , cip, AcceleratorWrite);
//autoView( ip_ , ip, AcceleratorRead);
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
// coalescedWrite(cip_[sc], ip_(sc)());
// });
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
blockZAXPY(fineData,ip,Basis[i],fineData);
}
}
@ -571,15 +520,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites());
}
autoView(in_v,in,CpuRead);
autoView(out_v,out,CpuWrite);
thread_for(idx, ig->lSites(),{
sobj s;
ssobj ss;
Coordinate lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor);
peekLocalSite(s,in_v,lcoor);
ss=s;
pokeLocalSite(ss,out,lcoor);
pokeLocalSite(ss,out_v,lcoor);
});
}
@ -614,8 +565,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
auto t_v = To.AcceleratorView(ViewWrite);
auto f_v = From.AcceleratorView(ViewRead);
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
@ -638,8 +590,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
}
// peekLocalSite(s,From,Fcoor);
// pokeLocalSite(s,To ,Tcoor);
}
});
}
@ -670,6 +620,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
@ -682,8 +634,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[d]=lcoor[ddl++];
}
}
peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor);
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
});
}
@ -711,6 +663,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
}
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
@ -723,8 +677,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
hcoor[d]=lcoor[ddl++];
}
}
peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor);
peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDimv,lcoor);
});
}
@ -752,6 +706,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
@ -760,8 +716,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor);
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
}
});
}
@ -789,6 +745,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
@ -797,8 +755,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor);
peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDimv,lcoor);
}
});
}
@ -862,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
}
//loop over outer index
auto in_v = in.View();
autoView( in_v , in, CpuRead);
thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@ -955,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane);
}
auto out_v = out.View();
autoView( out_v , out, CpuWrite);
thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd);
@ -1058,7 +1016,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View();
autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);

View File

@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
autoView( ret_v, ret, AcceleratorWrite);
autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
});
@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
autoView( ret_v, ret, AcceleratorWrite);
autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
});

View File

@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
autoView( rhs, rhs_i, AcceleratorRead);
autoView( ret, ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y);
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
}
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
autoView( rhs , rhs_i, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y));
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View();
auto rhs = rhs_i.View();
autoView( ret , ret_i, AcceleratorWrite);
autoView( rhs , rhs_i, AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y));
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
autoView( rhs , rhs_i, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));

168
Grid/lattice/Lattice_view.h Normal file
View File

@ -0,0 +1,168 @@
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
//public:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
// Host only
GridBase * getGrid(void) const { return _grid; };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
ViewMode mode;
void * cpu_ptr;
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
this->ViewOpen(mode);
}
// Host functions
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
}
void ViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
ViewCloser(View &_v) : v(_v) {};
~ViewCloser() { v.ViewClose(); }
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
NAMESPACE_END(Grid);

View File

@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h>
#endif
#ifdef __x86_64__
#ifdef GRID_NVCC
#ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
@ -112,7 +112,6 @@ class PerformanceCounter {
private:
typedef struct {
public:
uint32_t type;
uint64_t config;
const char *name;

View File

@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#pragma pop
#endif

View File

@ -115,18 +115,21 @@ public:
PokeIndex<LorentzIndex>(Uadj, U, mu);
}
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
autoView(Umu_v,Umu,CpuRead);
autoView(Uadj_v,Uadj,CpuRead);
autoView(Uds_v,Uds,CpuWrite);
thread_for( lidx, GaugeGrid->lSites(), {
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor);
peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor);
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor);
}
pokeLocalSite(ScalarUds, Uds_v, lcoor);
});
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)

View File

@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered);
@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_NVCC
#ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;

View File

@ -96,11 +96,11 @@ public:
int sl = St._simd_layout[direction];
Coordinate icoor;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = SIMTlane(Nsimd);
int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd;
@ -233,14 +233,16 @@ public:
Uconj = where(coor==neglink,-Uconj,Uconj);
}
auto U_v = U.View();
auto Uds_v = Uds.View();
auto Uconj_v = Uconj.View();
auto Utmp_v= Utmp.View();
thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
{
autoView( U_v , U, CpuRead);
autoView( Uconj_v , Uconj, CpuRead);
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1));
@ -250,19 +252,25 @@ public:
Utmp = where(coor==0,Uconj,Utmp);
}
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
{
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
}
Utmp = Uconj;
if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp);
}
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
{
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
}
}
}
@ -272,11 +280,14 @@ public:
GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View();
auto tmp_v = tmp.View();
thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
{
autoView( link_v , link, CpuWrite);
autoView( tmp_v , tmp, CpuRead);
thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
}
PokeIndex<LorentzIndex>(mat, link, mu);
return;
}
@ -306,16 +317,18 @@ public:
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
auto tmp_v = tmp.View();
auto Atilde_v = Atilde.View();
auto Btilde_v = Btilde.View();
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}

View File

@ -61,8 +61,8 @@ public:
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base

View File

@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
class NaiveStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
void M(const FermionField &in, FermionField &out);
void Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
DoubledGaugeField &GetU(void) { return Umu ; } ;
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif

View File

@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
public:
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
protected:
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
@ -71,15 +82,18 @@ public:
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
@ -87,27 +101,10 @@ public:
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
public:

View File

@ -113,20 +113,7 @@ public:
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{
GridBase *GaugeGrid = U_ds.Grid();
thread_for(lidx, GaugeGrid->lSites(),{
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, U_ds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
});
assert(0);
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term

View File

@ -257,15 +257,16 @@ private:
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@ -281,9 +282,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()();
@ -299,9 +300,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()());
@ -317,9 +318,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()());
@ -335,9 +336,9 @@ private:
CloverFieldType T(F.Grid());
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()());
@ -354,9 +355,9 @@ private:
T = Zero();
auto T_v = T.View();
auto F_v = F.View();
thread_for(i, CloverTerm.Grid()->oSites(),
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());

View File

@ -106,10 +106,10 @@ public:
const _SpinorField & phi,
int mu)
{
auto out_v= out.View();
auto phi_v= phi.View();
auto Umu_v= Umu.View();
thread_for(sss,out.Grid()->oSites(),{
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
accelerator_for(sss,out.Grid()->oSites(),1,{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
});
}
@ -191,18 +191,19 @@ public:
int Ls=Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
auto tmp_v = tmp.View();
auto Btilde_v = Btilde.View();
auto Atilde_v = Atilde.View();
thread_for(sss,tmp.Grid()->oSites(),{
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
}
});
{
autoView( tmp_v , tmp, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,tmp.Grid()->oSites(),1,{
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
}
});
}
PokeIndex<LorentzIndex>(mat,tmp,mu);
}
};

View File

@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_NVCC
#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
#ifndef GRID_NVCC
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif
#ifndef GRID_NVCC
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
////////////////////////////////////////////////
// GENERAL CAYLEY CASE

View File

@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(phi , phi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(phi , phi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
int Ls=this->Ls;
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid();
int Ls=this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
auto plee = & lee [0];
auto pdee = & dee [0];

View File

@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi, psi_i,CpuRead);
autoView(phi, phi_i,CpuRead);
autoView(chi, chi_i,CpuWrite);
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd();
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
auto psi=psi_i.View();
auto phi=phi_i.View();
auto chi=chi_i.View();
autoView(psi,psi_i,CpuRead);
autoView(phi,phi_i,CpuRead);
autoView(chi,chi_i,CpuWrite);
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm)
{
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,CpuRead);
autoView(chi , chi_i,CpuWrite);
#ifndef AVX512
{
SiteHalfSpinor BcastP;
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512
{
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,CpuRead);
autoView(chi , chi_i,CpuWrite);
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
}
#else
{
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i,CpuRead);
autoView(chi , chi_i,CpuWrite);
// pointers
// MASK_REGS;
#define Chi_00 %zmm0

View File

@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls;
GridBase* grid = psi_i.Grid();
auto phi = phi_i.View();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView( phi , phi_i, AcceleratorRead);
autoView( psi , psi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView( psi , psi_i, AcceleratorRead);
autoView( phi , phi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
auto psi=psi_i.View();
auto chi=chi_i.View();
autoView( psi, psi_i, AcceleratorRead);
autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->lee[0];
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView( psi, psi_i, AcceleratorRead);
autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->lee[0];

View File

@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor;
Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View();
auto UUUmu_v = UUUmu.View();
auto in_v = in.View();
auto out_v = out.View();
autoView( Umu_v , Umu, CpuRead);
autoView( UUUmu_v , UUUmu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){
int sU=ss;
@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopFaceTime-=usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
// Remove explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
DhopComputeTime-=usecond();
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U.Grid()->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopComputeTime+=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
DhopComputeTime2-=usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
});
} else {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
});
DhopComputeTime2-=usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
template<class Impl>
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
});
} else {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
});
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
}
/*CHANGE END*/

View File

@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
////////////////////////
// Call the single hop
////////////////////////
auto U_v = U.View();
auto UUU_v = UUU.View();
auto B_v = B.View();
auto Btilde_v = Btilde.View();
autoView( U_v , U, CpuRead);
autoView( UUU_v , UUU, CpuRead);
autoView( B_v , B, CpuWrite);
autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor;
Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View();
auto UUUmu_v = UUUmu.View();
auto in_v = in.View();
auto out_v = out.View();
autoView( Umu_v , Umu, CpuRead);
autoView( UUUmu_v , UUUmu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
});
@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template <class Impl>
@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
Compressor compressor;
int len = U.Grid()->oSites();
@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
}
} else {
st.CommunicateThreaded();
}
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond();
{
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
} else {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
}
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
#else
assert(0);
#endif
}
@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
DhopComputeTime -= usecond();
if (dag == DaggerYes) {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
} else {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();

View File

@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View();
auto phi = phi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(phi , phi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm;
auto plee = & this->lee [0];
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
int Ls = this->Ls;
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
auto psi = psi_i.View();
auto chi = chi_i.View();
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto pm = this->pm;

View File

@ -0,0 +1,499 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
{
ImportGauge(_U);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implmement fourier based solver.
////////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
{
GaugeLinkField U(GaugeGrid());
DoubledGaugeField _UUU(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
// Discard teh Naik as Naive
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
}
CopyGaugeCheckerboards();
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField & mat,
const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor;
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////
// Call the single hop
////////////////////////
autoView( U_v , U, CpuRead);
autoView( B_v , B, CpuWrite);
autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
assert(0);// need to figure out the force interface with a blasted three link term.
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
mat.Checkerboard() = U.Checkerboard();
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Even);
assert(U.Checkerboard() == Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Odd);
assert(U.Checkerboard() == Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0); // Not implemented yet
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor;
Stencil.HaloExchange(in, compressor);
autoView( Umu_v , Umu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
// thread_for( sss, in.Grid()->oSites(),{
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
// });
assert(0);
};
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
assert(0);
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T );
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
// for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
}
#include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
{
#ifdef AVX512
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
}
#include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);

View File

@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
}
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
}
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
}
// Add sum of all exterior connected stencil legs
if ( nmu ) {
if ( dag ) {
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
}
}
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI
NAMESPACE_END(Grid);

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
chi_p = &in[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
chi_p = &in[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], *chi_p, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], *chi_p, Dir); \
}
template <class Impl>
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) {
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag)
{
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
int ptype;
int skew;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
//
// int sF=LLs*sU+s;
{
skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) {
Uchi = - Uchi;
}
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
{
skew = 0;
Uchi=Zero();
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) {
Uchi = - Uchi;
}
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
// SiteSpinor chi;
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
{
skew = 0;
Uchi=Zero();
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) {
if ( dag ) {
out[sF] = out[sF] - Uchi;
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{
// Disp should be either +1,-1,+3,-3
// What about "dag" ?
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
assert(0);
}
#define KERNEL_CALLNB(A,improved) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
#define ASM_CALL(A) \
const uint64_t NN = Nsite*Ls; \
thread_for( ss, NN, { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
autoView( UUU_v , UUU, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
autoView( UUU_v , U, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid);

View File

@ -98,32 +98,35 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
for (int site = 0; site < lvol; site++)
{
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CloverTerm, lcoor);
Qxinv = Zero();
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
for (int site = 0; site < lvol; site++) {
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CTv, lcoor);
Qxinv = Zero();
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
}
}
// Separate the even and odd parts

View File

@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor);
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}
}
Wea = ( exp( a) * abs(W) );
@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor);
}
pokeLocalSite(cc,a_v,lcoor);
}}
Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) );

View File

@ -67,7 +67,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
diag_mass = 4.0 + mass;
}
int vol4;
vol4=Fgrid.oSites();
Stencil.BuildSurfaceList(1,vol4);
vol4=Hgrid.oSites();
StencilEven.BuildSurfaceList(1,vol4);
StencilOdd.BuildSurfaceList(1,vol4);
}
template <class Impl>
@ -483,32 +488,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
#if 0
PropagatorField tmp1(_grid), tmp2(_grid);
q_out = Zero();
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
auto tmp1_v = tmp1.View();
auto tmp2_v = tmp2.View();
auto q_in_1_v=q_in_1.View();
auto q_in_2_v=q_in_2.View();
auto q_out_v = q_out.View();
auto Umu_v = Umu.View();
thread_for(sU, Umu.Grid()->oSites(),{
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
q_in_2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
tmp2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
});
#else
#endif
assert(0);
}
@ -524,62 +504,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
{
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
#if 0
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
Complex i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
q_out = Zero();
LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*lattice_cmplx;
tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1);
auto coords_v = coords.View();
auto tmpFwd_v = tmpFwd.View();
auto tmpBwd_v = tmpBwd.View();
auto Umu_v = Umu.View();
auto q_out_v = q_out.View();
thread_for(sU, Umu.Grid()->oSites(), {
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vPredicate t_mask;
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
Integer timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
// Repeat for backward direction.
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
(coords_v[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
});
#else
#endif
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
// Generic implementation; move to different file?
////////////////////////////////////////////
/*
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
uint4 * chip_pun = (uint4 *)&chip;
@ -51,6 +52,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
#endif
return;
}
*/
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
synchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else if ( st.same_node[Dir] ) { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
synchronise(); \
acceleratorSynchronise(); \
if (SE->_is_local || st.same_node[Dir] ) { \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
} \
synchronise();
acceleratorSynchronise();
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
Recon(result, Uchi); \
nmu++; \
} \
synchronise();
acceleratorSynchronise();
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
if (SE->_is_local ) { \
@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
synchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
Recon(result, Uchi);
@ -126,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
StencilEntry *SE;
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
@ -153,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
@ -181,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
StencilEntry *SE;
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
@ -203,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
typedef decltype(coalescedRead(in[0])) calcSpinor;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
calcHalfSpinor chi;
// calcHalfSpinor *chi_p;
@ -239,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
int ptype;
int nmu=0;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
@ -270,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
int ptype;
int nmu=0;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
@ -300,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
StencilEntry *SE; \
int ptype; \
const int Nsimd = SiteHalfSpinor::Nsimd(); \
const int lane=SIMTlane(Nsimd); \
const int lane=acceleratorSIMTlane(Nsimd); \
\
SE = st.GetEntry(ptype, dir, sF); \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
@ -328,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
StencilEntry *SE;
int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd);
const int lane=acceleratorSIMTlane(Nsimd);
SE = st.GetEntry(ptype, dir, sF);
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
@ -346,30 +348,30 @@ template <class Impl>
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out)
{
auto U_v = U.View();
auto in_v = in.View();
auto st_v = st.View();
autoView(U_v ,U,AcceleratorRead);
autoView(in_v ,in,AcceleratorRead);
autoView(st_v ,st,AcceleratorRead);
auto out_Xm = out[0].View();
auto out_Ym = out[1].View();
auto out_Zm = out[2].View();
auto out_Tm = out[3].View();
auto out_Xp = out[4].View();
auto out_Yp = out[5].View();
auto out_Zp = out[6].View();
auto out_Tp = out[7].View();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
autoView(out_Xm,out[0],AcceleratorWrite);
autoView(out_Ym,out[1],AcceleratorWrite);
autoView(out_Zm,out[2],AcceleratorWrite);
autoView(out_Tm,out[3],AcceleratorWrite);
autoView(out_Xp,out[4],AcceleratorWrite);
autoView(out_Yp,out[5],AcceleratorWrite);
autoView(out_Zp,out[6],AcceleratorWrite);
autoView(out_Tp,out[7],AcceleratorWrite);
auto CBp=st.CommBuf();
accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls;
int sF =sss;
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
});
}
@ -381,17 +383,18 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
assert(dirdisp<=7);
assert(dirdisp>=0);
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
autoView(U_v ,U ,AcceleratorRead);
autoView(in_v ,in ,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st ,AcceleratorRead);
auto CBp=st.CommBuf();
#define LoopBody(Dir) \
case Dir : \
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \
case Dir : \
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
@ -435,26 +438,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif
@ -466,26 +469,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior)
{
auto U_v = U.View();
auto in_v = in.View();
auto out_v = out.View();
auto st_v = st.View();
autoView(U_v ,U,AcceleratorRead);
autoView(in_v ,in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st,AcceleratorRead);
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
#ifndef GRID_NVCC
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
@ -493,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
assert(0 && " Kernel optimisation case not covered ");
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid);

View File

@ -0,0 +1,36 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
NAMESPACE_END(Grid);

View File

@ -0,0 +1,37 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class NaiveStaggeredFermion<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -88,6 +88,7 @@ done
CC_LIST=" \
ImprovedStaggeredFermion5DInstantiation \
ImprovedStaggeredFermionInstantiation \
NaiveStaggeredFermionInstantiation \
StaggeredKernelsInstantiation "
for impl in $STAG_IMPL_LIST

View File

@ -86,9 +86,9 @@ public:
// Move this elsewhere? FIXME
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
auto U_v = U.View();
auto W_v = W.View();
thread_for( ss, U.Grid()->oSites(), {
autoView(U_v,U,AcceleratorWrite);
autoView(W_v,W,AcceleratorRead);
accelerator_for( ss, U.Grid()->oSites(), 1, {
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
});
}
@ -131,15 +131,14 @@ public:
//static std::chrono::duration<double> diff;
//auto start = std::chrono::high_resolution_clock::now();
auto U_v = U.View();
auto P_v = P.View();
thread_for(ss, P.Grid()->oSites(),{
autoView(U_v,U,AcceleratorWrite);
autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{
for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
}
});
//auto end = std::chrono::high_resolution_clock::now();
//auto end = std::chrono::high_resolution_clock::now();
// diff += end - start;
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
}

View File

@ -1,5 +1,13 @@
#pragma once
#define CPS_MD_TIME
#ifdef CPS_MD_TIME
#define HMC_MOMENTUM_DENOMINATOR (2.0)
#else
#define HMC_MOMENTUM_DENOMINATOR (1.0)
#endif
NAMESPACE_BEGIN(Grid);
template <class S>
@ -20,7 +28,9 @@ public:
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
gaussian(pRNG, P);
P *= scale;
}
static inline Field projectForce(Field& P){return P;}
@ -66,7 +76,7 @@ public:
}
static void FreePropagator(const Field &in, Field &out,
const Field &momKernel)
const Field &momKernel)
{
FFT fft((GridCartesian *)in.Grid());
Field inFT(in.Grid());
@ -139,14 +149,17 @@ public:
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
{
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
#ifndef USE_FFT_ACCELERATION
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
#else
Field Pgaussian(P.Grid()), Pp(P.Grid());
ComplexField p2(P.Grid()); p2 = zero;
RealD M = FFT_MASS;
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
FFT theFFT((GridCartesian*)P.Grid());
@ -156,17 +169,17 @@ public:
p2 = sqrt(p2);
Pp *= p2;
theFFT.FFT_all_dim(P, Pp, FFT::backward);
#endif //USE_FFT_ACCELERATION
P *= scale;
}
static inline Field projectForce(Field& P) {return P;}
static inline Field projectForce(Field& P) {return Ta(P);}
static inline void update_field(Field &P, Field &U, double ep)
{
#ifndef USE_FFT_ACCELERATION
double t0=usecond();
U += P*ep;
U += P*ep;
double t1=usecond();
double total_time = (t1-t0)/1e6;
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;

View File

@ -89,8 +89,8 @@ public:
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
auto p_v = p.View();
auto action_v = action.View();
autoView( p_v , p, CpuRead);
autoView( action_v , action, CpuWrite);
for (int mu = 0; mu < Ndim; mu++)
{
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
@ -146,8 +146,8 @@ public:
for (int point = 0; point < npoint; point++)
{
auto p_v = p.View();
auto force_v = force.View();
autoView( p_v , p, CpuRead);
autoView( force_v , force, CpuWrite);
int permute_type;
StencilEntry *SE;

View File

@ -80,10 +80,11 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
//static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
// HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB");
//static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
// HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
// add the staggered, scalar versions here

View File

@ -49,7 +49,7 @@ public:
private:
const unsigned int smearingLevels;
Smear_Stout<Gimpl> StoutSmearing;
Smear_Stout<Gimpl> *StoutSmearing;
std::vector<GaugeField> SmearedSet;
// Member functions
@ -72,7 +72,7 @@ private:
previous_u = *ThinLinks;
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
{
StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
StoutSmearing->smear(SmearedSet[smearLvl], previous_u);
previous_u = SmearedSet[smearLvl];
// For debug purposes
@ -93,7 +93,7 @@ private:
GaugeLinkField SigmaKPrime_mu(grid);
GaugeLinkField GaugeKmu(grid), Cmu(grid);
StoutSmearing.BaseSmear(C, GaugeK);
StoutSmearing->BaseSmear(C, GaugeK);
SigmaK = Zero();
iLambda = Zero();
@ -107,7 +107,7 @@ private:
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu);
}
StoutSmearing.derivative(SigmaK, iLambda,
StoutSmearing->derivative(SigmaK, iLambda,
GaugeK); // derivative of SmearBase
return SigmaK;
}
@ -144,14 +144,14 @@ private:
// Exponential
iQ2 = iQ * iQ;
iQ3 = iQ * iQ2;
StoutSmearing.set_uw(u, w, iQ2, iQ3);
StoutSmearing.set_fj(f0, f1, f2, u, w);
StoutSmearing->set_uw(u, w, iQ2, iQ3);
StoutSmearing->set_fj(f0, f1, f2, u, w);
e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
// Getting B1, B2, Gamma and Lambda
// simplify this part, reduntant calculations in set_fj
xi0 = StoutSmearing.func_xi0(w);
xi1 = StoutSmearing.func_xi1(w);
xi0 = StoutSmearing->func_xi0(w);
xi1 = StoutSmearing->func_xi1(w);
u2 = u * u;
w2 = w * w;
cosw = cos(w);
@ -219,7 +219,7 @@ public:
/* Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl>& Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
: smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL)
{
for (unsigned int i = 0; i < smearingLevels; ++i)
SmearedSet.push_back(*(new GaugeField(UGrid)));
@ -227,7 +227,7 @@ public:
/*! For just thin links */
SmearedConfiguration()
: smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
: smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField &U)

View File

@ -185,13 +185,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
for(int i=0;i<Lblock;i++){
auto lhs_v = lhs_wi[i].View();
// Recreate view potentially expensive outside fo UVM mode
autoView(lhs_v,lhs_wi[i],CpuRead);
auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){
SpinMatrix_v vv;
auto rhs_v = rhs_vj[j].View();
// Recreate view potentially expensive outside fo UVM mode
autoView(rhs_v,rhs_vj[j],CpuRead);
auto right = rhs_v[ss];
for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){
@ -204,11 +205,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){
int idx = m+base;
auto mom_v = mom[m].View();
autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase);
}
}
}
}
@ -371,7 +371,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View();
autoView(wi_v,wi[i],CpuRead);
auto w = conjugate(wi_v[ss]);
if (g5) {
w()(2)(0) = - w()(2)(0);
@ -383,7 +383,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
}
for(int j=0;j<Rblock;j++){
auto vj_v=vj[j].View();
autoView(vj_v,vj[j],CpuRead);
auto v = vj_v[ss];
auto vv = v()(0)(0);
@ -518,12 +518,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View();
autoView(wi_v,wi[i],CpuRead);
auto w = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++){
auto vj_v = vj[j].View();
autoView(vj_v,vj[j],CpuRead);
auto v = vj_v[ss];
auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
@ -544,7 +544,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){
int idx = m+base;
auto mom_v = mom[m].View();
autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase()()());
}
@ -730,13 +730,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for(int i=0;i<Lblock;i++)
{
auto wi_v = lhs_wi[i].View();
autoView(wi_v,lhs_wi[i],CpuRead);
auto left = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++)
{
SpinMatrix_v vv;
auto vj_v = rhs_vj[j].View();
autoView(vj_v,rhs_vj[j],CpuRead);
auto right = vj_v[ss];
for(int s1=0;s1<Ns;s1++)
@ -752,8 +752,8 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for ( int m=0;m<Nem;m++)
{
auto emB0_v = emB0[m].View();
auto emB1_v = emB1[m].View();
autoView(emB0_v,emB0[m],CpuRead);
autoView(emB1_v,emB1[m],CpuRead);
int idx = m+base;
auto b0 = emB0_v[ss];
auto b1 = emB1_v[ss];
@ -1014,21 +1014,21 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int t=0;t<N_t;t++){
for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View();
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View();
Scalar_v coeff = WW_sd(t,s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
autoView(vs_v,vs[s],CpuRead);
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
autoView(vd_v,vd[d],CpuRead);
Scalar_v coeff = WW_sd(t,s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
}}
}
@ -1067,21 +1067,20 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
thread_for(ss,grid->oSites(),{
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View();
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View();
Scalar_v coeff = buf(s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
autoView(vs_v,vs[s],CpuRead);
auto tmp1 = vs_v[ss];
vobj tmp2 = Zero();
vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
autoView(vd_v,vd[d],CpuRead);
Scalar_v coeff = buf(s,d);
tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3);
}
//////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
//////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
}}
});
}
@ -1093,7 +1092,7 @@ inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
const vobj &rhs,
const int Ns, const int ss)
{
auto WWVV_v = WWVV.View();
autoView(WWVV_v,WWVV,CpuWrite);
for (int s1 = 0; s1 < Ns; s1++){
for (int s2 = 0; s2 < Ns; s2++){
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
@ -1120,33 +1119,39 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
assert(gamma0.size()==gamma1.size());
int Ng = gamma0.size();
// Make device accessible copy
Vector<Gamma> Gamma0v (Ng);
Vector<Gamma> Gamma1v (Ng);
Gamma *Gamma0 = & Gamma0v[0];
Gamma *Gamma1 = & Gamma1v[0];
for(int g=0;g<Ng;g++) {
Gamma0[g]=gamma0[g];
Gamma1[g]=gamma1[g];
}
GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View();
auto WWVV1_v = WWVV1.View();
auto O_trtr_v= O_trtr.View();
auto O_fig8_v= O_fig8.View();
thread_for(ss,grid->oSites(),{
typedef typename ComplexField::vector_object vobj;
autoView(WWVV0_v , WWVV0,AcceleratorRead);
autoView(WWVV1_v , WWVV1,AcceleratorRead);
autoView(O_trtr_v, O_trtr,AcceleratorWrite);
autoView(O_fig8_v, O_fig8,AcceleratorWrite);
accelerator_for(ss,grid->oSites(),vobj::Nsimd(),{
typedef typename ComplexField::vector_object vobj;
vobj v_trtr;
vobj v_fig8;
auto VV0 = WWVV0_v[ss];
auto VV1 = WWVV1_v[ss];
auto VV0 = WWVV0_v(ss);
auto VV1 = WWVV1_v(ss);
for(int g=0;g<Ng;g++){
v_trtr = trace(VV0 * gamma0[g])* trace(VV1*gamma1[g]);
v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);
auto v_trtr = trace(VV0 * gamma0[g])* trace(VV1*gamma1[g]);
auto v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);
if ( g==0 ) {
O_trtr_v[ss] = v_trtr;
O_fig8_v[ss] = v_fig8;
coalescedWrite(O_trtr_v[ss], v_trtr);
coalescedWrite(O_fig8_v[ss], v_fig8);
} else {
O_trtr_v[ss]+= v_trtr;
O_fig8_v[ss]+= v_fig8;
coalescedWrite(O_trtr_v[ss], O_trtr_v(ss)+v_trtr);
coalescedWrite(O_fig8_v[ss], O_fig8_v(ss)+v_fig8);
}
}
@ -1166,25 +1171,36 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View();
auto WWVV1_v = WWVV1.View();
auto O_trtr_v= O_trtr.View();
auto O_fig8_v= O_fig8.View();
// Make device accessible copy
Vector<Gamma> Gamma0v (Ng);
Vector<Gamma> Gamma1v (Ng);
Gamma *Gamma0 = & Gamma0v[0];
Gamma *Gamma1 = & Gamma1v[0];
for(int g=0;g<Ng;g++) {
Gamma0[g]=gamma0[g];
Gamma1[g]=gamma1[g];
}
thread_for(ss,grid->oSites(),{
autoView( WWVV0_v , WWVV0,AcceleratorRead);
autoView( WWVV1_v , WWVV1,AcceleratorRead);
autoView( O_trtr_v, O_trtr,AcceleratorWrite);
autoView( O_fig8_v, O_fig8,AcceleratorWrite);
typedef typename ComplexField::vector_object vobj;
typedef typename ComplexField::vector_object vobj;
accelerator_for(ss,grid->oSites(),vobj::Nsimd(),{
auto VV0 = WWVV0_v[ss];
auto VV1 = WWVV1_v[ss];
auto VV0 = WWVV0_v(ss);
auto VV1 = WWVV1_v(ss);
typedef decltype(trace(VV0)) scalar;
for(int g=0;g<Ng;g++){
auto VV0G = VV0 * gamma0[g]; // Spin multiply
auto VV1G = VV1 * gamma1[g];
vobj v_trtr=Zero();
vobj v_fig8=Zero();
scalar v_trtr=Zero();
scalar v_fig8=Zero();
/////////////////////////////////////////
// Colour mixed
@ -1198,7 +1214,7 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
// Wick1 [ spin TR TR ]
//
// (VV0*G0)_ss,ba . (VV1*G1)_tt,ab
//
//
// Wick2 [ spin fig8 ]
//
// (VV0*G0)_st,aa (VV1*G1)_ts,bb
@ -1235,11 +1251,11 @@ Bag [8,4] fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17) // - 1602
}}}}
if ( g==0 ) {
O_trtr_v[ss] = v_trtr;
O_fig8_v[ss] = v_fig8;
coalescedWrite(O_trtr_v[ss] , v_trtr);
coalescedWrite(O_fig8_v[ss] , v_fig8);
} else {
O_trtr_v[ss]+= v_trtr;
O_fig8_v[ss]+= v_fig8;
coalescedWrite(O_trtr_v[ss],O_trtr_v(ss) + v_trtr);
coalescedWrite(O_fig8_v[ss],O_fig8_v(ss) + v_fig8);
}
}

View File

@ -7,6 +7,7 @@
Copyright (C) 2019
Author: Felix Erben <felix.erben@ed.ac.uk>
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -46,7 +47,7 @@ public:
typedef typename SpinMatrixField::vector_object sobj;
static const int epsilon[6][3] ;
static const Complex epsilon_sgn[6];
static const Real epsilon_sgn[6];
private:
template <class mobj, class robj>
@ -58,9 +59,12 @@ public:
const Gamma GammaA_right,
const Gamma GammaB_right,
const int parity,
const int * wick_contractions,
const bool * wick_contractions,
robj &result);
public:
static void Wick_Contractions(std::string qi,
std::string qf,
bool* wick_contractions);
static void ContractBaryons(const PropagatorField &q1_left,
const PropagatorField &q2_left,
const PropagatorField &q3_left,
@ -68,8 +72,7 @@ public:
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const char * quarks_left,
const char * quarks_right,
const bool* wick_contractions,
const int parity,
ComplexField &baryon_corr);
template <class mobj, class robj>
@ -80,12 +83,61 @@ public:
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const char * quarks_left,
const char * quarks_right,
const bool* wick_contractions,
const int parity,
const int nt,
robj &result);
private:
template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group1_Site(
const mobj &Dq1_ti,
const mobj2 &Dq2_spec,
const mobj2 &Dq3_spec,
const mobj &Dq4_tf,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
int wick_contraction,
robj &result);
template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group2_Site(
const mobj2 &Dq1_spec,
const mobj &Dq2_ti,
const mobj2 &Dq3_spec,
const mobj &Dq4_tf,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
int wick_contraction,
robj &result);
template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group3_Site(
const mobj2 &Dq1_spec,
const mobj2 &Dq2_spec,
const mobj &Dq3_ti,
const mobj &Dq4_tf,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
int wick_contraction,
robj &result);
public:
template <class mobj>
static void Baryon_Gamma_3pt(
const PropagatorField &q_ti,
const mobj &Dq_spec1,
const mobj &Dq_spec2,
const PropagatorField &q_tf,
int group,
int wick_contraction,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
SpinMatrixField &stn_corr);
private:
template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
const mobj2 &Du_spec,
const mobj &Dd_tf,
@ -151,119 +203,152 @@ public:
template <class FImpl>
const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
template <class FImpl>
/*template <class FImpl>
const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
Complex(1),
Complex(1),
Complex(-1),
Complex(-1),
Complex(-1)};
*/
template <class FImpl>
const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
//This is the old version
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const int parity,
const int * wick_contraction,
robj &result)
const mobj &D2,
const mobj &D3,
const Gamma GammaA_i,
const Gamma GammaB_i,
const Gamma GammaA_f,
const Gamma GammaB_f,
const int parity,
const bool * wick_contraction,
robj &result)
{
Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
auto gD1a = GammaA_left * GammaA_right * D1;
auto gD1b = GammaA_left * g4 * GammaA_right * D1;
auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
auto gD3 = GammaB_right * D3;
auto D1_GAi = D1 * GammaA_i;
auto D1_GAi_g4 = D1_GAi * g4;
auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4);
auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P;
auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P;
auto D2g = D2 * GammaB_left;
auto pD1g = pD1 * GammaB_left;
auto gD3g = gD3 * GammaB_left;
auto D2_GBi = D2 * GammaB_i;
auto GBf_D2_GBi = GammaB_f * D2_GBi;
auto GAf_D2_GBi = GammaA_f * D2_GBi;
for (int ie_left=0; ie_left < 6 ; ie_left++){
int a_left = epsilon[ie_left][0]; //a
int b_left = epsilon[ie_left][1]; //b
int c_left = epsilon[ie_left][2]; //c
for (int ie_right=0; ie_right < 6 ; ie_right++){
int a_right = epsilon[ie_right][0]; //a'
int b_right = epsilon[ie_right][1]; //b'
int c_right = epsilon[ie_right][2]; //c'
Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
auto GBf_D3 = GammaB_f * D3;
auto GAf_D3 = GammaA_f * D3;
for (int ie_f=0; ie_f < 6 ; ie_f++){
int a_f = epsilon[ie_f][0]; //a
int b_f = epsilon[ie_f][1]; //b
int c_f = epsilon[ie_f][2]; //c
for (int ie_i=0; ie_i < 6 ; ie_i++){
int a_i = epsilon[ie_i][0]; //a'
int b_i = epsilon[ie_i][1]; //b'
int c_i = epsilon[ie_i][2]; //c'
Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
//This is the \delta_{456}^{123} part
if (wick_contraction[0]){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){
auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
result()()() += eepD1*D2g_ab*gD3_ab;
}}}
}
if (wick_contraction[0]){
for (int rho=0; rho<Ns; rho++){
auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
result()()() += ee * GAf_D1_GAi_P_rr_cc
* D2_GBi ()(alpha_f,beta_i)(a_f,a_i)
* GBf_D3 ()(alpha_f,beta_i)(b_f,b_i);
}}
}
}
//This is the \delta_{456}^{231} part
if (wick_contraction[1]){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
result()()() += eepD1g_gb*D2_ab*gD3_ag;
}}}
if (wick_contraction[1]){
for (int rho=0; rho<Ns; rho++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
result()()() += ee * D1_GAi_P_ar_ac
* GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i)
* GAf_D3 ()(rho,beta_i)(c_f,b_i);
}
}}
}
//This is the \delta_{456}^{312} part
if (wick_contraction[2]){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
result()()() += eepD1_gb*D2_ag*gD3g_ab;
}}}
if (wick_contraction[2]){
for (int rho=0; rho<Ns; rho++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
result()()() += ee * GBf_D1_GAi_P_ar_bc
* GAf_D2_GBi ()(rho,beta_i)(c_f,a_i)
* D3 ()(alpha_f,beta_i)(a_f,b_i);
}
}}
}
//This is the \delta_{456}^{132} part
if (wick_contraction[3]){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){
auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
result()()() -= eepD1*D2_ab*gD3g_ab;
}}}
if (wick_contraction[3]){
for (int rho=0; rho<Ns; rho++){
auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
result()()() -= ee * GAf_D1_GAi_P_rr_cc
* GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i)
* D3 ()(alpha_f,beta_i)(a_f,b_i);
}
}}
}
//This is the \delta_{456}^{321} part
if (wick_contraction[4]){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
result()()() -= eepD1_gb*D2g_ab*gD3_ag;
}}}
if (wick_contraction[4]){
for (int rho=0; rho<Ns; rho++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
result()()() -= ee * GBf_D1_GAi_P_ar_bc
* D2_GBi ()(alpha_f,beta_i)(a_f,a_i)
* GAf_D3 ()(rho,beta_i)(c_f,b_i);
}
}}
}
//This is the \delta_{456}^{213} part
if (wick_contraction[5]){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
result()()() -= eepD1g_gb*D2_ag*gD3_ab;
}}}
if (wick_contraction[5]){
for (int rho=0; rho<Ns; rho++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
result()()() -= ee * D1_GAi_P_ar_ac
* GAf_D2_GBi ()(rho,beta_i)(c_f,a_i)
* GBf_D3 ()(alpha_f,beta_i)(b_f,b_i);
}
}}
}
}
}}
}
/* Computes which wick contractions should be performed for a *
* baryon 2pt function given the initial and finals state quark *
* flavours. *
* The array wick_contractions must be of length 6 */
template<class FImpl>
void BaryonUtils<FImpl>::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) {
const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
for (int ie=0; ie < 6 ; ie++) {
wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3
&& qi[0] == qf[epsilon[ie][0]]
&& qi[1] == qf[epsilon[ie][1]]
&& qi[2] == qf[epsilon[ie][2]]);
}
}
/* The array wick_contractions must be of length 6. The order *
* corresponds to the to that shown in the Hadrons documentation *
* at https://aportelli.github.io/Hadrons-doc/#/mcontraction *
* This can be computed from the quark flavours using the *
* Wick_Contractions function above */
template<class FImpl>
void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
const PropagatorField &q2_left,
@ -272,8 +357,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const char * quarks_left,
const char * quarks_right,
const bool* wick_contractions,
const int parity,
ComplexField &baryon_corr)
{
@ -281,38 +365,53 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
GridBase *grid = q1_left.Grid();
int wick_contraction[6];
for (int ie=0; ie < 6 ; ie++)
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
autoView(vbaryon_corr, baryon_corr,CpuWrite);
autoView( v1 , q1_left, CpuRead);
autoView( v2 , q2_left, CpuRead);
autoView( v3 , q3_left, CpuRead);
auto vbaryon_corr= baryon_corr.View();
auto v1 = q1_left.View();
auto v2 = q2_left.View();
auto v3 = q3_left.View();
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{
//for(int ss=0; ss < grid->oSites(); ss++){
Real bytes =0.;
bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
for (int ie=0; ie < 6 ; ie++){
if(ie==0 or ie==3){
bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
}
else{
bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
}
}
Real t=0.;
t =-usecond();
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto D1 = v1[ss];
auto D2 = v2[ss];
auto D3 = v3[ss];
vobj result=Zero();
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
vbaryon_corr[ss] = result;
} );//end loop over lattice sites
t += usecond();
std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
}
/* The array wick_contractions must be of length 6. The order *
* corresponds to the to that shown in the Hadrons documentation *
* at https://aportelli.github.io/Hadrons-doc/#/mcontraction *
* This can also be computed from the quark flavours using the *
* Wick_Contractions function above */
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
@ -322,34 +421,363 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const char * quarks_left,
const char * quarks_right,
const bool* wick_contractions,
const int parity,
const int nt,
robj &result)
{
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
int wick_contraction[6];
for (int ie=0; ie < 6 ; ie++)
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
result=Zero();
baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
for (int t=0; t<nt; t++) {
baryon_site(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
}
}
/***********************************************************************
* End of Baryon 2pt-function code. *
* *
* The following code is for baryonGamma3pt function *
**********************************************************************/
/* Dq1_ti is a quark line from t_i to t_J
* Dq2_spec is a quark line from t_i to t_f
* Dq3_spec is a quark line from t_i to t_f
* Dq4_tf is a quark line from t_f to t_J */
template<class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
const mobj &Dq1_ti,
const mobj2 &Dq2_spec,
const mobj2 &Dq3_spec,
const mobj &Dq4_tf,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
int wick_contraction,
robj &result)
{
Gamma g5(Gamma::Algebra::Gamma5);
auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti;
auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1;
auto D2_Gi = Dq2_spec * GammaBi;
auto Gf_D2_Gi = GammaBf * D2_Gi;
auto Gf_D3 = GammaBf * Dq3_spec;
int a_f, b_f, c_f;
int a_i, b_i, c_i;
Real ee;
for (int ie_f=0; ie_f < 6 ; ie_f++){
a_f = epsilon[ie_f][0]; //a
b_f = epsilon[ie_f][1]; //b
c_f = epsilon[ie_f][2]; //c
for (int ie_i=0; ie_i < 6 ; ie_i++){
a_i = epsilon[ie_i][0]; //a'
b_i = epsilon[ie_i][1]; //b'
c_i = epsilon[ie_i][2]; //c'
ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
auto D2_Gi_ab_aa = D2_Gi ()(alpha_f,beta_i)(a_f,a_i);
auto Gf_D3_ab_bb = Gf_D3 ()(alpha_f,beta_i)(b_f,b_i);
auto Gf_D2_Gi_ab_ba = Gf_D2_Gi ()(alpha_f,beta_i)(b_f,a_i);
auto Dq3_spec_ab_ab = Dq3_spec ()(alpha_f,beta_i)(a_f,b_i);
for (int gamma_i=0; gamma_i<Ns; gamma_i++){
auto ee_adjD4_g_D1_ag_ac = ee * adjD4_g_D1 ()(alpha_f,gamma_i)(a_f,c_i);
auto ee_Gf_adjD4_g_D1_ag_bc = ee * Gf_adjD4_g_D1()(alpha_f,gamma_i)(b_f,c_i);
for (int gamma_f=0; gamma_f<Ns; gamma_f++){
auto ee_adjD4_g_D1_gg_cc = ee * adjD4_g_D1 ()(gamma_f,gamma_i)(c_f,c_i);
auto Dq3_spec_gb_cb = Dq3_spec ()(gamma_f,beta_i)(c_f,b_i);
auto D2_Gi_gb_ca = D2_Gi ()(gamma_f,beta_i)(c_f,a_i);
if(wick_contraction == 1) { // Do contraction I1
result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_gg_cc
* D2_Gi_ab_aa
* Gf_D3_ab_bb;
}
if(wick_contraction == 2) { // Do contraction I2
result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_ag_ac
* Gf_D2_Gi_ab_ba
* Dq3_spec_gb_cb;
}
if(wick_contraction == 3) { // Do contraction I3
result()(gamma_f,gamma_i)() -= ee_Gf_adjD4_g_D1_ag_bc
* D2_Gi_gb_ca
* Dq3_spec_ab_ab;
}
if(wick_contraction == 4) { // Do contraction I4
result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_gg_cc
* Gf_D2_Gi_ab_ba
* Dq3_spec_ab_ab;
}
if(wick_contraction == 5) { // Do contraction I5
result()(gamma_f,gamma_i)() += ee_Gf_adjD4_g_D1_ag_bc
* D2_Gi_ab_aa
* Dq3_spec_gb_cb;
}
if(wick_contraction == 6) { // Do contraction I6
result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_ag_ac
* D2_Gi_gb_ca
* Gf_D3_ab_bb;
}
}}
}}
}}
}
/* Dq1_spec is a quark line from t_i to t_f
* Dq2_ti is a quark line from t_i to t_J
* Dq3_spec is a quark line from t_i to t_f
* Dq4_tf is a quark line from t_f to t_J */
template<class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
const mobj2 &Dq1_spec,
const mobj &Dq2_ti,
const mobj2 &Dq3_spec,
const mobj &Dq4_tf,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
int wick_contraction,
robj &result)
{
Gamma g5(Gamma::Algebra::Gamma5);
auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi;
auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi;
auto Gf_D1 = GammaBf * Dq1_spec;
auto Gf_D3 = GammaBf * Dq3_spec;
int a_f, b_f, c_f;
int a_i, b_i, c_i;
Real ee;
for (int ie_f=0; ie_f < 6 ; ie_f++){
a_f = epsilon[ie_f][0]; //a
b_f = epsilon[ie_f][1]; //b
c_f = epsilon[ie_f][2]; //c
for (int ie_i=0; ie_i < 6 ; ie_i++){
a_i = epsilon[ie_i][0]; //a'
b_i = epsilon[ie_i][1]; //b'
c_i = epsilon[ie_i][2]; //c'
ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
auto adjD4_g_D2_Gi_ab_aa = adjD4_g_D2_Gi ()(alpha_f,beta_i)(a_f,a_i);
auto Gf_D3_ab_bb = Gf_D3 ()(alpha_f,beta_i)(b_f,b_i);
auto Gf_adjD4_g_D2_Gi_ab_ba = Gf_adjD4_g_D2_Gi ()(alpha_f,beta_i)(b_f,a_i);
auto Dq3_spec_ab_ab = Dq3_spec ()(alpha_f,beta_i)(a_f,b_i);
for (int gamma_i=0; gamma_i<Ns; gamma_i++){
auto ee_Dq1_spec_ag_ac = ee * Dq1_spec ()(alpha_f,gamma_i)(a_f,c_i);
auto ee_Gf_D1_ag_bc = ee * Gf_D1 ()(alpha_f,gamma_i)(b_f,c_i);
for (int gamma_f=0; gamma_f<Ns; gamma_f++){
auto ee_Dq1_spec_gg_cc = ee * Dq1_spec ()(gamma_f,gamma_i)(c_f,c_i);
auto Dq3_spec_gb_cb = Dq3_spec ()(gamma_f,beta_i)(c_f,b_i);
auto adjD4_g_D2_Gi_gb_ca = adjD4_g_D2_Gi ()(gamma_f,beta_i)(c_f,a_i);
if(wick_contraction == 1) { // Do contraction II1
result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
* adjD4_g_D2_Gi_ab_aa
* Gf_D3_ab_bb;
}
if(wick_contraction == 2) { // Do contraction II2
result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
* Gf_adjD4_g_D2_Gi_ab_ba
* Dq3_spec_gb_cb;
}
if(wick_contraction == 3) { // Do contraction II3
result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
* adjD4_g_D2_Gi_gb_ca
* Dq3_spec_ab_ab;
}
if(wick_contraction == 4) { // Do contraction II4
result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
* Gf_adjD4_g_D2_Gi_ab_ba
* Dq3_spec_ab_ab;
}
if(wick_contraction == 5) { // Do contraction II5
result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
* adjD4_g_D2_Gi_ab_aa
* Dq3_spec_gb_cb;
}
if(wick_contraction == 6) { // Do contraction II6
result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
* adjD4_g_D2_Gi_gb_ca
* Gf_D3_ab_bb;
}
}}
}}
}}
}
/* Dq1_spec is a quark line from t_i to t_f
* Dq2_spec is a quark line from t_i to t_f
* Dq3_ti is a quark line from t_i to t_J
* Dq4_tf is a quark line from t_f to t_J */
template<class FImpl>
template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
const mobj2 &Dq1_spec,
const mobj2 &Dq2_spec,
const mobj &Dq3_ti,
const mobj &Dq4_tf,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
int wick_contraction,
robj &result)
{
Gamma g5(Gamma::Algebra::Gamma5);
auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti;
auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3;
auto Gf_D1 = GammaBf * Dq1_spec;
auto D2_Gi = Dq2_spec * GammaBi;
auto Gf_D2_Gi = GammaBf * D2_Gi;
int a_f, b_f, c_f;
int a_i, b_i, c_i;
Real ee;
for (int ie_f=0; ie_f < 6 ; ie_f++){
a_f = epsilon[ie_f][0]; //a
b_f = epsilon[ie_f][1]; //b
c_f = epsilon[ie_f][2]; //c
for (int ie_i=0; ie_i < 6 ; ie_i++){
a_i = epsilon[ie_i][0]; //a'
b_i = epsilon[ie_i][1]; //b'
c_i = epsilon[ie_i][2]; //c'
ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
auto D2_Gi_ab_aa = D2_Gi ()(alpha_f,beta_i)(a_f,a_i);
auto Gf_adjD4_g_D3_ab_bb = Gf_adjD4_g_D3 ()(alpha_f,beta_i)(b_f,b_i);
auto Gf_D2_Gi_ab_ba = Gf_D2_Gi ()(alpha_f,beta_i)(b_f,a_i);
auto adjD4_g_D3_ab_ab = adjD4_g_D3 ()(alpha_f,beta_i)(a_f,b_i);
for (int gamma_i=0; gamma_i<Ns; gamma_i++) {
auto ee_Dq1_spec_ag_ac = ee * Dq1_spec ()(alpha_f,gamma_i)(a_f,c_i);
auto ee_Gf_D1_ag_bc = ee * Gf_D1 ()(alpha_f,gamma_i)(b_f,c_i);
for (int gamma_f=0; gamma_f<Ns; gamma_f++) {
auto ee_Dq1_spec_gg_cc = ee * Dq1_spec ()(gamma_f,gamma_i)(c_f,c_i);
auto adjD4_g_D3_gb_cb = adjD4_g_D3 ()(gamma_f,beta_i)(c_f,b_i);
auto D2_Gi_gb_ca = D2_Gi ()(gamma_f,beta_i)(c_f,a_i);
if(wick_contraction == 1) { // Do contraction III1
result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
* D2_Gi_ab_aa
* Gf_adjD4_g_D3_ab_bb;
}
if(wick_contraction == 2) { // Do contraction III2
result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
* Gf_D2_Gi_ab_ba
* adjD4_g_D3_gb_cb;
}
if(wick_contraction == 3) { // Do contraction III3
result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
* D2_Gi_gb_ca
* adjD4_g_D3_ab_ab;
}
if(wick_contraction == 4) { // Do contraction III4
result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
* Gf_D2_Gi_ab_ba
* adjD4_g_D3_ab_ab;
}
if(wick_contraction == 5) { // Do contraction III5
result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
* D2_Gi_ab_aa
* adjD4_g_D3_gb_cb;
}
if(wick_contraction == 6) { // Do contraction III6
result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
* D2_Gi_gb_ca
* Gf_adjD4_g_D3_ab_bb;
}
}}
}}
}}
}
/* The group indicates which inital state quarks the current is *
* connected to. It must be in the range 1-3. *
* The wick_contraction must be in the range 1-6 correspond to *
* the contractions given in the Hadrons documentation at *
* https://aportelli.github.io/Hadrons-doc/#/mcontraction */
template<class FImpl>
template <class mobj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
const PropagatorField &q_ti,
const mobj &Dq_spec1,
const mobj &Dq_spec2,
const PropagatorField &q_tf,
int group,
int wick_contraction,
const Gamma GammaJ,
const Gamma GammaBi,
const Gamma GammaBf,
SpinMatrixField &stn_corr)
{
GridBase *grid = q_tf.Grid();
autoView( vcorr, stn_corr, CpuWrite);
autoView( vq_ti , q_ti, CpuRead);
autoView( vq_tf , q_tf, CpuRead);
if (group == 1) {
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
sobj result=Zero();
Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result;
});//end loop over lattice sites
} else if (group == 2) {
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
sobj result=Zero();
Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result;
});//end loop over lattice sites
} else if (group == 3) {
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss];
sobj result=Zero();
Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result;
});//end loop over lattice sites
}
}
/***********************************************************************
* End of BaryonGamma3pt-function code. *
* *
* The following code is for Sigma -> N rare hypeon decays *
**********************************************************************/
@ -590,13 +1018,12 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View();
auto vq_loop = qq_loop.View();
auto vd_tf = qd_tf.View();
auto vs_ti = qs_ti.View();
autoView( vcorr, stn_corr, CpuWrite);
autoView( vq_loop , qq_loop, CpuRead);
autoView( vd_tf , qd_tf, CpuRead);
autoView( vs_ti , qs_ti, CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto Dq_loop = vq_loop[ss];
auto Dd_tf = vd_tf[ss];
auto Ds_ti = vs_ti[ss];
@ -631,12 +1058,11 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View();
auto vq_ti = qq_ti.View();
auto vq_tf = qq_tf.View();
auto vd_tf = qd_tf.View();
auto vs_ti = qs_ti.View();
autoView( vcorr , stn_corr, CpuWrite);
autoView( vq_ti , qq_ti, CpuRead);
autoView( vq_tf , qq_tf, CpuRead);
autoView( vd_tf , qd_tf, CpuRead);
autoView( vs_ti , qs_ti, CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{
auto Dq_ti = vq_ti[ss];

View File

@ -47,8 +47,8 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
GridBase *grid=x.Grid();
Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View();
auto z_v = z.View();
autoView(x_v, x, AcceleratorRead);
autoView(z_v, z, AcceleratorWrite);
accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
coalescedWrite(z_v[ss],tmp);
@ -63,9 +63,9 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
conformable(x,z);
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
// FIXME -- need a new class of accelerator_loop to implement this
//
uint64_t nloop = grid->oSites()/Ls;
@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
conformable(x,z);
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
conformable(x,z);
GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0];
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -189,8 +189,8 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
conformable(x,z);
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View();
auto z_v = z.View();
autoView( x_v, x, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
@ -222,8 +222,8 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
static_assert(nbasis % 2 == 0, "");
int nb = nbasis / 2;
auto z_v = z.View();
auto x_v = x.View();
autoView( z_v, z, AcceleratorWrite);
autoView( x_v, x, AcceleratorRead);
accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
{
for(int n = 0; n < nb; ++n) {

View File

@ -222,11 +222,11 @@ public:
conformable(subgroup, Determinant);
int i0, i1;
su2SubGroupIndex(i0, i1, su2_index);
auto subgroup_v = subgroup.View();
auto source_v = source.View();
auto Determinant_v = Determinant.View();
thread_for(ss, grid->oSites(), {
autoView( subgroup_v , subgroup,AcceleratorWrite);
autoView( source_v , source,AcceleratorRead);
autoView( Determinant_v , Determinant,AcceleratorWrite);
accelerator_for(ss, grid->oSites(), 1, {
subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
@ -257,15 +257,16 @@ public:
su2SubGroupIndex(i0, i1, su2_index);
dest = 1.0; // start out with identity
auto dest_v = dest.View();
auto subgroup_v = subgroup.View();
thread_for(ss, grid->oSites(),
autoView( dest_v , dest, AcceleratorWrite);
autoView( subgroup_v, subgroup, AcceleratorRead);
accelerator_for(ss, grid->oSites(),1,
{
dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
});
}
///////////////////////////////////////////////
@ -608,8 +609,8 @@ public:
// reunitarise??
template <typename LatticeMatrixType>
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
double scale = 1.0) {
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0)
{
GridBase *grid = out.Grid();
typedef typename LatticeMatrixType::vector_type vector_type;
@ -618,8 +619,7 @@ public:
typedef iSinglet<vector_type> vTComplexType;
typedef Lattice<vTComplexType> LatticeComplexType;
typedef typename GridTypeMapper<
typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
LatticeComplexType ca(grid);
LatticeMatrixType lie(grid);
@ -629,6 +629,7 @@ public:
MatrixType ta;
lie = Zero();
for (int a = 0; a < AdjointDimension; a++) {
random(pRNG, ca);
@ -640,6 +641,7 @@ public:
la = ci * ca * ta;
lie = lie + la; // e^{i la ta}
}
taExp(lie, out);
}

View File

@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h"
#include "TextIO.h"
#include "XmlIO.h"
#ifndef GRID_NVCC
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#include "JSON_IO.h"
#endif

View File

@ -32,7 +32,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*/
//----------------------------------------------------------------------
#ifdef GRID_CUDA
#include <cuda_fp16.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_fp16.h>
#endif
namespace Grid {
@ -142,7 +147,7 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h)
{
float f;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
f = __half2float(h);
#else
//f = __half2float(h);
@ -156,7 +161,7 @@ accelerator_inline float half2float(half h)
accelerator_inline half float2half(float f)
{
half h;
#ifdef __CUDA_ARCH__
#ifdef GRID_SIMT
h = __float2half(f);
#else
Grid_half hh = sfw_float_to_half(f);

View File

@ -31,7 +31,7 @@ directory
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
#ifdef GRID_NVCC
#if defined(GRID_CUDA) || defined(GRID_HIP)
#include <thrust/complex.h>
#endif
@ -65,7 +65,7 @@ typedef RealD Real;
typedef RealF Real;
#endif
#ifdef GRID_NVCC
#if defined(GRID_CUDA) || defined(GRID_HIP)
typedef thrust::complex<RealF> ComplexF;
typedef thrust::complex<RealD> ComplexD;
typedef thrust::complex<Real> Complex;

View File

@ -67,7 +67,8 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
{
int num=table.size();
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View();
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_forNB( i,num, vobj::Nsimd(), {
typedef decltype(coalescedRead(buffer[0])) compressed_t;
compressed_t tmp_c;
@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
coalescedWrite(buffer[off+o],tmp_c);
});
rhs_v.ViewClose();
// Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table
}
@ -94,7 +96,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View();
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
@ -104,10 +106,11 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
so+tp[2*j+1].second,
type);
});
rhs_v.ViewClose();
}
struct StencilEntry {
#ifdef GRID_NVCC
#ifdef GRID_CUDA
uint64_t _byte_offset; // 8 bytes
uint32_t _offset; // 4 bytes
#else
@ -122,7 +125,7 @@ struct StencilEntry {
// Could pack to 8 + 4 + 4 = 128 bit and use
template<class vobj,class cobj,class Parameters>
class CartesianStencilView {
class CartesianStencilAccelerator {
public:
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
@ -130,14 +133,15 @@ class CartesianStencilView {
////////////////////////////////////////
// Basic Grid and stencil info
////////////////////////////////////////
int _checkerboard;
int _npoints; // Move to template param?
int _checkerboard;
int _npoints; // Move to template param?
int _osites;
StencilVector _directions;
StencilVector _distances;
StencilVector _comm_buf_size;
StencilVector _permute_type;
StencilVector same_node;
Coordinate _simd_layout;
Coordinate _simd_layout;
Parameters parameters;
StencilEntry* _entries_p;
cobj* u_recv_buf_p;
@ -175,13 +179,43 @@ class CartesianStencilView {
{
Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
}
};
template<class vobj,class cobj,class Parameters>
class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
{
private:
int *closed;
StencilEntry *cpu_ptr;
ViewMode mode;
public:
// default copy constructor
CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
cpu_ptr(this->_entries_p),
mode(_mode)
{
this->_entries_p =(StencilEntry *)
MemoryManager::ViewOpen(this->_entries_p,
this->_npoints*this->_osites*sizeof(StencilEntry),
mode,
AdviseDefault);
}
void ViewClose(void)
{
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
////////////////////////////////////////
// The Stencil Class itself
////////////////////////////////////////
template<class vobj,class cobj,class Parameters>
class CartesianStencil : public CartesianStencilView<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
typedef typename cobj::vector_type vector_type;
@ -226,8 +260,8 @@ public:
// Generalise as required later if needed
////////////////////////////////////////////////////////////////////////
View_type View(void) const {
View_type accessor(*( (View_type *) this));
View_type View(ViewMode mode) const {
View_type accessor(*( (View_type *) this),mode);
return accessor;
}
@ -662,9 +696,9 @@ public:
_unified_buffer_size=0;
surface_list.resize(0);
int osites = _grid->oSites();
this->_osites = _grid->oSites();
_entries.resize(this->_npoints* osites);
_entries.resize(this->_npoints* this->_osites);
this->_entries_p = &_entries[0];
for(int ii=0;ii<npoints;ii++){

View File

@ -31,22 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
//accelerator_inline void SIMTsynchronise(void)
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
// __syncthreads();
__syncwarp();
#endif
return;
}
#ifndef __CUDA_ARCH__
#ifndef GRID_SIMT
//////////////////////////////////////////
// Trivial mapping of vectors on host
//////////////////////////////////////////
accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific
template<class vobj> accelerator_inline
vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
{
@ -66,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
// vstream(vec, extracted);
vec = extracted;
}
template<class vobj> accelerator_inline
@ -75,25 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
vstream(vec, extracted);
}
#else
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
//////////////////////////////////////////
// Extract and insert slices on the GPU
//////////////////////////////////////////
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd()))
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
return extractLane(lane,vec);
}
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd()))
typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
int mask = vobj::Nsimd() >> (ptype + 1);
int plane= doperm ? lane ^ mask : lane;
return extractLane(plane,vec);
}
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd()))
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
insertLane(lane,vec,extracted);
}

View File

@ -59,6 +59,20 @@ class GridTensorBase {};
using DoublePrecision2= typename Traits::DoublePrecision2; \
static constexpr int TensorLevel = Traits::TensorLevel
///////////////////////////////////////////////////////////
// Allows to turn scalar<scalar<scalar<double>>>> back to double.
///////////////////////////////////////////////////////////
template <class T>
accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
TensorRemove(T arg) {
return arg;
}
template <class vtype>
accelerator_inline auto TensorRemove(iScalar<vtype> arg)
-> decltype(TensorRemove(arg._internal)) {
return TensorRemove(arg._internal);
}
template <class vtype>
class iScalar {
public:
@ -135,9 +149,10 @@ public:
operator ComplexD() const {
return (TensorRemove(_internal));
}
// instantiation of "Grid::iScalar<vtype>::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, <unnamed>=0, <unnamed>=0U]"
template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> accelerator_inline
operator RealD() const {
return TensorRemove(_internal);
return (RealD) TensorRemove(_internal);
}
template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> accelerator_inline
operator Integer() const {
@ -169,20 +184,6 @@ public:
strong_inline scalar_type * end() { return begin() + Traits::count; }
};
///////////////////////////////////////////////////////////
// Allows to turn scalar<scalar<scalar<double>>>> back to double.
///////////////////////////////////////////////////////////
template <class T>
accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
TensorRemove(T arg) {
return arg;
}
template <class vtype>
accelerator_inline auto TensorRemove(iScalar<vtype> arg)
-> decltype(TensorRemove(arg._internal)) {
return TensorRemove(arg._internal);
}
template <class vtype, int N>
class iVector {
public:

View File

@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
// Specialisation: Cayley-Hamilton exponential for SU(3)
#ifndef GRID_NVCC
#ifndef GRID_CUDA
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
{

207
Grid/threads/Accelerator.cc Normal file
View File

@ -0,0 +1,207 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
void acceleratorInit(void)
{
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
cudaDeviceProp prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) {
printf("AcceleratorCudaInit: ========================\n");
printf("AcceleratorCudaInit: Device Number : %d\n", i);
printf("AcceleratorCudaInit: ========================\n");
printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
}
#endif
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
void acceleratorInit(void)
{
int nDevices = 1;
hipGetDeviceCount(&nDevices);
gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device Number : %d\n", i);
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
// GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n");
hipSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
}
#endif
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
auto devices = cl::sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){
#define GPU_PROP_STR(prop) \
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
#define GPU_PROP_FMT(prop,FMT) \
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
GPU_PROP_STR(vendor);
GPU_PROP_STR(version);
// GPU_PROP_STR(device_type);
/*
GPU_PROP(max_compute_units);
GPU_PROP(native_vector_width_char);
GPU_PROP(native_vector_width_short);
GPU_PROP(native_vector_width_int);
GPU_PROP(native_vector_width_long);
GPU_PROP(native_vector_width_float);
GPU_PROP(native_vector_width_double);
GPU_PROP(native_vector_width_half);
GPU_PROP(address_bits);
GPU_PROP(half_fp_config);
GPU_PROP(single_fp_config);
*/
// GPU_PROP(double_fp_config);
GPU_PROP(global_mem_size);
}
if ( world_rank == 0 ) {
auto name = theGridAccelerator->get_device().get_info<sycl::info::device::name>();
printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str());
printf("AcceleratorSyclInit: ================================================\n");
}
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
void acceleratorInit(void){}
#endif
NAMESPACE_END(Grid);

426
Grid/threads/Accelerator.h Normal file
View File

@ -0,0 +1,426 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Accelerator.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <string.h>
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading if not CUDA or SYCL
//////////////////////////////////////////////////////////////////////////////////
//
// Function attributes
//
// accelerator
// accelerator_inline
//
// Parallel looping
//
// accelerator_for
// accelerator_forNB
// uint32_t accelerator_barrier(); // device synchronise
//
// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
//
// uint32_t acceleratorThreads(void);
// void acceleratorThreads(uint32_t);
//
// Warp control and info:
//
// acceleratorInit;
// void acceleratorSynchronise(void); // synch warp etc..
// int acceleratorSIMTlane(int Nsimd);
//
// Memory management:
//
// void *acceleratorAllocShared(size_t bytes);
// void acceleratorFreeShared(void *ptr);
//
// void *acceleratorAllocDevice(size_t bytes);
// void acceleratorFreeDevice(void *ptr);
//
// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);
// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
//
//////////////////////////////////////////////////////////////////////////////////
uint32_t acceleratorThreads(void);
void acceleratorThreads(uint32_t);
void acceleratorInit(void);
//////////////////////////////////////////////
// CUDA acceleration
//////////////////////////////////////////////
#ifdef GRID_CUDA
#ifdef __CUDA_ARCH__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return threadIdx.z;
#else
return 0;
#endif
} // CUDA specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
int nt=acceleratorThreads(); \
dim3 cu_threads(acceleratorThreads(),1,nsimd); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
uint64_t z = threadIdx.z;
if ( (x < num1) && (y<num2) && (z<num3) ) {
Lambda(x,y,z);
}
}
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("Cuda error %s \n", cudaGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
} \
}
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
#endif
//////////////////////////////////////////////
// SyCL acceleration
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
NAMESPACE_BEGIN(Grid);
extern cl::sycl::queue *theGridAccelerator;
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
#endif
#define accelerator
#define accelerator_inline strong_inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
#else
return 0;
#endif
} // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
cgh.parallel_for<class dslash>( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) mutable { \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
auto lane = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
#define accelerator_barrier(dummy) theGridAccelerator->wait();
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
#endif
//////////////////////////////////////////////
// HIP acceleration
//////////////////////////////////////////////
#ifdef GRID_HIP
NAMESPACE_END(Grid);
#include <hip/hip_runtime.h>
NAMESPACE_BEGIN(Grid);
#ifdef __HIP_DEVICE_COMPILE__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return hipThreadIdx_z;
#else
return 0;
#endif
} // HIP specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane ) mutable { \
{ __VA_ARGS__;} \
}; \
int nt=acceleratorThreads(); \
dim3 hip_threads(nt,1,nsimd); \
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
}
#define accelerator_barrier(dummy) \
{ \
hipDeviceSynchronize(); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
inline void *acceleratorAllocShared(size_t bytes)
{
#if 0
void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
}
return ptr;
#else
return malloc(bytes);
#endif
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ free(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
#endif
//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////
#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#undef GRID_SIMT
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif // CPU target
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocCpu(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeCpu (void *ptr){_mm_free(ptr);};
#else
inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeCpu (void *ptr){free(ptr);};
#endif
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////
accelerator_inline void acceleratorSynchronise(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncwarp();
#endif
#ifdef GRID_SYCL
// No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}
accelerator_inline void acceleratorSynchroniseAll(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncthreads();
#endif
#ifdef GRID_SYCL
// No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}
accelerator_inline void acceleratorFence(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__threadfence();
#endif
#ifdef GRID_SYCL
// FIXMEE
#endif
#ifdef GRID_HIP
__threadfence();
#endif
#endif
return;
}
NAMESPACE_END(Grid);

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Threads.h
Source file: ./lib/Pragmas.h
Copyright (C) 2015
@ -28,107 +28,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
#define strong_inline __attribute__((always_inline)) inline
#define UNROLL _Pragma("unroll")
//////////////////////////////////////////////////////////////////////////////////
// New primitives; explicit host thread calls, and accelerator data parallel calls
//////////////////////////////////////////////////////////////////////////////////
#ifdef _OPENMP
#define GRID_OMP
#include <omp.h>
#endif
#ifdef GRID_OMP
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
#else
#define DO_PRAGMA_(x)
#define DO_PRAGMA(x)
#define thread_num(a) (0)
#define thread_max(a) (1)
#endif
#define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
#define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse( N , i, num, ... ) DO_PRAGMA(omp parallel for collapse ( N ) ) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse_in_region( N , i, num, ... ) DO_PRAGMA(omp for collapse ( N )) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_region DO_PRAGMA(omp parallel)
#define thread_critical DO_PRAGMA(omp critical)
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading
//////////////////////////////////////////////////////////////////////////////////
#ifdef __NVCC__
#define GRID_NVCC
#endif
#ifdef GRID_NVCC
extern uint32_t gpu_threads;
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
template<typename lambda> __global__
void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
uint64_t isite = threadIdx.y;
uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
if ( (osite <Osites) && (isite<Isites) ) {
Lambda(isite,osite);
}
}
/////////////////////////////////////////////////////////////////
// Internal only really... but need to call when
/////////////////////////////////////////////////////////////////
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("Cuda error %s \n", cudaGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
// Copy the for_each_n style ; Non-blocking variant
#define accelerator_forNB( iterator, num, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(gpu_threads,nsimd); \
dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads); \
LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
}
// Copy the for_each_n style ; Non-blocking variant (default
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#else
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#endif
#include <Grid/threads/Threads.h>
#include <Grid/threads/Accelerator.h>

View File

@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/ThreadReduction.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
// Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid);
class GridThread {
public:
static int _threads;
static int _hyperthreads;
static int _cores;
static void SetCores(int cr) {
#ifdef GRID_OMP
_cores = cr;
#else
_cores = 1;
#endif
}
static void SetThreads(int thr) {
#ifdef GRID_OMP
_threads = MIN(thr,omp_get_max_threads()) ;
omp_set_num_threads(_threads);
#else
_threads = 1;
#endif
};
static void SetMaxThreads(void) {
#ifdef GRID_OMP
_threads = omp_get_max_threads();
omp_set_num_threads(_threads);
#else
_threads = 1;
#endif
};
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
static int GetCores(void) { return _cores; };
static int GetThreads(void) { return _threads; };
static int SumArraySize(void) {return _threads;};
static void GetWork(int nwork, int me, int & mywork, int & myoff){
GetWork(nwork,me,mywork,myoff,_threads);
}
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
me = ThreadBarrier();
GetWork(nwork,me,mywork,myoff);
};
static int ThreadBarrier(void) {
#ifdef GRID_OMP
#pragma omp barrier
return omp_get_thread_num();
#else
return 0;
#endif
};
template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
sum_array[me] = val;
val=Zero();
ThreadBarrier();
for(int i=0;i<_threads;i++) val+= sum_array[i];
ThreadBarrier();
}
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}
};
NAMESPACE_END(Grid);

View File

@ -28,101 +28,47 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
// Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid);
class GridThread {
public:
static int _threads;
static int _hyperthreads;
static int _cores;
static void SetCores(int cr) {
#ifdef GRID_OMP
_cores = cr;
#else
_cores = 1;
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
}
static void SetThreads(int thr) {
#ifdef GRID_OMP
_threads = MIN(thr,omp_get_max_threads()) ;
omp_set_num_threads(_threads);
#else
_threads = 1;
#define strong_inline __attribute__((always_inline)) inline
#define UNROLL _Pragma("unroll")
//////////////////////////////////////////////////////////////////////////////////
// New primitives; explicit host thread calls, and accelerator data parallel calls
//////////////////////////////////////////////////////////////////////////////////
#ifdef _OPENMP
#define GRID_OMP
#include <omp.h>
#endif
};
static void SetMaxThreads(void) {
#ifdef GRID_OMP
_threads = omp_get_max_threads();
omp_set_num_threads(_threads);
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
#else
_threads = 1;
#define DO_PRAGMA_(x)
#define DO_PRAGMA(x)
#define thread_num(a) (0)
#define thread_max(a) (1)
#endif
};
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
static int GetCores(void) { return _cores; };
static int GetThreads(void) { return _threads; };
static int SumArraySize(void) {return _threads;};
static void GetWork(int nwork, int me, int & mywork, int & myoff){
GetWork(nwork,me,mywork,myoff,_threads);
}
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
me = ThreadBarrier();
GetWork(nwork,me,mywork,myoff);
};
static int ThreadBarrier(void) {
#ifdef GRID_OMP
#pragma omp barrier
return omp_get_thread_num();
#else
return 0;
#endif
};
template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
sum_array[me] = val;
val=Zero();
ThreadBarrier();
for(int i=0;i<_threads;i++) val+= sum_array[i];
ThreadBarrier();
}
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}
};
NAMESPACE_END(Grid);
#define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for2d( i1, n1,i2,n2, ... ) \
DO_PRAGMA(omp parallel for collapse(2)) \
for ( uint64_t i1=0;i1<n1;i1++) { \
for ( uint64_t i2=0;i2<n2;i2++) { \
{ __VA_ARGS__ } ; \
}}
#define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
#define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse( N , i, num, ... ) DO_PRAGMA(omp parallel for collapse ( N ) ) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse_in_region( N , i, num, ... ) DO_PRAGMA(omp for collapse ( N )) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_region DO_PRAGMA(omp parallel)
#define thread_critical DO_PRAGMA(omp critical)

View File

@ -52,14 +52,14 @@ public:
accelerator_inline size_type size(void) const { return _size; };
accelerator_inline void clear(void) { resize(0);}
accelerator_inline void resize(size_type sz) {
#ifndef GRID_HIP
assert(sz>=0);
assert(sz<=MaxEntries);
#endif
_size = sz;
}
accelerator_inline void resize(size_type sz,const value &val) {
assert(sz>=0);
assert(sz<=MaxEntries);
_size = sz;
resize(sz);
for(int s=0;s<sz;s++) _data[s]=val;
}
accelerator_inline pointer begin(void) { return &_data[0]; }
@ -67,7 +67,7 @@ public:
accelerator_inline pointer end (void) { return &_data[_size]; }
accelerator_inline const_pointer end (void) const { return &_data[_size]; }
accelerator_inline void push_back(const value &val) { resize(_size+1); _data[_size-1] = val;}
accelerator_inline AcceleratorVector() { _size = 0; }
accelerator_inline AcceleratorVector() { resize(0); }
accelerator_inline AcceleratorVector(size_type sz) { resize(sz); }
accelerator_inline AcceleratorVector(size_type sz,const value &val) { resize(sz,val); }
AcceleratorVector(const std::vector<value> &copyme) {

View File

@ -73,8 +73,6 @@ feenableexcept (unsigned int excepts)
}
#endif
uint32_t gpu_threads=8;
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
@ -192,16 +190,12 @@ void GridParseLayout(char **argv,int argc,
assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]);
}
if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
std::vector<int> gputhreads(0);
#ifndef GRID_NVCC
std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
<< " not compiled with GPU support" << std::endl;
#endif
arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads");
arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
GridCmdOptionIntVector(arg,gputhreads);
assert(gputhreads.size()==1);
gpu_threads=gputhreads[0];
acceleratorThreads(gputhreads[0]);
}
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
@ -241,8 +235,6 @@ static int Grid_is_initialised;
/////////////////////////////////////////////////////////
void GridBanner(void)
{
static int printed =0;
if( !printed ) {
std::cout <<std::endl;
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
@ -278,67 +270,6 @@ void GridBanner(void)
std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
#endif
std::cout << std::endl;
printed=1;
}
}
#ifdef GRID_NVCC
cudaDeviceProp *gpu_props;
#endif
void GridGpuInit(void)
{
#ifdef GRID_NVCC
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ( world_rank == 0 ) {
GridBanner();
}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
if ( world_rank == 0 ) {
printf("GpuInit: ================================================\n");
}
#endif
}
void Grid_init(int *argc,char ***argv)
@ -353,9 +284,7 @@ void Grid_init(int *argc,char ***argv)
//////////////////////////////////////////////////////////
// Early intialisation necessities without rank knowledge
//////////////////////////////////////////////////////////
GridGpuInit(); // Must come first to set device prior to MPI init
PointerCache::Init();
acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
int MB;
@ -365,6 +294,14 @@ void Grid_init(int *argc,char ***argv)
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
GridCmdOptionInt(arg,MB);
uint64_t MB64 = MB;
MemoryManager::DeviceMaxBytes = MB64*1024LL*1024LL;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--hypercube") ){
int enable;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--hypercube");
@ -381,6 +318,11 @@ void Grid_init(int *argc,char ***argv)
Grid_debug_handler_init();
}
//////////////////////////////////////////////////////////
// Memory manager
//////////////////////////////////////////////////////////
MemoryManager::Init();
//////////////////////////////////////////////////////////
// MPI initialisation
//////////////////////////////////////////////////////////
@ -419,11 +361,18 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
/////////////////////////////////////////////////////////
// Reporting
/////////////////////////////////////////////////////////
std::cout << GridLogMessage << "Requested "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
if ( GlobalSharedMemory::Hugepages) {
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
}
#ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
MemoryProfiler::debug = true;

View File

@ -237,9 +237,9 @@ public:
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=rn;
LatticeVec x(&Grid); x=rn;
LatticeVec y(&Grid); y=rn;
LatticeVec z(&Grid); z=Zero();
LatticeVec x(&Grid); x=Zero();
LatticeVec y(&Grid); y=Zero();
double a=2.0;
uint64_t Nloop=NLOOP;
@ -247,9 +247,9 @@ public:
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v , x, CpuWrite);
autoView( y_v , y, CpuWrite);
autoView( z_v , z, CpuRead);
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
y_v[4]=z_v[4];
}

View File

@ -21,7 +21,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
@ -129,8 +129,8 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
auto Umu5d_v = Umu5d.View();
auto Umu_v = Umu.View();
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
@ -258,8 +258,8 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
}
@ -268,8 +268,8 @@ int main (int argc, char ** argv)
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}

Some files were not shown because too many files have changed in this diff Show More