1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-18 07:47:06 +01:00

Compare commits

..

79 Commits

Author SHA1 Message Date
3aab983760 Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) 2020-11-16 17:13:58 +01:00
9c4dcc5ea3 Merge branch 'master' into develop 2020-11-16 16:34:57 +01:00
a1063ddbb9 Update options and simplify 2020-11-13 04:11:03 +01:00
18ef8056ec Hide Shared Memory 2020-11-13 04:10:40 +01:00
1c673977fa Must ask for COMMMS_THREADS 2020-11-13 03:59:36 +01:00
e9bc748828 Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel 2020-11-13 03:58:34 +01:00
f48156529b Work on 2,2,2,8 ranks 2020-11-13 03:57:58 +01:00
d05ce01809 TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED 2020-11-13 03:52:19 +01:00
cf23eff60e Device to Device, Memset, cannot assume UVM == Communicable 2020-11-13 03:51:08 +01:00
6e313575be Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac 2020-11-13 03:50:16 +01:00
b13d1f7238 TOFU compat flag to help Isaaku 2020-11-13 03:49:44 +01:00
b5e7945dd9 Option for host or device Cshift implementation 2020-11-13 01:38:54 +01:00
7535566f54 Option for bounce through the SHM buffer 2020-11-12 22:54:27 +01:00
50b808ab33 Configure option between host and device 2020-11-12 22:28:12 +01:00
f16c2665f5 Host memory explict 2020-11-12 20:29:58 +01:00
41e28015ae Volume divisible guarantee 2020-11-07 13:32:16 +01:00
a0ccbb3bd6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-01 01:16:35 +00:00
5eeabaa2bb HIP fix 2020-11-01 01:16:01 +00:00
00d0d6d008 Hip Free managed 2020-10-31 18:14:31 -04:00
537a9f7030 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-31 18:13:30 -04:00
cc9c993f74 Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt 2020-10-31 18:12:47 -04:00
d10422ded8 Test project on group 2020-10-31 18:12:30 -04:00
f313565a3c HiP compile 2020-10-31 12:12:40 +00:00
61d5860b46 Merge pull request #318 from rrhodgson/feature/BaryonSpinMat
Added untraced baryon contraction code
2020-10-28 18:39:59 +00:00
52d17987dc BaryonUtils.h updated debug output 2020-10-23 11:41:08 +01:00
19d8bba97d BaryonUtils function naming change 2020-10-21 11:58:53 +01:00
463d72d322 Added untraced baryon contraction code 2020-10-19 16:13:28 +01:00
3362f8dfa0 happy compile 2020-10-14 22:59:41 -04:00
bf3c9857e0 Closure changes 2020-10-14 21:37:14 -04:00
a88b3ceca5 Closure cases 2020-10-14 21:33:51 -04:00
aa135412f5 toComplex, toReal 2020-10-13 22:25:01 -04:00
9945399e60 Reaality issues fix by drop from ET 2020-10-13 22:24:32 -04:00
5eeffa49e8 Reality forced included 2020-10-13 22:23:57 -04:00
3f06209720 Pretty print 2020-10-13 22:18:51 -04:00
12e239dd9f Merge branch 'release/dirac-ITT-2020' 2020-10-13 13:38:29 -04:00
af2301afbb Merge pull request #312 from i-kanamori/debug_512
add reordring of random number generators in IO
2020-10-13 11:42:12 -04:00
f98856a26f Merge pull request #314 from smangham/issue_readme_precision
Fix for deprecated configure options in documentation (issue #313)
2020-10-13 11:41:38 -04:00
d55cc5b380 Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision. 2020-10-12 12:33:13 +01:00
c2b688abc9 Benchmark_IO: reducing max local volume to 32^4 2020-10-10 16:52:56 +01:00
b0d61b9687 Benchmark_IO cleaner output 2020-10-09 21:46:45 +01:00
5f893bf9af Benchmark_IO procurement sizes 2020-10-09 21:31:59 +01:00
0e17bd6597 I/O benchmark cleanup 2020-10-09 20:29:57 +01:00
22caa158cc multi-pass I/O benchmark, with statistic and robustness summary 2020-10-09 20:29:40 +01:00
b24a504d7c hook to access last parallel I/O performance measurement 2020-10-09 20:28:54 +01:00
992ef6e9fc more runtime 2020-10-08 22:19:20 -04:00
f32a320bc3 Single prec benchmark in double prec compile 2020-10-08 19:52:08 -04:00
5f0fe029d2 Improve meemory benchmarks for GPU (avoid host mem ping pong) 2020-10-08 19:51:28 -04:00
6b1486e89b fixing number of colours defaulting to 4 in most cases 2020-10-08 16:31:24 +01:00
3f9c427a3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-07 13:12:57 -04:00
d201277652 Expose Nc as a compile time configure option.
Remove precision option
2020-10-07 13:07:00 -04:00
fdda7cf9cf Merge branch 'feature/benchmark-io-update' into develop 2020-10-07 15:57:53 +01:00
e22d30f715 Merge branch 'develop' into feature/benchmark-io-update 2020-10-07 15:56:39 +01:00
1ba25a0d8c more I/O benchmark code cleaning 2020-10-07 15:38:41 +01:00
9ba3647bdf script to convert I/O benchmark logs to CSV 2020-10-07 15:35:03 +01:00
5ee832f738 I/O benchmark code cleaning 2020-10-07 15:31:51 +01:00
35a69a5133 SU4 x SU4 2020-10-06 21:48:35 -04:00
e9c5a271a8 fixing potential issues with log alignment and timer I/O 2020-10-06 17:58:16 +01:00
acac2d6938 standard C/C++ I/O in benchmark 2020-10-06 17:57:00 +01:00
97db2b8d20 add reordring of random number generator in IO 2020-10-06 17:25:59 +09:00
ace9cd64bb dpcpp happy 2020-09-29 08:03:46 -07:00
a3e2aeb603 dpcpp options happiness 2020-09-29 06:50:10 -07:00
049dd25785 Revert accidental commit thanks michael 2020-09-23 04:13:50 -04:00
d43d372294 Merge pull request #311 from mmphys/bugfix/MPIasynch
Asynchronous calls removed - reflect this in Communicator_none.cc
2020-09-22 10:41:48 -04:00
b71a081cba Asynchronous calls removed - reflect this in Communicator_none.cc
(Opportunistic doc update - OpenMP support on Mac OS)
2020-09-21 09:33:23 +01:00
c48909590b MPI asynch call removal 2020-09-17 20:47:32 +01:00
446ef40570 HIP IPC 2020-09-17 20:31:46 +01:00
81441e98f4 HIP runs sensible 2020-09-16 03:35:03 +01:00
ecd3f890f5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-16 02:30:14 +01:00
1c881ce23c HIP does not like half2 visible members x and y so must define own Half2 2020-09-16 02:28:33 +01:00
dacbbdd051 Hip Happy Birthday 2020-09-16 00:37:02 +01:00
2859955a03 HIP requires "inline" 2020-09-16 00:36:13 +01:00
cc220abd1d inline for HIP 2020-09-16 00:35:38 +01:00
d1c0c0197e HipCC requires inline on definition 2020-09-16 00:35:06 +01:00
fd9424ef27 innlines required to make HIP happy 2020-09-16 00:34:32 +01:00
a5c35c4024 Make HIP / Vega happy 2020-09-16 00:33:53 +01:00
e03b64dc06 HIP default flaags to work on ROCM 2020-09-16 00:33:09 +01:00
4677c40195 HIP improvements 2020-09-16 00:32:27 +01:00
288c615782 Hip improvements 2020-09-16 00:31:50 +01:00
48e81cf6f8 Hip Pragmas 2020-09-16 00:31:03 +01:00
199 changed files with 2909 additions and 1981 deletions

View File

@ -9,11 +9,6 @@ matrix:
- os: osx - os: osx
osx_image: xcode8.3 osx_image: xcode8.3
compiler: clang compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install: before_install:
- export GRIDDIR=`pwd` - export GRIDDIR=`pwd`
@ -55,7 +50,7 @@ script:
- make -j4 - make -j4
- make install - make install
- cd $CWD/build - cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4 - make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check - make check

View File

@ -28,4 +28,7 @@
/////////////////// ///////////////////
#include "Config.h" #include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */ #endif /* GRID_STD_H */

View File

@ -34,6 +34,12 @@
#define __SYCL__REDEFINE__ #define __SYCL__REDEFINE__
#endif #endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor> #include <Grid/Eigen/unsupported/CXX11/Tensor>
@ -52,6 +58,12 @@
#pragma pop #pragma pop
#endif #endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

View File

@ -49,11 +49,13 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> fine_inner_msk(fine); Lattice<dotp> fine_inner_msk(fine);
// Multiply could be fused with innerProduct // Multiply could be fused with innerProduct
// Single block sum kernel could do both masks.
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
mult(fine_inner_msk, fine_inner,FineMask); mult(fine_inner_msk, fine_inner,FineMask);
blockSum(CoarseInner,fine_inner_msk); blockSum(CoarseInner,fine_inner_msk);
} }
class Geometry { class Geometry {
public: public:
int npoint; int npoint;
@ -78,12 +80,8 @@ public:
} }
directions [2*_d]=0; directions [2*_d]=0;
displacements[2*_d]=0; displacements[2*_d]=0;
std::cout <<GridLogMessage << "Geometry "<<std::endl;
for(int p=0;p<npoint;p++){
std::cout <<GridLogMessage << "point " <<p<<" dir "<<directions[p]<<" delta " <<displacements[p]<<std::endl;
}
} }
}; };
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
@ -104,8 +102,8 @@ public:
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid), CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid), FineGrid(_FineGrid),
checkerboard(_checkerboard), subspace(nbasis,_FineGrid),
subspace(nbasis,_FineGrid) checkerboard(_checkerboard)
{ {
}; };
@ -287,8 +285,6 @@ public:
/////////////////////// ///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
virtual std::vector<int> Directions(void) { return geom.directions; };
virtual std::vector<int> Displacements(void){ return geom.displacements; };
void M (const CoarseVector &in, CoarseVector &out) void M (const CoarseVector &in, CoarseVector &out)
{ {
conformable(_grid,in.Grid()); conformable(_grid,in.Grid());
@ -312,9 +308,6 @@ public:
int osites=Grid()->oSites(); int osites=Grid()->oSites();
autoView(st,Stencil,AcceleratorRead);
siteVector *CBp=Stencil.CommBuf();
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis; int ss = sss/nbasis;
int b = sss%nbasis; int b = sss%nbasis;
@ -325,12 +318,12 @@ public:
for(int point=0;point<geom.npoint;point++){ for(int point=0;point<geom.npoint;point++){
SE=st.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(CBp[SE->_offset]); nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
} }
acceleratorSynchronise(); acceleratorSynchronise();
@ -339,7 +332,7 @@ public:
} }
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}; };
@ -416,23 +409,38 @@ public:
MdirCalc(in,out[p],p); MdirCalc(in,out[p],p);
} }
}; };
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp) void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
{
this->MdirComms(in); this->MdirComms(in);
int ndim = in.Grid()->Nd(); int ndim = in.Grid()->Nd();
int point=-1; //////////////
for(int p=0;p<geom.npoint;p++){ // 4D action like wilson
if( (dir==geom.directions[p])&&(disp==geom.displacements[p])) point=p; // 0+ => 0
} // 0- => 1
assert(point!=-1);// Must find // 1+ => 2
// 1- => 3
// etc..
//////////////
// 5D action like DWF
// 1+ => 0
// 1- => 1
// 2+ => 2
// 2- => 3
// etc..
auto point = [dir, disp, ndim](){
if(dir == 0 and disp == 0)
return 8;
else if ( ndim==4 ) {
return (4 * dir + 1 - disp) / 2;
} else {
return (4 * (dir-1) + 1 - disp) / 2;
}
}();
std::cout <<GridLogMessage << "Mdir point "<<point<<" dir "<<dir<<" disp "<<disp <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout <<GridLogMessage << "point " <<p<<" dir "<<geom.directions[p]<<" delta " <<geom.displacements[p]<<std::endl;
}
MdirCalc(in,out,point); MdirCalc(in,out,point);
}; };
void Mdiag(const CoarseVector &in, CoarseVector &out) void Mdiag(const CoarseVector &in, CoarseVector &out)
@ -448,58 +456,10 @@ public:
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
hermitian(hermitian_), hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid) A(geom.npoint,&CoarseGrid)
{ {
}; };
void Test(Aggregation<Fobj,CComplex,nbasis> &_Aggregates,GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop)
{
typedef Lattice<Fobj> FineField;
CoarseVector Cin(_grid);
CoarseVector Cout(_grid);
CoarseVector CFout(_grid);
FineField Fin(FineGrid);
FineField Fout(FineGrid);
std::vector<int> seeds({1,2,3,4,5});
GridParallelRNG RNG(_grid); RNG.SeedFixedIntegers(seeds);
gaussian(RNG,Cin);
_Aggregates.PromoteFromSubspace(Cin,Fin);
_Aggregates.ProjectToSubspace(Cin,Fin);
std::cout << GridLogMessage<< "************ "<<std::endl;
std::cout << GridLogMessage<< " Testing M "<<std::endl;
std::cout << GridLogMessage<< "************ "<<std::endl;
// Coarse operator
this->M(Cin,Cout);
// Fine projected operator
_Aggregates.PromoteFromSubspace(Cin,Fin);
linop.Op(Fin,Fout);
_Aggregates.ProjectToSubspace(CFout,Fout);
CFout = CFout-Cout;
RealD diff = norm2(CFout);
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
assert(diff<1.0e-5);
std::cout << GridLogMessage<< "************ "<<std::endl;
std::cout << GridLogMessage<< " Testing Mdag "<<std::endl;
std::cout << GridLogMessage<< "************ "<<std::endl;
// Coarse operator
Mdag(Cin,Cout);
// Fine operator
linop.AdjOp(Fin,Fout);
_Aggregates.ProjectToSubspace(CFout,Fout);
CFout = CFout-Cout;
diff = norm2(CFout);
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
assert(diff<1.0e-5);
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace) Aggregation<Fobj,CComplex,nbasis> & Subspace)
{ {
@ -536,19 +496,8 @@ public:
CoarseScalar InnerProd(Grid()); CoarseScalar InnerProd(Grid());
std::cout << GridLogMessage<< "CoarsenMatrix Orthog " << std::endl;
// Orthogonalise the subblocks over the basis // Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace); blockOrthogonalise(InnerProd,Subspace.subspace);
std::cout << GridLogMessage<< "CoarsenMatrix Orthog done " << std::endl;
auto OpDirections = linop.Directions();
auto OpDisplacements = linop.Displacements();
std::cout<<" Coarsening an operator with "<< OpDirections.size()<<" terms "<<std::endl;
for(int p=0;p<OpDirections.size();p++) {
assert(OpDirections[p]==geom.directions[p]);
assert(OpDisplacements[p]==geom.displacements[p]);
}
// Compute the matrix elements of linop between this orthonormal // Compute the matrix elements of linop between this orthonormal
// set of vectors. // set of vectors.
@ -584,27 +533,13 @@ public:
evenmask = where(mod(bcb,2)==(Integer)0,one,zero); evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask; oddmask = one-evenmask;
/*
{
phi=Subspace.subspace[0];
linop.OpDirAll(phi,Mphi_p);
for(int p=0;p<geom.npoint-1;p++){
int dir=geom.directions[p];
int disp=geom.displacements[p];
linop.OpDir(phi,Mphi,dir,disp);
Mphi=Mphi-Mphi_p[p];
std::cout << GridLogMessage <<" Direction mapping check " <<norm2(Mphi)<<std::endl;
}
}
*/
assert(self_stencil!=-1); assert(self_stencil!=-1);
int lhermitian=hermitian;
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i]; phi=Subspace.subspace[i];
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl; // std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
linop.OpDirAll(phi,Mphi_p); linop.OpDirAll(phi,Mphi_p);
linop.OpDiag (phi,Mphi_p[geom.npoint-1]); linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
@ -615,7 +550,7 @@ public:
int dir = geom.directions[p]; int dir = geom.directions[p];
int disp = geom.displacements[p]; int disp = geom.displacements[p];
if ( (disp==-1) || (!lhermitian ) ) { if ( (disp==-1) || (!hermitian ) ) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell // Pick out contributions coming from this cell and neighbour cell
@ -633,23 +568,11 @@ public:
autoView( A_self , A[self_stencil], AcceleratorWrite); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
if ( lhermitian && (disp==-1) ) {
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
auto sft = conjugate(Cshift(oZProj,dir,1));
autoView( sft_v , sft , AcceleratorWrite);
autoView( A_pp , A[pp], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
}
}
}
} }
} }
} }
std::cout << GridLogMessage<< "CoarsenMatrix Diag "<<std::endl;
/////////////////////////////////////////// ///////////////////////////////////////////
// Faster alternate self coupling.. use hermiticity to save 2x // Faster alternate self coupling.. use hermiticity to save 2x
/////////////////////////////////////////// ///////////////////////////////////////////
@ -681,35 +604,31 @@ public:
} }
} }
if(hermitian) {
MemoryManager::PrintBytes(); std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian();
// Auto self test
Test( Subspace,FineGrid,linop);
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< "\n"<<A[p] << std::endl;
} }
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
} }
void ForceHermitian(void) {
CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
if(disp==-1) {
// Find the opposite link
for(int pp=0;pp<geom.npoint;pp++){
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
A[pp] = adj(Cshift(A[p],dir,1));
}
}
}
}
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -52,9 +52,6 @@ public:
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0; virtual void HermOp(const Field &in, Field &out)=0;
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
}; };
@ -79,9 +76,6 @@ class MdagMLinearOperator : public LinearOperatorBase<Field> {
public: public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){}; MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
@ -117,8 +111,6 @@ class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
RealD _shift; RealD _shift;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){}; ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
@ -159,8 +151,6 @@ template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> { class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
@ -192,8 +182,6 @@ template<class Matrix,class Field>
class NonHermitianLinearOperator : public LinearOperatorBase<Field> { class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
@ -267,8 +255,6 @@ template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> { class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public: public:
Matrix &_Mat; Matrix &_Mat;
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
@ -295,8 +281,6 @@ template<class Matrix,class Field>
protected: protected:
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
@ -323,8 +307,6 @@ template<class Matrix,class Field>
protected: protected:
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
@ -390,8 +372,6 @@ class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase
{ {
public: public:
Matrix& _Mat; Matrix& _Mat;
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
@ -425,8 +405,6 @@ class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Fi
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
@ -457,8 +435,6 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
Matrix& _Mat; Matrix& _Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
@ -499,8 +475,6 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Field tmp; Field tmp;
RealD mass; RealD mass;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{ {
assert( _Mat.isTrivialEE() ); assert( _Mat.isTrivialEE() );

View File

@ -48,8 +48,6 @@ public:
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0; virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
}; };
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@ -75,8 +73,6 @@ public:
virtual void MooeeDag (const Field &in, Field &out)=0; virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0; virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
NAMESPACE_BEGIN(Grid);
/* /*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv. * Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix * Script A = SolverMatrix
@ -51,54 +50,53 @@ NAMESPACE_BEGIN(Grid);
* Vout = x * Vout = x
*/ */
// abstract base
template<class Field, class CoarseField, class Aggregates> template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field> class TwoLevelFlexiblePcg : public LinearFunction<Field>
{ {
public: public:
int verbose; int verbose;
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
const int mmax = 4; const int mmax = 5;
GridBase *FineGrid; GridBase *grid;
GridBase *CoarseGrid; GridBase *coarsegrid;
LinearOperatorBase<Field> &_Linop; LinearOperatorBase<Field> *_Linop
LinearFunction<Field> &_Smoother; OperatorFunction<Field> *_Smoother,
LinearFunction<CoarseField> &_CoarseSolver; LinearFunction<CoarseField> *_CoarseSolver;
Aggregates &_Aggregates;
// Need somthing that knows how to get from Coarse to fine and back again
// more most opertor functions // more most opertor functions
TwoLevelFlexiblePcg(RealD tol, TwoLevelFlexiblePcg(RealD tol,
Integer maxit, Integer maxit,
LinearOperatorBase<Field> *Linop, LinearOperatorBase<Field> *Linop,
LinearFunction<Field> *Smoother, LinearOperatorBase<Field> *SmootherLinop,
LinearFunction<CoarseField> *CoarseSolver, OperatorFunction<Field> *Smoother,
Aggregates *AggP OperatorFunction<CoarseField> CoarseLinop
) : ) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
_Linop(*Linop), _Linop(Linop),
_Smoother(*Smoother), _PreconditionerLinop(PrecLinop),
_CoarseSolver(*CoarseSolver), _Preconditioner(Preconditioner)
_Aggregates(*AggP)
{ {
CoarseGrid=_Aggregates.CoarseGrid;
FineGrid=_Aggregates.FineGrid;
verbose=0; verbose=0;
}; };
// The Pcg routine is common to all, but the various matrices differ from derived // The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implmentation // implementation to derived implmentation
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;
// RealD rptzp; RealD rptzp;
// RealD tn; RealD tn;
RealD guess = norm2(psi); RealD guess = norm2(psi);
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance; RealD rsq = ssq*Tolerance*Tolerance;
@ -106,15 +104,15 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
///////////////////////////// /////////////////////////////
// Set up history vectors // Set up history vectors
///////////////////////////// /////////////////////////////
std::vector<Field> p (mmax,FineGrid); std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,FineGrid); std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax); std::vector<RealD> pAp(mmax);
Field x (FineGrid); x = psi; Field x (grid); x = psi;
Field z (FineGrid); Field z (grid);
Field tmp(FineGrid); Field tmp(grid);
Field r (FineGrid); Field r (grid);
Field mu (FineGrid); Field mu (grid);
////////////////////////// //////////////////////////
// x0 = Vstart -- possibly modify guess // x0 = Vstart -- possibly modify guess
@ -123,13 +121,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
Vstart(x,src); Vstart(x,src);
// r0 = b -A x0 // r0 = b -A x0
_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else? HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0 axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
////////////////////////////////// //////////////////////////////////
// Compute z = M1 x // Compute z = M1 x
////////////////////////////////// //////////////////////////////////
M1(r,z); M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
/////////////////////////////////////// ///////////////////////////////////////
@ -145,7 +143,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
int peri_kp = (k+1) % mmax; int peri_kp = (k+1) % mmax;
rtz=rtzp; rtz=rtzp;
d= M3(p[peri_k],mmp[peri_k]); d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d; a = rtz/d;
// Memorise this // Memorise this
@ -155,13 +153,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD rn = axpy_norm(r,-a,mmp[peri_k],r); RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x // Compute z = M x
M1(r,z); M1(r,z,tmp,mp);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=mu; p[peri_kp]=p[peri_k];
// Standard search direction p -> z + b p ; b = // Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz; b = (rtzp)/rtz;
@ -183,7 +181,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// Stopping condition // Stopping condition
if ( rn <= rsq ) { if ( rn <= rsq ) {
_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else? HermOp(x,mmp); // Shouldn't this be something else?
axpy(tmp,-1.0,src,mmp[0]); axpy(tmp,-1.0,src,mmp[0]);
RealD psinorm = sqrt(norm2(x)); RealD psinorm = sqrt(norm2(x));
@ -192,8 +190,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD true_residual = tmpnorm/srcnorm; RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl; std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl; std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return k;
return;
} }
} }
// Non-convergence // Non-convergence
@ -202,40 +199,48 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
public: public:
virtual void M1(Field & in, Field & out) virtual void M(Field & in,Field & out,Field & tmp) {
{// the smoother
}
virtual void M1(Field & in, Field & out) {// the smoother
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(FineGrid); Field tmp(grid);
Field Min(FineGrid); Field Min(grid);
CoarseField PleftProj(CoarseGrid); PcgM(in,Min); // Smoother call
CoarseField PleftMss_proj(CoarseGrid);
_Smoother(in,Min); // Smoother call HermOp(Min,out);
_Linop.HermOp(Min,out);
axpy(tmp,-1.0,out,in); // tmp = in - A Min axpy(tmp,-1.0,out,in); // tmp = in - A Min
_Aggregates.ProjectToSubspace(PleftProj,tmp); ProjectToSubspace(tmp,PleftProj);
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min] PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
axpy(out,1.0,Min,tmp); // Min+tmp axpy(out,1.0,Min,tmp); // Min+tmp
} }
virtual void M2(const Field & in, Field & out) virtual void M2(const Field & in, Field & out) {
{
out=in; out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
} }
virtual RealD M3(const Field & p, Field & mmp) virtual RealD M3(const Field & p, Field & mmp){
{
double d,dd; double d,dd;
_Linop.HermOpAndNorm(p,mmp,d,dd); HermOpAndNorm(p,mmp,d,dd);
return dd; return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
} }
virtual void Vstart(Field & x,const Field & src) virtual void VstartDef2(Field & xconst Field & src){
{
//case PcgDef2: //case PcgDef2:
//case PcgAdef2: //case PcgAdef2:
//case PcgAdef2f: //case PcgAdef2f:
@ -251,79 +256,142 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s // = src_s - (A guess)_s - src_s + (A guess)_s
// = 0 // = 0
/////////////////////////////////// ///////////////////////////////////
Field r(FineGrid); Field r(grid);
Field mmp(FineGrid); Field mmp(grid);
CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
_Linop.HermOp(x,mmp); HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
_Aggregates.ProjectToSubspace(PleftProj,r); ProjectToSubspace(r,PleftProj);
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} r_s ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,mmp); PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp; x=x+mmp;
} }
virtual void Vstart(Field & x,const Field & src){
return;
}
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1 // Only Def1 has non-trivial Vout. Override in Def1
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){ virtual void Vout (Field & in, Field & out,Field & src){
out = in; out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
} }
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations // Pright and Pleft are common to all implementations
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out) virtual void Pright(Field & in,Field & out){
{
// P_R = [ 1 0 ] // P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ] // [ -Mss^-1 Msb 0 ]
Field in_sbar(FineGrid); Field in_sbar(grid);
CoarseField PleftProj(CoarseGrid); ProjectToSubspace(in,PleftProj);
CoarseField PleftMss_proj(CoarseGrid); PromoteFromSubspace(PleftProj,out);
_Aggregates.ProjectToSubspace(PleftProj,in);
_Aggregates.PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
_Linop.HermOp(in_sbar,out); HermOp(in_sbar,out);
_Aggregates.ProjectToSubspace(PleftProj,out); // Mssbar in_sbar (project) ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
_CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} Mssbar ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
_Aggregates.PromoteFromSubspace(PleftMss_proj,out); // PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
} }
virtual void Pleft (Field & in,Field & out) virtual void Pleft (Field & in,Field & out){
{
// P_L = [ 1 -Mbs Mss^-1] // P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ] // [ 0 0 ]
Field in_sbar(FineGrid); Field in_sbar(grid);
Field tmp2(FineGrid); Field tmp2(grid);
Field Mtmp(FineGrid); Field Mtmp(grid);
CoarseField PleftProj(CoarseGrid); ProjectToSubspace(in,PleftProj);
CoarseField PleftMss_proj(CoarseGrid); PromoteFromSubspace(PleftProj,out);
_Aggregates.ProjectToSubspace(PleftProj,in);
_Aggregates.PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
_CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} in_s ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,out); PromoteFromSubspace(PleftMss_proj,out);
_Linop.HermOp(out,Mtmp); HermOp(out,Mtmp);
_Aggregates.ProjectToSubspace(PleftProj,Mtmp); // Msbar s Mss^{-1} ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
_Aggregates.PromoteFromSubspace(PleftProj,tmp2); PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp); axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
} }
}; }
NAMESPACE_END(Grid);
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif #endif

View File

@ -60,8 +60,6 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
RealD t=-usecond();
guess = Zero(); guess = Zero();
assert(evec.size()==eval.size()); assert(evec.size()==eval.size());
auto N = evec.size(); auto N = evec.size();
@ -70,8 +68,6 @@ public:
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
} }
guess.Checkerboard() = src.Checkerboard(); guess.Checkerboard() = src.Checkerboard();
t+=usecond();
std::cout<<GridLogMessage<<"\t\t\t" << "Deflated guess took "<< t/1000.0<< "ms" <<std::endl;
} }
}; };

View File

@ -59,7 +59,7 @@ public:
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid); Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=Zero(); psi=zero;
r = src; r = src;
Preconditioner(r,p); Preconditioner(r,p);

View File

@ -53,11 +53,7 @@ public:
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
#ifdef GRID_UVM
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
#else
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
#endif
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@ -66,11 +62,7 @@ public:
{ {
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes); profilerFree(bytes);
#ifdef GRID_UVM
MemoryManager::SharedFree((void *)__p,bytes);
#else
MemoryManager::CpuFree((void *)__p,bytes); MemoryManager::CpuFree((void *)__p,bytes);
#endif
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
@ -173,9 +165,17 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//template<class T> using commAllocator = devAllocator<T>; #ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >; template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -9,13 +9,11 @@ NAMESPACE_BEGIN(Grid);
#define AccSmall (3) #define AccSmall (3)
#define Shared (4) #define Shared (4)
#define SharedSmall (5) #define SharedSmall (5)
uint64_t total_cache;
uint64_t total_shared; uint64_t total_shared;
uint64_t total_device; uint64_t total_device;
uint64_t total_host;; uint64_t total_host;;
void MemoryManager::PrintBytes(void) void MemoryManager::PrintBytes(void)
{ {
std::cout << " MemoryManager : "<<total_cache <<" cache bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl; std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl; std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl; std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
@ -37,8 +35,6 @@ void *MemoryManager::AcceleratorAllocate(size_t bytes)
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes); ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes; total_device+=bytes;
} else {
// std::cout <<"AcceleratorAllocate: cache hit Device pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
} }
return ptr; return ptr;
} }
@ -57,10 +53,8 @@ void *MemoryManager::SharedAllocate(size_t bytes)
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes); ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes; total_shared+=bytes;
// std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl; // std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes(); // PrintBytes();
} else {
// std::cout <<"SharedAllocate: cache hit Shared pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
} }
return ptr; return ptr;
} }
@ -80,9 +74,6 @@ void *MemoryManager::CpuAllocate(size_t bytes)
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes); ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes; total_host+=bytes;
// std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
} else {
// std::cout <<"CpufAllocate: cache hit Cpu pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
} }
return ptr; return ptr;
} }
@ -129,7 +120,7 @@ void MemoryManager::Init(void)
str= getenv("GRID_ALLOC_NCACHE_LARGE"); str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) { if ( str ) {
Nc = atoi(str); Nc = atoi(str);
if ( (Nc>=0) && (Nc <= NallocCacheMax)) { if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc; Ncache[Cpu]=Nc;
Ncache[Acc]=Nc; Ncache[Acc]=Nc;
Ncache[Shared]=Nc; Ncache[Shared]=Nc;
@ -139,7 +130,7 @@ void MemoryManager::Init(void)
str= getenv("GRID_ALLOC_NCACHE_SMALL"); str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) { if ( str ) {
Nc = atoi(str); Nc = atoi(str);
if ( (Nc>=0) && (Nc <= NallocCacheMax)) { if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc; Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc; Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc; Ncache[SharedSmall]=Nc;
@ -220,7 +211,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
if ( entries[v].valid ) { if ( entries[v].valid ) {
ret = entries[v].address; ret = entries[v].address;
total_cache-=entries[v].bytes;
entries[v].valid = 0; entries[v].valid = 0;
entries[v].address = NULL; entries[v].address = NULL;
entries[v].bytes = 0; entries[v].bytes = 0;
@ -229,7 +219,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
entries[v].address=ptr; entries[v].address=ptr;
entries[v].bytes =bytes; entries[v].bytes =bytes;
entries[v].valid =1; entries[v].valid =1;
total_cache+=entries[v].bytes;
return ret; return ret;
} }
@ -254,7 +243,6 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
for(int e=0;e<ncache;e++){ for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0; entries[e].valid = 0;
total_cache-=bytes;
return entries[e].address; return entries[e].address;
} }
} }

View File

@ -93,8 +93,8 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
public:
static void PrintBytes(void); static void PrintBytes(void);
public:
static void Init(void); static void Init(void);
static void InitMessage(void); static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes); static void *AcceleratorAllocate(size_t bytes);

View File

@ -138,21 +138,6 @@ public:
int recv_from_rank, int recv_from_rank,
int bytes); int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,

View File

@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori #ifndef GRID_COMMS_THREADS
nCommThreads=1; nCommThreads=1;
// wrong results here too // wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor); assert(from != _processor);
assert(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag;
if ( gfrom ==MPI_UNDEFINED) { if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq); tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( gdest == MPI_UNDEFINED ) { if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq); tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;

View File

@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node // Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit, void CartesianCommunicator::SendToRecvFrom(void *xmit,
@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
bcopy(in,out,bytes*words); bcopy(in,out,bytes*words);
@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes; return 2.0*bytes;
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
SendToRecvFromComplete(waitall);
} }
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};

View File

@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_CUDA #ifdef GRID_CUDA
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif #endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: " #define header "SharedMemoryMpi: "
@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_CUDA #if defined(GRID_CUDA) ||defined(GRID_HIP)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -448,21 +451,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE ShmCommBuf = acceleratorAllocDevice(bytes);
auto err = cudaMalloc(&ShmCommBuf, bytes);
#else
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
#endif
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl; std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ // if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; if ( 1 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
@ -475,15 +473,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////// //////////////////////////////////////////////////
// If it is me, pass around the IPC access key // If it is me, pass around the IPC access key
////////////////////////////////////////////////// //////////////////////////////////////////////////
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle; cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf); auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
////////////////////////////////////////////////// //////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm // Share this IPC handle across the Shm Comm
////////////////////////////////////////////////// //////////////////////////////////////////////////
@ -500,13 +509,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// If I am not the source, overwrite thisBuf with remote buffer // If I am not the source, overwrite thisBuf with remote buffer
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf; void * thisBuf = ShmCommBuf;
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) { if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Save a copy of the device buffers // Save a copy of the device buffers
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -752,20 +772,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT #ifdef GRID_SHM_DISABLE
// Hide the shared memory path between sockets // Hide the shared memory path between ranks
// if even number of nodes {
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
for(int r=0;r<size;r++){ for(int r=0;r<size;r++){
int hisRank=ShmRanks[r]; ShmRanks[r] = MPI_UNDEFINED;
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
} }
} }
#endif #endif

View File

@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
} }
} }
{ {
autoView(rhs_v , rhs, AcceleratorRead);
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){ if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{ accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1; int o = n*n1;
@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else { } else {
autoView(rhs_v , rhs, AcceleratorRead);
Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask; Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{ accelerator_for2d(n,e1,b,e2,1,{
Coordinate coor; Coordinate coor;
@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
} }
{ {
autoView( rhs_v, rhs, AcceleratorWrite);
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
} }
} }
@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
autoView( rhs_v , rhs, AcceleratorWrite);
int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for2d(n,e1,b,e2,1,{ accelerator_for2d(n,e1,b,e2,1,{
int o = n*_slice_stride; int o = n*_slice_stride;
int offset = b+n*_slice_block; int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
} }
{ {
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite); autoView(lhs_v , lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
} }
{ {
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead); autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite); autoView( lhs_v, lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
} }
} }

View File

@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@ -121,9 +122,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size); cshiftVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size); cshiftVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else { } else {
int words = send_buf.size(); int words = buffer_size;
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier(); grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf_extract[i][0], (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
cshiftVector<vobj> send_buf_v(buffer_size);
cshiftVector<vobj> recv_buf_v(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier(); grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
} else { } else {
@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
} }
} }
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a)); GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a)); GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a)); GridUnopClass(UnaryAsin, asin(a));
@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj); //GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj); //GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing // abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt); GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin); GRID_DEF_UNOP(asin, UnaryAsin);
@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
{ {
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr); Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
{ {
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr); Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1), -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2), vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))> vecEval(0, expr.arg3)))>::type >
{ {
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1), Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2), vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))> ret(expr); vecEval(0, expr.arg3)))>::type > ret(expr);
return ret; return ret;
} }
#define EXPRESSION_CLOSURE(function) \ #define EXPRESSION_CLOSURE(function) \

View File

@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
autoView( lhs_v , lhs, AcceleratorRead); autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t); mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
}); });
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
autoView( ret_v , ret, AcceleratorWrite); autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; auto tmp =ret_v(ss);
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs); mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
autoView( ret_v , ret, AcceleratorWrite); autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; auto tmp =ret_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t); mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);

View File

@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite)); basis_v.push_back(basis[k].View(AcceleratorWrite));
} }
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) )
int max_threads = thread_max(); int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads); Vector < vobj > Bt(Nm * max_threads);
thread_region thread_region
@ -161,11 +161,12 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k); for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite); autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz); auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]); B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });

View File

@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
autoView( ret_v, ret, AcceleratorWrite); autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), 1, {
coalescedWrite(ret_v[ss], adj(lhs_v(ss))); ret_v[ss] = adj(lhs_v[ss]);
}); });
return ret; return ret;
}; };
@ -64,6 +64,53 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
return ret; return ret;
}; };
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP #ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props; extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props; extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32
#endif #endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified // cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp(); acceleratorSynchronise();
const Iterator VEC = WARP_SIZE; const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1); const Iterator vid = tid & (VEC-1);
@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp; beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
} }
__syncwarp(); acceleratorSynchronise();
} }
__syncthreads(); acceleratorSynchroniseAll();
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
beta = Zero(); beta = Zero();
@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
} }
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
} }
__syncthreads(); acceleratorSynchroniseAll();
} }

View File

@ -52,7 +52,6 @@ public:
// This will be safe to call from accelerator_for and is trivially copy constructible // This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions // The copy constructor for this will need to be used by device lambda functions
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
#undef LATTICE_BOUNDS_CHECK
template<class vobj> template<class vobj>
class LatticeView : public LatticeAccelerator<vobj> class LatticeView : public LatticeAccelerator<vobj>
{ {
@ -62,36 +61,14 @@ public:
void * cpu_ptr; void * cpu_ptr;
#ifdef GRID_SIMT #ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return coalescedRead(this->_odata[i]); return coalescedRead(this->_odata[i]);
} }
#else #else
accelerator_inline const vobj & operator()(size_t i) const { accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
}
#endif #endif
accelerator_inline const vobj & operator[](size_t i) const { accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
#ifdef LATTICE_BOUNDS_CHECK accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
};
accelerator_inline vobj & operator[](size_t i) {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
};
accelerator_inline uint64_t begin(void) const { return 0;}; accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; }; accelerator_inline uint64_t end(void) const { return this->_odata_size; };

View File

@ -130,6 +130,8 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){ friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) { if ( log.active ) {
std::ios_base::fmtflags f(stream.flags());
stream << log.background()<< std::left; stream << log.background()<< std::left;
if (log.topWidth > 0) if (log.topWidth > 0)
{ {
@ -152,6 +154,8 @@ public:
<< now << log.background() << " : " ; << now << log.background() << " : " ;
} }
stream << log.colour(); stream << log.colour();
stream.flags(f);
return stream; return stream;
} else { } else {
return devnull; return devnull;

View File

@ -1,3 +1,4 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1; int Grid::BinaryIO::latticeWriteMaxRetry = -1;
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;

View File

@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO { class BinaryIO {
public: public:
struct IoPerf
{
uint64_t size{0},time{0};
double mbytesPerSecond{0.};
};
static IoPerf lastPerf;
static int latticeWriteMaxRetry; static int latticeWriteMaxRetry;
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@ -502,12 +509,15 @@ class BinaryIO {
timer.Stop(); timer.Stop();
} }
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
lastPerf.time = timer.useconds();
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
std::cout<<GridLogMessage<<"IOobject: "; std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read "; if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write "; else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank; uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" " std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl; << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
@ -663,10 +673,15 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel_rng.SetState(tmp,lidx); Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.SetState(tmp,gidx);
}); });
timer.Stop(); timer.Stop();
@ -723,7 +738,12 @@ class BinaryIO {
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel_rng.GetState(tmp,lidx); Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.GetState(tmp,gidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}); });
timer.Stop(); timer.Stop();

View File

@ -47,7 +47,7 @@ static constexpr int Ym = 5;
static constexpr int Zm = 6; static constexpr int Zm = 6;
static constexpr int Tm = 7; static constexpr int Tm = 7;
static constexpr int Nc=3; static constexpr int Nc=Config_Nc;
static constexpr int Ns=4; static constexpr int Ns=4;
static constexpr int Nd=4; static constexpr int Nd=4;
static constexpr int Nhs=2; // half spinor static constexpr int Nhs=2; // half spinor
@ -77,16 +77,9 @@ const int SpinorIndex = 2;
template<typename T> struct isSpinor { template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel); static constexpr bool value = (SpinorIndex==T::TensorLevel);
}; };
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ; template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling. // ChrisK very keen to add extra space for Gparity doubling.
// //
// Also add domain wall index, in a way where Wilson operator // Also add domain wall index, in a way where Wilson operator

View File

@ -89,8 +89,7 @@ public:
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);}; virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

View File

@ -44,9 +44,6 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels; typedef StaggeredKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }

View File

@ -49,9 +49,6 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels; typedef StaggeredKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }

View File

@ -47,9 +47,6 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
//////////////////////////////////////// ////////////////////////////////////////
// Performance monitoring // Performance monitoring
//////////////////////////////////////// ////////////////////////////////////////

View File

@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels // Generic Nc kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline template<int Naik>
static accelerator_inline
void DhopSiteGeneric(StencilView &st, void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericInt(StencilView &st, void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericExt(StencilView &st, void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels // Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHand(StencilView &st, void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandInt(StencilView &st, void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandExt(StencilView &st, void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels // Asm Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilView &st, void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,

View File

@ -63,9 +63,6 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -72,9 +72,6 @@ public:
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
PmuStat stat; PmuStat stat;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }

View File

@ -79,8 +79,6 @@ public:
_Mat.M(in,tmp); _Mat.M(in,tmp);
G5R5(out,tmp); G5R5(out,tmp);
} }
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
}; };
@ -129,8 +127,6 @@ public:
_Mat.M(in,tmp); _Mat.M(in,tmp);
out=g5*tmp; out=g5*tmp;
} }
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid); PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid); PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0; PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for (int s=0;s<Ls;s++) { for (int s=0;s<Ls;s++) {
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid); PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid); PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0; PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){

View File

@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
template <class Impl> template <class Impl>
template <int Naik> template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st, void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
template <class Impl> template <class Impl>
template <int Naik> template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
template <class Impl> template <class Impl>
template <int Naik> template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,

View File

@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap // Int, Ext, Int+Ext cases for comms overlap
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
template <int Naik> template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
// Only contributions from interior of our node // Only contributions from interior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
template <int Naik> template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
// Only contributions from exterior of our node // Only contributions from exterior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
template <int Naik> template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel // Driving / wrapping routine to select right kernel
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp) int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{ {
@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
}); });
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU, DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior) const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
} }
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");
} }
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior) const FermionField &in, FermionField &out, int dag, int interior,int exterior)

View File

@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
HAND_RESULT_EXT(ss,F) HAND_RESULT_EXT(ss,F)
#define HAND_SPECIALISE_GPARITY(IMPL) \ #define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
nmu = 0; \ nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \

View File

@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Impl> void template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> void template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> void template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT_EXT(ss); HAND_RESULT_EXT(ss);
} }
template<class Impl> template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {

View File

@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute // All legs kernels ; comms then compute
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
coalescedWrite(out[sF],result,lane); coalescedWrite(out[sF],result,lane);
}; };
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Interior kernels // Interior kernels
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
coalescedWrite(out[sF], result,lane); coalescedWrite(out[sF], result,lane);
}; };
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Exterior kernels // Exterior kernels
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
} }
}; };
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
}; };
#define DhopDirMacro(Dir,spProj,spRecon) \ #define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> \ template <class Impl> accelerator_inline \
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \ { \
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
DhopDirMacro(Zm,spProjZm,spReconZm); DhopDirMacro(Zm,spProjZm,spReconZm);
DhopDirMacro(Tm,spProjTm,spReconTm); DhopDirMacro(Tm,spProjTm,spReconTm);
template <class Impl> template <class Impl> accelerator_inline
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{ {

View File

@ -128,6 +128,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
hspin(0)=fspin(0)-fspin(2); hspin(0)=fspin(0)-fspin(2);
hspin(1)=fspin(1)-fspin(3); hspin(1)=fspin(1)-fspin(3);
} }
@ -137,50 +138,40 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
* 0 0 -1 0 * 0 0 -1 0
* 0 0 0 -1 * 0 0 0 -1
*/ */
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
hspin(0)=fspin(0); hspin(0)=fspin(0);
hspin(1)=fspin(1); hspin(1)=fspin(1);
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
hspin(0)=fspin(2); hspin(0)=fspin(2);
hspin(1)=fspin(3); hspin(1)=fspin(3);
} }
// template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
rfspin(0)=fspin(0); rfspin(0)=fspin(0);
rfspin(1)=fspin(1); rfspin(1)=fspin(1);
rfspin(2)=Zero(); rfspin(2)=Zero();
rfspin(3)=Zero(); rfspin(3)=Zero();
} }
// template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
rfspin(0)=Zero(); rfspin(0)=Zero();
rfspin(1)=Zero(); rfspin(1)=Zero();
rfspin(2)=fspin(2); rfspin(2)=fspin(2);
rfspin(3)=fspin(3); rfspin(3)=fspin(3);
} }
template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
{
const int hN = N>>1;
for(int s=0;s<hN;s++){
rfspin(s)=fspin(s);
rfspin(s+hN)=Zero();
}
}
template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
{
const int hN = N>>1;
for(int s=0;s<hN;s++){
rfspin(s)=Zero();
rfspin(s+hN)=fspin(s+hN);
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Reconstruction routines to move back again to four spin // Reconstruction routines to move back again to four spin
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -192,6 +183,7 @@ template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inlin
*/ */
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1)); fspin(2)=timesMinusI(hspin(1));
@ -199,6 +191,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)=timesI(hspin(1)); fspin(2)=timesI(hspin(1));
@ -206,6 +199,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1)); fspin(2)-=timesI(hspin(1));
@ -213,6 +207,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)+=timesI(hspin(1)); fspin(2)+=timesI(hspin(1));
@ -226,6 +221,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)= hspin(1); fspin(2)= hspin(1);
@ -233,6 +229,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)=-hspin(1); fspin(2)=-hspin(1);
@ -240,6 +237,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)+=hspin(1); fspin(2)+=hspin(1);
@ -247,6 +245,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)-=hspin(1); fspin(2)-=hspin(1);
@ -261,6 +260,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
*/ */
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(0)); fspin(2)=timesMinusI(hspin(0));
@ -268,6 +268,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)= timesI(hspin(0)); fspin(2)= timesI(hspin(0));
@ -275,6 +276,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(0)); fspin(2)-=timesI(hspin(0));
@ -282,6 +284,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)+=timesI(hspin(0)); fspin(2)+=timesI(hspin(0));
@ -295,6 +298,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
*/ */
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)=hspin(0); fspin(2)=hspin(0);
@ -302,6 +306,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0); fspin(0)=hspin(0);
fspin(1)=hspin(1); fspin(1)=hspin(1);
fspin(2)=-hspin(0); fspin(2)=-hspin(0);
@ -309,6 +314,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)+=hspin(0); fspin(2)+=hspin(0);
@ -316,6 +322,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0); fspin(0)+=hspin(0);
fspin(1)+=hspin(1); fspin(1)+=hspin(1);
fspin(2)-=hspin(0); fspin(2)-=hspin(0);
@ -329,6 +336,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
*/ */
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though
fspin(2)=Zero(); fspin(2)=Zero();
@ -336,6 +344,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=Zero(); fspin(0)=Zero();
fspin(1)=Zero(); fspin(1)=Zero();
fspin(2)=hspin(0)+hspin(0); fspin(2)=hspin(0)+hspin(0);
@ -343,6 +352,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
} }
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0)+hspin(0); fspin(0)+=hspin(0)+hspin(0);
fspin(1)+=hspin(1)+hspin(1); fspin(1)+=hspin(1)+hspin(1);
} }
@ -362,6 +372,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
////////// //////////
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjXp(hspin._internal[i],fspin._internal[i]); spProjXp(hspin._internal[i],fspin._internal[i]);
} }
@ -415,21 +426,26 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM
}} }}
} }
//////// ////////
// Xm // Xm
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjXm(hspin._internal,fspin._internal); spProjXm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjXm(hspin._internal[i],fspin._internal[i]); spProjXm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjXm(hspin._internal[i][j],fspin._internal[i][j]); spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
@ -439,16 +455,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri
template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconXm(hspin._internal,fspin._internal); spReconXm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconXm(hspin._internal[i],fspin._internal[i]); spReconXm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconXm(hspin._internal[i][j],fspin._internal[i][j]); spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
@ -457,37 +476,45 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconXm(hspin._internal,fspin._internal); accumReconXm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconXm(hspin._internal[i],fspin._internal[i]); accumReconXm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconXm(hspin._internal[i][j],fspin._internal[i][j]); accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
//////// ////////
// Yp // Yp
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjYp(hspin._internal,fspin._internal); spProjYp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjYp(hspin._internal[i],fspin._internal[i]); spProjYp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjYp(hspin._internal[i][j],fspin._internal[i][j]); spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
@ -497,16 +524,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri
template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconYp(hspin._internal,fspin._internal); spReconYp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconYp(hspin._internal[i],fspin._internal[i]); spReconYp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconYp(hspin._internal[i][j],fspin._internal[i][j]); spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
@ -515,55 +545,66 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconYp(hspin._internal,fspin._internal); accumReconYp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconYp(hspin._internal[i],fspin._internal[i]); accumReconYp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconYp(hspin._internal[i][j],fspin._internal[i][j]); accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
//////// ////////
// Ym // Ym
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjYm(hspin._internal,fspin._internal); spProjYm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjYm(hspin._internal[i],fspin._internal[i]); spProjYm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjYm(hspin._internal[i][j],fspin._internal[i][j]); spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconYm(hspin._internal,fspin._internal); spReconYm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconYm(hspin._internal[i],fspin._internal[i]); spReconYm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconYm(hspin._internal[i][j],fspin._internal[i][j]); spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@ -572,16 +613,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconYm(hspin._internal,fspin._internal); accumReconYm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconYm(hspin._internal[i],fspin._internal[i]); accumReconYm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconYm(hspin._internal[i][j],fspin._internal[i][j]); accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@ -594,57 +638,66 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjZp(hspin._internal,fspin._internal); spProjZp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjZp(hspin._internal[i],fspin._internal[i]); spProjZp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjZp(hspin._internal[i][j],fspin._internal[i][j]); spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconZp(hspin._internal,fspin._internal); spReconZp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconZp(hspin._internal[i],fspin._internal[i]); spReconZp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconZp(hspin._internal[i][j],fspin._internal[i][j]); spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconZp(hspin._internal,fspin._internal); accumReconZp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconZp(hspin._internal[i],fspin._internal[i]); accumReconZp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconZp(hspin._internal[i][j],fspin._internal[i][j]); accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
@ -653,53 +706,62 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjZm(hspin._internal,fspin._internal); spProjZm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjZm(hspin._internal[i],fspin._internal[i]); spProjZm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjZm(hspin._internal[i][j],fspin._internal[i][j]); spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconZm(hspin._internal,fspin._internal); spReconZm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconZm(hspin._internal[i],fspin._internal[i]); spReconZm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconZm(hspin._internal[i][j],fspin._internal[i][j]); spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconZm(hspin._internal,fspin._internal); accumReconZm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconZm(hspin._internal[i],fspin._internal[i]); accumReconZm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconZm(hspin._internal[i][j],fspin._internal[i][j]); accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
@ -712,35 +774,41 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjTp(hspin._internal,fspin._internal); spProjTp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjTp(hspin._internal[i],fspin._internal[i]); spProjTp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjTp(hspin._internal[i][j],fspin._internal[i][j]); spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconTp(hspin._internal,fspin._internal); spReconTp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconTp(hspin._internal[i],fspin._internal[i]); spReconTp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconTp(hspin._internal[i][j],fspin._internal[i][j]); spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
@ -749,37 +817,44 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconTp(hspin._internal,fspin._internal); accumReconTp(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconTp(hspin._internal[i],fspin._internal[i]); accumReconTp(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconTp(hspin._internal[i][j],fspin._internal[i][j]); accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
//////// ////////
// Tm // Tm
//////// ////////
template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjTm(hspin._internal,fspin._internal); spProjTm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProjTm(hspin._internal[i],fspin._internal[i]); spProjTm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProjTm(hspin._internal[i][j],fspin._internal[i][j]); spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
@ -789,16 +864,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri
template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconTm(hspin._internal,fspin._internal); spReconTm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spReconTm(hspin._internal[i],fspin._internal[i]); spReconTm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spReconTm(hspin._internal[i][j],fspin._internal[i][j]); spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
@ -807,37 +885,44 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconTm(hspin._internal,fspin._internal); accumReconTm(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumReconTm(hspin._internal[i],fspin._internal[i]); accumReconTm(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumReconTm(hspin._internal[i][j],fspin._internal[i][j]); accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
//////// ////////
// 5p // 5p
//////// ////////
template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProj5p(hspin._internal,fspin._internal); spProj5p(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProj5p(hspin._internal[i],fspin._internal[i]); spProj5p(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProj5p(hspin._internal[i][j],fspin._internal[i][j]); spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -846,16 +931,19 @@ template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> acce
template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spRecon5p(hspin._internal,fspin._internal); spRecon5p(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spRecon5p(hspin._internal[i],fspin._internal[i]); spRecon5p(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spRecon5p(hspin._internal[i][j],fspin._internal[i][j]); spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -864,16 +952,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr
template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumRecon5p(hspin._internal,fspin._internal); accumRecon5p(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumRecon5p(hspin._internal[i],fspin._internal[i]); accumRecon5p(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]); accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -881,18 +972,24 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM
} }
// four spinor projectors for chiral proj // four spinor projectors for chiral proj
template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) // template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProj5p(hspin._internal,fspin._internal); spProj5p(hspin._internal,fspin._internal);
} }
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin) // template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProj5p(hspin._internal[i],fspin._internal[i]); spProj5p(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) // template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProj5p(hspin._internal[i][j],fspin._internal[i][j]); spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -904,17 +1001,17 @@ template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inli
// 5m // 5m
//////// ////////
template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
spProj5m(hspin._internal,fspin._internal); spProj5m(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProj5m(hspin._internal[i],fspin._internal[i]); spProj5m(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
@ -924,34 +1021,40 @@ template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> acce
template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spRecon5m(hspin._internal,fspin._internal); spRecon5m(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spRecon5m(hspin._internal[i],fspin._internal[i]); spRecon5m(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spRecon5m(hspin._internal[i][j],fspin._internal[i][j]); spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
}} }}
} }
template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumRecon5m(hspin._internal,fspin._internal); accumRecon5m(hspin._internal,fspin._internal);
} }
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
accumRecon5m(hspin._internal[i],fspin._internal[i]); accumRecon5m(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]); accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
@ -960,18 +1063,24 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM
// four spinor projectors for chiral proj // four spinor projectors for chiral proj
template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) // template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProj5m(hspin._internal,fspin._internal); spProj5m(hspin._internal,fspin._internal);
} }
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin) // template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {
spProj5m(hspin._internal[i],fspin._internal[i]); spProj5m(hspin._internal[i],fspin._internal[i]);
} }
} }
template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) // template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{ {
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){ for(int i=0;i<N;i++){
for(int j=0;j<N;j++){ for(int j=0;j<N;j++){
spProj5m(hspin._internal[i][j],fspin._internal[i][j]); spProj5m(hspin._internal[i][j],fspin._internal[i][j]);

View File

@ -51,7 +51,7 @@ public:
private: private:
template <class mobj, class robj> template <class mobj, class robj>
static void baryon_site(const mobj &D1, static void BaryonSite(const mobj &D1,
const mobj &D2, const mobj &D2,
const mobj &D3, const mobj &D3,
const Gamma GammaA_left, const Gamma GammaA_left,
@ -61,8 +61,18 @@ public:
const int parity, const int parity,
const bool * wick_contractions, const bool * wick_contractions,
robj &result); robj &result);
template <class mobj, class robj>
static void BaryonSiteMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool * wick_contractions,
robj &result);
public: public:
static void Wick_Contractions(std::string qi, static void WickContractions(std::string qi,
std::string qf, std::string qf,
bool* wick_contractions); bool* wick_contractions);
static void ContractBaryons(const PropagatorField &q1_left, static void ContractBaryons(const PropagatorField &q1_left,
@ -75,8 +85,17 @@ public:
const bool* wick_contractions, const bool* wick_contractions,
const int parity, const int parity,
ComplexField &baryon_corr); ComplexField &baryon_corr);
static void ContractBaryonsMatrix(const PropagatorField &q1_left,
const PropagatorField &q2_left,
const PropagatorField &q3_left,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
SpinMatrixField &baryon_corr);
template <class mobj, class robj> template <class mobj, class robj>
static void ContractBaryons_Sliced(const mobj &D1, static void ContractBaryonsSliced(const mobj &D1,
const mobj &D2, const mobj &D2,
const mobj &D3, const mobj &D3,
const Gamma GammaA_left, const Gamma GammaA_left,
@ -87,9 +106,20 @@ public:
const int parity, const int parity,
const int nt, const int nt,
robj &result); robj &result);
template <class mobj, class robj>
static void ContractBaryonsSlicedMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
const int nt,
robj &result);
private: private:
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group1_Site( static void BaryonGamma3ptGroup1Site(
const mobj &Dq1_ti, const mobj &Dq1_ti,
const mobj2 &Dq2_spec, const mobj2 &Dq2_spec,
const mobj2 &Dq3_spec, const mobj2 &Dq3_spec,
@ -101,7 +131,7 @@ public:
robj &result); robj &result);
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group2_Site( static void BaryonGamma3ptGroup2Site(
const mobj2 &Dq1_spec, const mobj2 &Dq1_spec,
const mobj &Dq2_ti, const mobj &Dq2_ti,
const mobj2 &Dq3_spec, const mobj2 &Dq3_spec,
@ -113,7 +143,7 @@ public:
robj &result); robj &result);
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Baryon_Gamma_3pt_Group3_Site( static void BaryonGamma3ptGroup3Site(
const mobj2 &Dq1_spec, const mobj2 &Dq1_spec,
const mobj2 &Dq2_spec, const mobj2 &Dq2_spec,
const mobj &Dq3_ti, const mobj &Dq3_ti,
@ -125,7 +155,7 @@ public:
robj &result); robj &result);
public: public:
template <class mobj> template <class mobj>
static void Baryon_Gamma_3pt( static void BaryonGamma3pt(
const PropagatorField &q_ti, const PropagatorField &q_ti,
const mobj &Dq_spec1, const mobj &Dq_spec1,
const mobj &Dq_spec2, const mobj &Dq_spec2,
@ -138,7 +168,7 @@ public:
SpinMatrixField &stn_corr); SpinMatrixField &stn_corr);
private: private:
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
const mobj &Ds_ti, const mobj &Ds_ti,
@ -147,7 +177,7 @@ public:
const Gamma GammaB_nucl, const Gamma GammaB_nucl,
robj &result); robj &result);
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf, const mobj &Du_tf,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
@ -159,7 +189,7 @@ public:
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
const mobj &Ds_ti, const mobj &Ds_ti,
@ -168,7 +198,7 @@ public:
const Gamma GammaB_nucl, const Gamma GammaB_nucl,
robj &result); robj &result);
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf, const mobj &Du_tf,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
@ -179,7 +209,7 @@ public:
robj &result); robj &result);
public: public:
template <class mobj> template <class mobj>
static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, static void SigmaToNucleonEye(const PropagatorField &qq_loop,
const mobj &Du_spec, const mobj &Du_spec,
const PropagatorField &qd_tf, const PropagatorField &qd_tf,
const PropagatorField &qs_ti, const PropagatorField &qs_ti,
@ -189,7 +219,7 @@ public:
const std::string op, const std::string op,
SpinMatrixField &stn_corr); SpinMatrixField &stn_corr);
template <class mobj> template <class mobj>
static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, static void SigmaToNucleonNonEye(const PropagatorField &qq_ti,
const PropagatorField &qq_tf, const PropagatorField &qq_tf,
const mobj &Du_spec, const mobj &Du_spec,
const PropagatorField &qd_tf, const PropagatorField &qd_tf,
@ -217,7 +247,7 @@ const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
//This is the old version //This is the old version
template <class FImpl> template <class FImpl>
template <class mobj, class robj> template <class mobj, class robj>
void BaryonUtils<FImpl>::baryon_site(const mobj &D1, void BaryonUtils<FImpl>::BaryonSite(const mobj &D1,
const mobj &D2, const mobj &D2,
const mobj &D3, const mobj &D3,
const Gamma GammaA_i, const Gamma GammaA_i,
@ -329,12 +359,132 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
}} }}
} }
//New version without parity projection or trace
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::BaryonSiteMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_i,
const Gamma GammaB_i,
const Gamma GammaA_f,
const Gamma GammaB_f,
const bool * wick_contraction,
robj &result)
{
auto D1_GAi = D1 * GammaA_i;
auto GAf_D1_GAi = GammaA_f * D1_GAi;
auto GBf_D1_GAi = GammaB_f * D1_GAi;
auto D2_GBi = D2 * GammaB_i;
auto GBf_D2_GBi = GammaB_f * D2_GBi;
auto GAf_D2_GBi = GammaA_f * D2_GBi;
auto GBf_D3 = GammaB_f * D3;
auto GAf_D3 = GammaA_f * D3;
for (int ie_f=0; ie_f < 6 ; ie_f++){
int a_f = epsilon[ie_f][0]; //a
int b_f = epsilon[ie_f][1]; //b
int c_f = epsilon[ie_f][2]; //c
for (int ie_i=0; ie_i < 6 ; ie_i++){
int a_i = epsilon[ie_i][0]; //a'
int b_i = epsilon[ie_i][1]; //b'
int c_i = epsilon[ie_i][2]; //c'
Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
//This is the \delta_{456}^{123} part
if (wick_contraction[0]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int rho_f=0; rho_f<Ns; rho_f++){
auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
result()(rho_f,rho_i)() += ee * GAf_D1_GAi_rr_cc
* D2_GBi ()(alpha_f,beta_i)(a_f,a_i)
* GBf_D3 ()(alpha_f,beta_i)(b_f,b_i);
}}
}}
}
//This is the \delta_{456}^{231} part
if (wick_contraction[1]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto GBf_D2_GBi_ab_ba = GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() += ee * D1_GAi_ar_ac
* GBf_D2_GBi_ab_ba
* GAf_D3 ()(rho_f,beta_i)(c_f,b_i);
}}
}}
}
//This is the \delta_{456}^{312} part
if (wick_contraction[2]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto D3_ab_ab = D3 ()(alpha_f,beta_i)(a_f,b_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() += ee * GBf_D1_GAi_ar_bc
* GAf_D2_GBi ()(rho_f,beta_i)(c_f,a_i)
* D3_ab_ab;
}}
}}
}
//This is the \delta_{456}^{132} part
if (wick_contraction[3]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int rho_f=0; rho_f<Ns; rho_f++){
auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
for (int beta_i=0; beta_i<Ns; beta_i++){
result()(rho_f,rho_i)() -= ee * GAf_D1_GAi_rr_cc
* GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i)
* D3 ()(alpha_f,beta_i)(a_f,b_i);
}}
}}
}
//This is the \delta_{456}^{321} part
if (wick_contraction[4]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto D2_GBi_ab_aa = D2_GBi()(alpha_f,beta_i)(a_f,a_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() -= ee * GBf_D1_GAi_ar_bc
* D2_GBi_ab_aa
* GAf_D3 ()(rho_f,beta_i)(c_f,b_i);
}}
}}
}
//This is the \delta_{456}^{213} part
if (wick_contraction[5]){
for (int rho_i=0; rho_i<Ns; rho_i++){
for (int alpha_f=0; alpha_f<Ns; alpha_f++){
auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
for (int beta_i=0; beta_i<Ns; beta_i++){
auto GBf_D3_ab_bb = GBf_D3()(alpha_f,beta_i)(b_f,b_i);
for (int rho_f=0; rho_f<Ns; rho_f++){
result()(rho_f,rho_i)() -= ee * D1_GAi_ar_ac
* GAf_D2_GBi ()(rho_f,beta_i)(c_f,a_i)
* GBf_D3_ab_bb;
}}
}}
}
}}
}
/* Computes which wick contractions should be performed for a * /* Computes which wick contractions should be performed for a *
* baryon 2pt function given the initial and finals state quark * * baryon 2pt function given the initial and finals state quark *
* flavours. * * flavours. *
* The array wick_contractions must be of length 6 */ * The array wick_contractions must be of length 6 */
template<class FImpl> template<class FImpl>
void BaryonUtils<FImpl>::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, bool* wick_contractions) {
const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
for (int ie=0; ie < 6 ; ie++) { for (int ie=0; ie < 6 ; ie++) {
wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3
@ -364,11 +514,6 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
@ -397,13 +542,62 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
auto D2 = v2[ss]; auto D2 = v2[ss];
auto D3 = v3[ss]; auto D3 = v3[ss];
vobj result=Zero(); vobj result=Zero();
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
vbaryon_corr[ss] = result; vbaryon_corr[ss] = result;
} );//end loop over lattice sites } );//end loop over lattice sites
t += usecond(); t += usecond();
std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
}
template<class FImpl>
void BaryonUtils<FImpl>::ContractBaryonsMatrix(const PropagatorField &q1_left,
const PropagatorField &q2_left,
const PropagatorField &q3_left,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
SpinMatrixField &baryon_corr)
{
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
GridBase *grid = q1_left.Grid();
autoView(vbaryon_corr, baryon_corr,CpuWrite);
autoView( v1 , q1_left, CpuRead);
autoView( v2 , q2_left, CpuRead);
autoView( v3 , q3_left, CpuRead);
// Real bytes =0.;
// bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
// for (int ie=0; ie < 6 ; ie++){
// if(ie==0 or ie==3){
// bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
// }
// else{
// bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
// }
// }
// Real t=0.;
// t =-usecond();
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto D1 = v1[ss];
auto D2 = v2[ss];
auto D3 = v3[ss];
sobj result=Zero();
BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result);
vbaryon_corr[ss] = result;
} );//end loop over lattice sites
// t += usecond();
// std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
} }
@ -414,7 +608,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
* Wick_Contractions function above */ * Wick_Contractions function above */
template <class FImpl> template <class FImpl>
template <class mobj, class robj> template <class mobj, class robj>
void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1, void BaryonUtils<FImpl>::ContractBaryonsSliced(const mobj &D1,
const mobj &D2, const mobj &D2,
const mobj &D3, const mobj &D3,
const Gamma GammaA_left, const Gamma GammaA_left,
@ -429,16 +623,33 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl;
std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl;
assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
for (int t=0; t<nt; t++) { for (int t=0; t<nt; t++) {
baryon_site(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]); BaryonSite(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
}
}
template <class FImpl>
template <class mobj, class robj>
void BaryonUtils<FImpl>::ContractBaryonsSlicedMatrix(const mobj &D1,
const mobj &D2,
const mobj &D3,
const Gamma GammaA_left,
const Gamma GammaB_left,
const Gamma GammaA_right,
const Gamma GammaB_right,
const bool* wick_contractions,
const int nt,
robj &result)
{
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
for (int t=0; t<nt; t++) {
BaryonSiteMatrix(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result[t]);
} }
} }
@ -454,7 +665,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
* Dq4_tf is a quark line from t_f to t_J */ * Dq4_tf is a quark line from t_f to t_J */
template<class FImpl> template<class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site( void BaryonUtils<FImpl>::BaryonGamma3ptGroup1Site(
const mobj &Dq1_ti, const mobj &Dq1_ti,
const mobj2 &Dq2_spec, const mobj2 &Dq2_spec,
const mobj2 &Dq3_spec, const mobj2 &Dq3_spec,
@ -546,7 +757,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
* Dq4_tf is a quark line from t_f to t_J */ * Dq4_tf is a quark line from t_f to t_J */
template<class FImpl> template<class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site( void BaryonUtils<FImpl>::BaryonGamma3ptGroup2Site(
const mobj2 &Dq1_spec, const mobj2 &Dq1_spec,
const mobj &Dq2_ti, const mobj &Dq2_ti,
const mobj2 &Dq3_spec, const mobj2 &Dq3_spec,
@ -636,7 +847,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
* Dq4_tf is a quark line from t_f to t_J */ * Dq4_tf is a quark line from t_f to t_J */
template<class FImpl> template<class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site( void BaryonUtils<FImpl>::BaryonGamma3ptGroup3Site(
const mobj2 &Dq1_spec, const mobj2 &Dq1_spec,
const mobj2 &Dq2_spec, const mobj2 &Dq2_spec,
const mobj &Dq3_ti, const mobj &Dq3_ti,
@ -728,7 +939,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
* https://aportelli.github.io/Hadrons-doc/#/mcontraction */ * https://aportelli.github.io/Hadrons-doc/#/mcontraction */
template<class FImpl> template<class FImpl>
template <class mobj> template <class mobj>
void BaryonUtils<FImpl>::Baryon_Gamma_3pt( void BaryonUtils<FImpl>::BaryonGamma3pt(
const PropagatorField &q_ti, const PropagatorField &q_ti,
const mobj &Dq_spec1, const mobj &Dq_spec1,
const mobj &Dq_spec2, const mobj &Dq_spec2,
@ -751,7 +962,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
auto Dq_ti = vq_ti[ss]; auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss]; auto Dq_tf = vq_tf[ss];
sobj result=Zero(); sobj result=Zero();
Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result; vcorr[ss] += result;
});//end loop over lattice sites });//end loop over lattice sites
} else if (group == 2) { } else if (group == 2) {
@ -759,7 +970,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
auto Dq_ti = vq_ti[ss]; auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss]; auto Dq_tf = vq_tf[ss];
sobj result=Zero(); sobj result=Zero();
Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result; vcorr[ss] += result;
});//end loop over lattice sites });//end loop over lattice sites
} else if (group == 3) { } else if (group == 3) {
@ -767,7 +978,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
auto Dq_ti = vq_ti[ss]; auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss]; auto Dq_tf = vq_tf[ss];
sobj result=Zero(); sobj result=Zero();
Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
vcorr[ss] += result; vcorr[ss] += result;
});//end loop over lattice sites });//end loop over lattice sites
@ -787,7 +998,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
* Ds_ti is a quark line from t_i to t_H */ * Ds_ti is a quark line from t_i to t_H */
template <class FImpl> template <class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, void BaryonUtils<FImpl>::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
const mobj &Ds_ti, const mobj &Ds_ti,
@ -838,7 +1049,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
* Ds_ti is a quark line from t_i to t_H */ * Ds_ti is a quark line from t_i to t_H */
template <class FImpl> template <class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, void BaryonUtils<FImpl>::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf, const mobj &Du_tf,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
@ -897,7 +1108,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
* Ds_ti is a quark line from t_i to t_H */ * Ds_ti is a quark line from t_i to t_H */
template <class FImpl> template <class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, void BaryonUtils<FImpl>::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
const mobj &Ds_ti, const mobj &Ds_ti,
@ -948,7 +1159,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
* Ds_ti is a quark line from t_i to t_H */ * Ds_ti is a quark line from t_i to t_H */
template <class FImpl> template <class FImpl>
template <class mobj, class mobj2, class robj> template <class mobj, class mobj2, class robj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, void BaryonUtils<FImpl>::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
const mobj &Du_tf, const mobj &Du_tf,
const mobj2 &Du_spec, const mobj2 &Du_spec,
const mobj &Dd_tf, const mobj &Dd_tf,
@ -1002,7 +1213,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
template<class FImpl> template<class FImpl>
template <class mobj> template <class mobj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
const mobj &Du_spec, const mobj &Du_spec,
const PropagatorField &qd_tf, const PropagatorField &qd_tf,
const PropagatorField &qs_ti, const PropagatorField &qs_ti,
@ -1029,9 +1240,9 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
auto Ds_ti = vs_ti[ss]; auto Ds_ti = vs_ti[ss];
sobj result=Zero(); sobj result=Zero();
if(op == "Q1"){ if(op == "Q1"){
Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else if(op == "Q2"){ } else if(op == "Q2"){
Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else { } else {
assert(0 && "Weak Operator not correctly specified"); assert(0 && "Weak Operator not correctly specified");
} }
@ -1041,7 +1252,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
template<class FImpl> template<class FImpl>
template <class mobj> template <class mobj>
void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
const PropagatorField &qq_tf, const PropagatorField &qq_tf,
const mobj &Du_spec, const mobj &Du_spec,
const PropagatorField &qd_tf, const PropagatorField &qd_tf,
@ -1071,9 +1282,9 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
auto Ds_ti = vs_ti[ss]; auto Ds_ti = vs_ti[ss];
sobj result=Zero(); sobj result=Zero();
if(op == "Q1"){ if(op == "Q1"){
Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else if(op == "Q2"){ } else if(op == "Q2"){
Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
} else { } else {
assert(0 && "Weak Operator not correctly specified"); assert(0 && "Weak Operator not correctly specified");
} }

View File

@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
decltype(coalescedRead(y_v[ss+sp])) tmp; decltype(coalescedRead(y_v[ss+sp])) tmp;
spProj5m(tmp,y_v(ss+sp)); spProj5m(tmp,y_v(ss+sp));
tmp = a*x_v(ss+s)+b*tmp; tmp = a*x_v(ss+s)+b*tmp;
coalescedWrite(z_v[ss+s],tmp); coalescedWrite(z_v[ss+s],tmp);
}); });
} }
@ -188,6 +188,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
z.Checkerboard() = x.Checkerboard(); z.Checkerboard() = x.Checkerboard();
conformable(x,z); conformable(x,z);
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5);
autoView( x_v, x, AcceleratorRead); autoView( x_v, x, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
@ -195,13 +196,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sp = Ls-1-s; int sp = Ls-1-s;
auto tmp = x_v(ss+s); coalescedWrite(z_v[ss+sp],G5*x_v(ss+s));
decltype(tmp) tmp_p;
decltype(tmp) tmp_m;
spProj5p(tmp_p,tmp);
spProj5m(tmp_m,tmp);
// Use of spProj5m, 5p captures the coarse space too
coalescedWrite(z_v[ss+sp],tmp_p - tmp_m);
} }
}); });
} }
@ -213,20 +208,10 @@ void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
z.Checkerboard() = x.Checkerboard(); z.Checkerboard() = x.Checkerboard();
conformable(x, z); conformable(x, z);
autoView( x_v, x, AcceleratorRead); Gamma G5(Gamma::Algebra::Gamma5);
autoView( z_v, z, AcceleratorWrite); z = G5 * x;
uint64_t nloop = grid->oSites();
accelerator_for(ss,nloop,vobj::Nsimd(),{
auto tmp = x_v(ss);
decltype(tmp) tmp_p;
decltype(tmp) tmp_m;
spProj5p(tmp_p,tmp);
spProj5m(tmp_m,tmp);
coalescedWrite(z_v[ss],tmp_p - tmp_m);
});
} }
/*
template<class CComplex, int nbasis> template<class CComplex, int nbasis>
void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x) void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
{ {
@ -249,7 +234,6 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
} }
}); });
} }
*/
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -449,7 +449,8 @@ public:
LatticeReal alpha(grid); LatticeReal alpha(grid);
// std::cout<<GridLogMessage<<"xi "<<xi <<std::endl; // std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
alpha = toReal(2.0 * xi); xi = 2.0 *xi;
alpha = toReal(xi);
do { do {
// A. Generate two uniformly distributed pseudo-random numbers R and R', // A. Generate two uniformly distributed pseudo-random numbers R and R',

View File

@ -26,7 +26,7 @@
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/Grid.h> #include <Grid/Grid.h>
#ifndef __NVCC__ #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);

View File

@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
typedef struct { uint16_t x;} half;
#endif
typedef struct Half2_t { half x; half y; } Half2;
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
template<class pair> template<class pair>
@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
} }
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half); constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2); constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float); constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2); constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double); constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2); constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer); constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
typedef GpuComplex<half2 > GpuComplexH; typedef GpuComplex<Half2 > GpuComplexH;
typedef GpuComplex<float2 > GpuComplexF; typedef GpuComplex<float2 > GpuComplexF;
typedef GpuComplex<double2> GpuComplexD; typedef GpuComplex<double2> GpuComplexD;
@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h) accelerator_inline float half2float(half h)
{ {
float f; float f;
#ifdef GRID_SIMT #if defined(GRID_CUDA) || defined(GRID_HIP)
f = __half2float(h); f = __half2float(h);
#else #else
//f = __half2float(h);
__half_raw hr(h);
Grid_half hh; Grid_half hh;
hh.x = hr.x; hh.x = hr.x;
f= sfw_half_to_float(hh); f= sfw_half_to_float(hh);
@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
accelerator_inline half float2half(float f) accelerator_inline half float2half(float f)
{ {
half h; half h;
#ifdef GRID_SIMT #if defined(GRID_CUDA) || defined(GRID_HIP)
h = __float2half(f); h = __float2half(f);
#else #else
Grid_half hh = sfw_float_to_half(f); Grid_half hh = sfw_float_to_half(f);
__half_raw hr; h.x = hh.x;
hr.x = hh.x;
h = __half(hr);
#endif #endif
return h; return h;
} }
@ -523,7 +524,7 @@ namespace Optimization {
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Single / Half // Single / Half
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
int N = GpuVectorCF::N; int N = GpuVectorCF::N;
GpuVectorCH h; GpuVectorCH h;
for(int i=0;i<N;i++) { for(int i=0;i<N;i++) {

View File

@ -125,14 +125,6 @@ accelerator_inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
return SimdApply(SqrtRealFunctor<S>(), r); return SimdApply(SqrtRealFunctor<S>(), r);
} }
template <class S, class V> template <class S, class V>
accelerator_inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
return SimdApply(RSqrtRealFunctor<S>(), r);
}
template <class Scalar>
accelerator_inline Scalar rsqrt(const Scalar &r) {
return (RSqrtRealFunctor<Scalar>(), r);
}
template <class S, class V>
accelerator_inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) { accelerator_inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
return SimdApply(CosRealFunctor<S>(), r); return SimdApply(CosRealFunctor<S>(), r);
} }

View File

@ -92,17 +92,22 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
{ {
// need a check for the group type? // need a check for the group type?
iMatrix<vtype,N> ret(arg); iMatrix<vtype,N> ret(arg);
vtype rnrm;
vtype nrm; vtype nrm;
vtype inner; vtype inner;
for(int c1=0;c1<N;c1++){ for(int c1=0;c1<N;c1++){
// Normalises row c1
zeroit(inner); zeroit(inner);
for(int c2=0;c2<N;c2++) for(int c2=0;c2<N;c2++)
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]); inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
nrm = rsqrt(inner); nrm = sqrt(inner);
nrm = 1.0/nrm;
for(int c2=0;c2<N;c2++) for(int c2=0;c2<N;c2++)
ret._internal[c1][c2]*= nrm; ret._internal[c1][c2]*= nrm;
// Remove c1 from rows c1+1...N-1
for (int b=c1+1; b<N; ++b){ for (int b=c1+1; b<N; ++b){
decltype(ret._internal[b][b]*ret._internal[b][b]) pr; decltype(ret._internal[b][b]*ret._internal[b][b]) pr;
zeroit(pr); zeroit(pr);

View File

@ -84,7 +84,6 @@ NAMESPACE_BEGIN(Grid);
} }
UNARY(sqrt); UNARY(sqrt);
UNARY(rsqrt);
UNARY(sin); UNARY(sin);
UNARY(cos); UNARY(cos);
UNARY(asin); UNARY(asin);

View File

@ -48,13 +48,14 @@ void acceleratorInit(void)
prop = gpu_props[i]; prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem; totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) { if ( world_rank == 0) {
#ifndef GRID_IBM_SUMMIT #ifndef GRID_DEFAULT_GPU
if ( i==rank ) { if ( i==rank ) {
printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: ========================\n",rank);
printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i);
printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: ========================\n",rank);
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld"); GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory); GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
@ -72,11 +73,17 @@ void acceleratorInit(void)
#undef GPU_PROP_FMT #undef GPU_PROP_FMT
#undef GPU_PROP #undef GPU_PROP
#ifdef GRID_IBM_SUMMIT #ifdef GRID_DEFAULT_GPU
// IBM Jsrun makes cuda Device numbering screwy and not match rank // IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n"); if ( world_rank == 0 ) {
printf("AcceleratorCudaInit: using default device \n");
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n");
}
#else #else
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n");
cudaSetDevice(rank); cudaSetDevice(rank);
#endif #endif
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
@ -109,20 +116,24 @@ void acceleratorInit(void)
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
printf("world_rank %d has %d devices\n",world_rank,nDevices);
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) { for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i); hipGetDeviceProperties(&gpu_props[i], i);
hipDeviceProp_t prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) { if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device Number : %d\n", i); printf("AcceleratorHipInit: Device Number : %d\n", i);
printf("AcceleratorHipInit: ========================\n"); printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lu");
// GPU_PROP(managedMemory); // GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize); GPU_PROP(warpSize);
@ -131,13 +142,21 @@ void acceleratorInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio); // GPU_PROP(singleToDoublePrecisionPerfRatio);
} }
} }
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT #undef GPU_PROP_FMT
#undef GPU_PROP #undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank #ifdef GRID_DEFAULT_GPU
if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); if ( world_rank == 0 ) {
printf("AcceleratorHipInit: using default device \n");
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
}
#else #else
if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n"); if ( world_rank == 0 ) {
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
}
hipSetDevice(rank); hipSetDevice(rank);
#endif #endif
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");

View File

@ -151,9 +151,6 @@ inline void *acceleratorAllocShared(size_t bytes)
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
} }
// size_t free,total;
// cudaMemGetInfo(&free,&total);
// std::cout << "Malloc managed "<<bytes<<" "<<free<<"/"<<total<<std::endl;
return ptr; return ptr;
}; };
inline void *acceleratorAllocDevice(size_t bytes) inline void *acceleratorAllocDevice(size_t bytes)
@ -164,23 +161,23 @@ inline void *acceleratorAllocDevice(size_t bytes)
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
} }
// size_t free,total;
// cudaMemGetInfo(&free,&total);
// std::cout << "Malloc device "<<bytes<<" "<<free<<"/"<<total<<std::endl;
return ptr; return ptr;
}; };
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline int acceleratorIsCommunicable(void *ptr) inline int acceleratorIsCommunicable(void *ptr)
{ {
int uvm; // int uvm=0;
auto // auto
cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); // cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
assert(cuerr == cudaSuccess ); // assert(cuerr == cudaSuccess );
if(uvm) return 0; // if(uvm) return 0;
else return 1; // else return 1;
return 1;
} }
#endif #endif
@ -235,8 +232,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr) inline int acceleratorIsCommunicable(void *ptr)
{ {
#if 0 #if 0
@ -313,17 +312,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocShared(size_t bytes) inline void *acceleratorAllocShared(size_t bytes)
{ {
#if 0
void *ptr=NULL; void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes); auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) { if( err != hipSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err)); printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
} }
return ptr; return ptr;
#else
return malloc(bytes);
#endif
}; };
inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -333,15 +328,17 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = hipMalloc((void **)&ptr,bytes); auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) { if( err != hipSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err)); printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
} }
return ptr; return ptr;
}; };
inline void acceleratorFreeShared(void *ptr){ free(ptr);}; inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
#endif #endif
@ -379,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
@ -403,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
inline void acceleratorFreeCpu (void *ptr){free(ptr);}; inline void acceleratorFreeCpu (void *ptr){free(ptr);};
#endif #endif
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch // Synchronise across local threads for divergence resynch
/////////////////////////////////////////////////// ///////////////////////////////////////////////////

View File

@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv)
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;
} }
CartesianCommunicator::nCommThreads = 1; CartesianCommunicator::nCommThreads = 1;
#ifdef GRID_COMMS_THREADS
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
assert(CartesianCommunicator::nCommThreads > 0); assert(CartesianCommunicator::nCommThreads > 0);
} }
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block); GridCmdOptionIntVector(arg,LebesgueOrder::Block);

33
README
View File

@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
``` bash ``` bash
mkdir build; cd build mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> ../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
``` ```
where `--enable-precision=` set the default precision, where `--enable-simd=` set the SIMD type, `--enable-
`--enable-simd=` set the SIMD type, `--enable-
comms=`, and `<path>` should be replaced by the prefix path where you want to comms=`, and `<path>` should be replaced by the prefix path where you want to
install Grid. Other options are detailed in the next section, you can also use `configure install Grid. Other options are detailed in the next section, you can also use `configure
--help` to display them. Like with any other program using GNU autotool, the --help` to display them. Like with any other program using GNU autotool, the
@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-numa`: enable NUMA first touch optimisation - `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`). - `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers. - `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests. - `--enable-chroma`: enable Chroma regression tests.
@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used:
The following configuration is recommended for the Intel Knights Landing platform: The following configuration is recommended for the Intel Knights Landing platform:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but optimum is local volume dependent. We recommend four ranks per node for best performance, but optimum is local volume dependent.
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CC=icpc MPICXX=mpiicpc CC=icpc MPICXX=mpiicpc
@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
The following configuration is recommended for the Intel Haswell platform: The following configuration is recommended for the Intel Haswell platform:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -262,8 +257,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -280,8 +274,7 @@ This is the default.
The following configuration is recommended for the Intel Skylake platform: The following configuration is recommended for the Intel Skylake platform:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX512 \
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=mpiicpc CXX=mpiicpc
@ -298,8 +291,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX512 \
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -330,8 +322,7 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform. The following configuration is recommended for the AMD EPYC platform.
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
CXX=mpicxx CXX=mpicxx
``` ```

View File

@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
``` bash ``` bash
mkdir build; cd build mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> ../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
``` ```
where `--enable-precision=` set the default precision, where `--enable-simd=` set the SIMD type, `--enable-
`--enable-simd=` set the SIMD type, `--enable-
comms=`, and `<path>` should be replaced by the prefix path where you want to comms=`, and `<path>` should be replaced by the prefix path where you want to
install Grid. Other options are detailed in the next section, you can also use `configure install Grid. Other options are detailed in the next section, you can also use `configure
--help` to display them. Like with any other program using GNU autotool, the --help` to display them. Like with any other program using GNU autotool, the
@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-numa`: enable NUMA first touch optimisation - `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`). - `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers. - `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests. - `--enable-chroma`: enable Chroma regression tests.
@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used:
The following configuration is recommended for the Intel Knights Landing platform: The following configuration is recommended for the Intel Knights Landing platform:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but optimum is local volume dependent. We recommend four ranks per node for best performance, but optimum is local volume dependent.
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CC=icpc MPICXX=mpiicpc CC=icpc MPICXX=mpiicpc
@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
The following configuration is recommended for the Intel Haswell platform: The following configuration is recommended for the Intel Haswell platform:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi3-auto \ --enable-comms=mpi3-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -266,8 +261,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -284,8 +278,7 @@ This is the default.
The following configuration is recommended for the Intel Skylake platform: The following configuration is recommended for the Intel Skylake platform:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX512 \
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=mpiicpc CXX=mpiicpc
@ -302,8 +295,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX512 \
--enable-simd=AVX512 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -334,8 +326,7 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform. The following configuration is recommended for the AMD EPYC platform.
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi3 \ --enable-comms=mpi3 \
CXX=mpicxx CXX=mpicxx
``` ```

View File

@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64
scl enable gcc-toolset-10 bash scl enable gcc-toolset-10 bash
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
* gcc 10.1 prebuild w/ MPI, QPACE4 interactive login * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login
scl enable gcc-toolset-10 bash scl enable gcc-toolset-10 bash
module load mpi/openmpi-aarch64 module load mpi/openmpi-aarch64
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
* armclang 20.2 (qp4) * armclang 20.2 (qp4)
../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
* gcc 10.0.1 VLA (merlin) * gcc 10.0.1 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
* gcc 10.0.1 fixed-size ACLE (merlin) * gcc 10.0.1 fixed-size ACLE (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
* gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI
@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1
export MPICH_CC=gcc-10.0.1 export MPICH_CC=gcc-10.0.1
export MPICH_CXX=g++-10.0.1 export MPICH_CXX=g++-10.0.1
$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" $ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"
-------------------------------------------------------- --------------------------------------------------------
* armclang 20.0 VLA (merlin) * armclang 20.0 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
TODO check ARMCLANGCOMPAT TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA (merlin) * armclang 20.1 VLA (merlin)
../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
TODO check ARMCLANGCOMPAT TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA (fjt cluster) * armclang 20.1 VLA (fjt cluster)
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" ../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
TODO check ARMCLANGCOMPAT TODO check ARMCLANGCOMPAT
* armclang 20.1 VLA w/MPI (fjt cluster) * armclang 20.1 VLA w/MPI (fjt cluster)
../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" ../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"
No ARMCLANGCOMPAT -> still correct ? No ARMCLANGCOMPAT -> still correct ?
@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ?
* Fujitsu fcc * Fujitsu fcc
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"
* Fujitsu fcc w/ MPI * Fujitsu fcc w/ MPI
../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"

View File

@ -1,8 +1,16 @@
#include "Benchmark_IO.hpp" #include "Benchmark_IO.hpp"
#ifndef BENCH_IO_LMIN
#define BENCH_IO_LMIN 8
#endif
#ifndef BENCH_IO_LMAX #ifndef BENCH_IO_LMAX
#define BENCH_IO_LMAX 40 #define BENCH_IO_LMAX 32
#endif
#ifndef BENCH_IO_NPASS
#define BENCH_IO_NPASS 10
#endif #endif
using namespace Grid; using namespace Grid;
@ -12,37 +20,179 @@ std::string filestem(const int l)
return "iobench_l" + std::to_string(l); return "iobench_l" + std::to_string(l);
} }
int vol(const int i)
{
return BENCH_IO_LMIN + 2*i;
}
int volInd(const int l)
{
return (l - BENCH_IO_LMIN)/2;
}
template <typename Mat>
void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
{
auto nr = data[0].rows(), nc = data[0].cols();
Eigen::MatrixXd sqSum(nr, nc);
double n = static_cast<double>(data.size());
assert(n > 1.);
mean = Mat::Zero(nr, nc);
sqSum = Mat::Zero(nr, nc);
for (auto &d: data)
{
mean += d;
sqSum += d.cwiseProduct(d);
}
stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt();
mean /= n;
}
#define grid_printf(...) \
{\
char _buf[1024];\
sprintf(_buf, __VA_ARGS__);\
MSG << _buf;\
}
enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
#ifdef HAVE_LIME
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads(); int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1;
unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1;
std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4));
std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
std::vector<int> latt;
MSG << "Grid is setup to use " << threads << " threads" << std::endl; MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << SEP << std::endl; MSG << "MPI partition " << mpi << std::endl;
MSG << "Benchmark Lime write" << std::endl; for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{ {
auto mpi = GridDefaultMpi(); MSG << BIGSEP << std::endl;
std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
MSG << BIGSEP << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
std::cout << "-- Local volume " << l << "^4" << std::endl; MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>); writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
}
#endif
avPerf[i].fill(0.);
for (int f = 0; f < 4; ++f)
for (int l = 24; l <= BENCH_IO_LMAX; l += 2)
{
avPerf[i](f) += perf[i](volInd(l), f);
}
avPerf[i] /= nRelVol;
} }
MSG << "Benchmark Lime read" << std::endl; Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
MSG << SEP << std::endl; Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
for (int l = 4; l <= BENCH_IO_LMAX; l += 2) double n = BENCH_IO_NPASS;
{
auto mpi = GridDefaultMpi();
std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
std::cout << "-- Local volume " << l << "^4" << std::endl; stats(mean, stdDev, perf);
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>); stats(avMean, avStdDev, avPerf);
rob.fill(100.);
rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs());
avRob.fill(100.);
avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs());
MSG << BIGSEP << std::endl;
MSG << "SUMMARY" << std::endl;
MSG << BIGSEP << std::endl;
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
"L", "std read", "std dev", "std write", "std dev",
"Grid read", "std dev", "Grid write", "std dev");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
mean(volInd(l), gRead), stdDev(volInd(l), gRead),
mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
} }
MSG << std::endl;
MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n",
"L", "std read", "std write", "Grid read", "Grid write");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
rob(volInd(l), gRead), rob(volInd(l), gWrite));
}
MSG << std::endl;
MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
"std read", "std dev", "std write", "std dev",
"Grid read", "std dev", "Grid write", "std dev");
grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
MSG << std::endl;
MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n",
"std read", "std write", "Grid read", "Grid write");
grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
Grid_finalize(); Grid_finalize();
#endif
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -5,6 +5,8 @@
#ifdef HAVE_LIME #ifdef HAVE_LIME
#define MSG std::cout << GridLogMessage #define MSG std::cout << GridLogMessage
#define SEP \ #define SEP \
"-----------------------------------------------------------------------------"
#define BIGSEP \
"=============================================================================" "============================================================================="
namespace Grid { namespace Grid {
@ -14,13 +16,152 @@ using WriterFn = std::function<void(const std::string, Field &)> ;
template <typename Field> template <typename Field>
using ReaderFn = std::function<void(Field &, const std::string)>; using ReaderFn = std::function<void(Field &, const std::string)>;
// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
//
// template <typename Field>
// void stdWrite(const std::string filestem, Field &vec)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
// size_t size;
// uint32_t crc;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
//
// template <typename Field>
// void stdRead(Field &vec, const std::string filestem)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
// size_t size;
// uint32_t crcRead, crcData;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// crcWatch.Start();
// std::fread(&crcRead, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// {
// autoView(vec_v, vec, CpuWrite);
// ioWatch.Start();
// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// }
// {
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
// crcWatch.Stop();
// }
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
// p.size = size;
// p.time = ioWatch.useconds();
// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
// << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
template <typename Field>
void stdWrite(const std::string filestem, Field &vec)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
size_t size, sizec;
uint32_t crc;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
ioWatch.Start();
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
file.flush();
ioWatch.Stop();
size *= vec.Grid()->ProcessorCount();
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field>
void stdRead(Field &vec, const std::string filestem)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
size_t size, sizec;
uint32_t crcRead, crcData;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
crcWatch.Start();
file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
{
autoView(vec_v, vec, CpuWrite);
ioWatch.Start();
file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
ioWatch.Stop();
}
{
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
crcWatch.Stop();
}
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
auto &p = BinaryIO::lastPerf;
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
<< ", " << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field> template <typename Field>
void limeWrite(const std::string filestem, Field &vec) void limeWrite(const std::string filestem, Field &vec)
{ {
emptyUserRecord record; emptyUserRecord record;
ScidacWriter binWriter(vec.Grid()->IsBoss()); ScidacWriter binWriter(vec.Grid()->IsBoss());
binWriter.open(filestem + ".bin"); binWriter.open(filestem + ".lime.bin");
binWriter.writeScidacFieldRecord(vec, record); binWriter.writeScidacFieldRecord(vec, record);
binWriter.close(); binWriter.close();
} }
@ -31,7 +172,7 @@ void limeRead(Field &vec, const std::string filestem)
emptyUserRecord record; emptyUserRecord record;
ScidacReader binReader; ScidacReader binReader;
binReader.open(filestem + ".bin"); binReader.open(filestem + ".lime.bin");
binReader.readScidacFieldRecord(vec, record); binReader.readScidacFieldRecord(vec, record);
binReader.close(); binReader.close();
} }
@ -73,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt; std::shared_ptr<GridBase> gPt;
std::random_device rd;
makeGrid(gPt, gBasePt, Ls, rb); makeGrid(gPt, gBasePt, Ls, rb);
GridBase *g = gPt.get(); GridBase *g = gPt.get();
GridParallelRNG rng(g); GridParallelRNG rng(g);
Field vec(g); Field vec(g);
rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd()),
static_cast<int>(rd()), static_cast<int>(rd())});
random(rng, vec); random(rng, vec);
write(filename, vec); write(filename, vec);
@ -96,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename,
makeGrid(gPt, gBasePt, Ls, rb); makeGrid(gPt, gBasePt, Ls, rb);
GridBase *g = gPt.get(); GridBase *g = gPt.get();
Field vec(g); Field vec(g);
read(vec, filename); read(vec, filename);
} }

View File

@ -1,14 +1,9 @@
#include "Benchmark_IO.hpp" #include "Benchmark_IO.hpp"
#define MSG std::cout << GridLogMessage
#define SEP \
"============================================================================="
using namespace Grid; using namespace Grid;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
#ifdef HAVE_LIME
std::vector<std::string> dir; std::vector<std::string> dir;
unsigned int Ls; unsigned int Ls;
bool rb; bool rb;
@ -34,46 +29,71 @@ int main (int argc, char ** argv)
} }
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads(); int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
MSG << "Grid is setup to use " << threads << " threads" << std::endl; MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << SEP << std::endl; MSG << "MPI partition " << mpi << std::endl;
MSG << "Benchmark double precision Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl; MSG << SEP << std::endl;
MSG << "Benchmark double precision Lime read" << std::endl; MSG << "Benchmark Grid std write" << std::endl;
MSG << SEP << std::endl; MSG << SEP << std::endl;
for (auto &d: dir) for (auto &d: dir)
{ {
MSG << "-- Directory " << d << std::endl; MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb); writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
stdWrite<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid std read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
stdRead<LatticeFermion>, Ls, rb);
} }
#ifdef HAVE_LIME
MSG << SEP << std::endl; MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime write" << std::endl; MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl; MSG << SEP << std::endl;
for (auto &d: dir) for (auto &d: dir)
{ {
MSG << "-- Directory " << d << std::endl; MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb); writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
limeWrite<LatticeFermion>, Ls, rb);
} }
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
limeRead<LatticeFermion>, Ls, rb);
}
#endif
MSG << SEP << std::endl; // MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime read" << std::endl; // MSG << "Benchmark single precision Lime write" << std::endl;
MSG << SEP << std::endl; // MSG << SEP << std::endl;
for (auto &d: dir) // for (auto &d: dir)
{ // {
MSG << "-- Directory " << d << std::endl; // MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb); // writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
} // }
// MSG << SEP << std::endl;
// MSG << "Benchmark single precision Lime read" << std::endl;
// MSG << SEP << std::endl;
// for (auto &d: dir)
// {
// MSG << "-- Directory " << d << std::endl;
// readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
// }
Grid_finalize(); Grid_finalize();
#endif
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -62,7 +62,7 @@ struct time_statistics{
void comms_header(){ void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t" std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl; <<"bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)"<<std::endl;
}; };
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
@ -125,7 +125,7 @@ public:
lat*mpi_layout[1], lat*mpi_layout[1],
lat*mpi_layout[2], lat*mpi_layout[2],
lat*mpi_layout[3]}); lat*mpi_layout[3]});
std::cout << GridLogMessage<< latt_size <<std::endl;
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors; RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount(); RealD Nnode = Grid.NodeCount();
@ -137,8 +137,8 @@ public:
for(int d=0;d<8;d++){ for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
} }
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@ -189,11 +189,11 @@ public:
// double rbytes = dbytes*0.5; // double rbytes = dbytes*0.5;
double bidibytes = dbytes; double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) << bytes << " \t "
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " <<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min <<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " << "\t\t"<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
} }
@ -202,6 +202,8 @@ public:
return; return;
} }
static void Memory(void) static void Memory(void)
{ {
const int Nvec=8; const int Nvec=8;
@ -222,7 +224,7 @@ public:
uint64_t lmax=32; uint64_t lmax=32;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat) #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){ for(int lat=8;lat<=lmax;lat+=8){
@ -247,11 +249,6 @@ public:
double start=usecond(); double start=usecond();
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
z=a*x-y; z=a*x-y;
autoView( x_v , x, CpuWrite);
autoView( y_v , y, CpuWrite);
autoView( z_v , z, CpuRead);
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
y_v[4]=z_v[4];
} }
double stop=usecond(); double stop=usecond();
double time = (stop-start)/Nloop*1000; double time = (stop-start)/Nloop*1000;
@ -266,6 +263,61 @@ public:
}; };
static void SU4(void)
{
const int Nc4=4;
typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t NN;
uint64_t lmax=32;
#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
NN =Grid.NodeCount();
LatticeSU4 z(&Grid); z=Zero();
LatticeSU4 x(&Grid); x=Zero();
LatticeSU4 y(&Grid); y=Zero();
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
}
};
static double DWF(int Ls,int L) static double DWF(int Ls,int L)
{ {
RealD mass=0.1; RealD mass=0.1;
@ -282,8 +334,9 @@ public:
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L}); Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi()); GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount(); uint64_t NP = TmpGrid->RankCount();
@ -291,11 +344,11 @@ public:
NN_global=NN; NN_global=NN;
uint64_t SHM=NP/NN; uint64_t SHM=NP/NN;
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message //////////// ///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl; std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Nc : "<<Nc<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl; std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl; std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl; std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
@ -324,7 +377,7 @@ public:
typedef LatticeGaugeFieldF Gauge; typedef LatticeGaugeFieldF Gauge;
///////// Source preparation //////////// ///////// Source preparation ////////////
Gauge Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); Gauge Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
Fermion src (FGrid); random(RNG5,src); Fermion src (FGrid); random(RNG5,src);
Fermion src_e (FrbGrid); Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid); Fermion src_o (FrbGrid);
@ -369,7 +422,7 @@ public:
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); double t1=usecond();
uint64_t ncall = 50; uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall)); FGrid->Broadcast(0,&ncall,sizeof(ncall));
@ -387,7 +440,17 @@ public:
FGrid->Barrier(); FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
// Nc=3 gives
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
// double flops=(1344.0*volume)/2;
#if 0
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
#else
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2;
#endif
double flops=(fps*volume)/2;
double mf_hi, mf_lo, mf_err; double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time); timestat.statistics(t_time);
@ -402,6 +465,7 @@ public:
if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops; if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
@ -438,8 +502,9 @@ public:
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L}); Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi()); GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount(); uint64_t NP = TmpGrid->RankCount();
@ -447,7 +512,6 @@ public:
NN_global=NN; NN_global=NN;
uint64_t SHM=NP/NN; uint64_t SHM=NP/NN;
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message //////////// ///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -478,7 +542,7 @@ public:
typedef typename Action::FermionField Fermion; typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge; typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu); Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params; typename Action::ImplParams params;
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
@ -596,11 +660,12 @@ int main (int argc, char ** argv)
#endif #endif
Benchmark::Decomposition(); Benchmark::Decomposition();
int do_su4=1;
int do_memory=1; int do_memory=1;
int do_comms =1; int do_comms =1;
int sel=2; int sel=4;
std::vector<int> L_list({16,24,32}); std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1; int selm1=sel-1;
std::vector<double> wilson; std::vector<double> wilson;
@ -624,7 +689,6 @@ int main (int argc, char ** argv)
dwf4.push_back(result); dwf4.push_back(result);
} }
/*
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl; std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -632,14 +696,13 @@ int main (int argc, char ** argv)
double result = Benchmark::Staggered(L_list[l]) ; double result = Benchmark::Staggered(L_list[l]) ;
staggered.push_back(result); staggered.push_back(result);
} }
*/
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl; std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){ for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl; std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
} }
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -651,6 +714,13 @@ int main (int argc, char ** argv)
Benchmark::Memory(); Benchmark::Memory();
} }
if ( do_su4 ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::SU4();
}
if ( do_comms && (NN>1) ) { if ( do_comms && (NN>1) ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl; std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
@ -661,9 +731,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 " <<std::endl; std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " <<std::endl;
for(int l=0;l<L_list.size();l++){ for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl; std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
} }
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

View File

@ -94,8 +94,8 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount(); RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode; RealD ppn = Nrank/Nnode;
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8); std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8); std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){ for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls); xbuf[mu].resize(lat*lat*lat*Ls);

View File

@ -0,0 +1,260 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_comms.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes\t\t"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
int Nloop=250;
int nmu=0;
int maxlat=32;
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
}
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
int ncomm;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
double start=usecond();
for(int i=0;i<Nloop;i++){
ncomm=0;
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
Grid.Barrier();
double stop=usecond();
double mean=(stop-start)/Nloop;
double dbytes = bytes*ppn;
double xbytes = dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
<<std::right<< xbytes/mean<<" "
<< "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
}
}
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory "<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
}
int ncomm;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
double start=usecond();
for(int i=0;i<Nloop;i++){
ncomm=0;
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
Grid.Barrier();
double stop=usecond();
double mean=(stop-start)/Nloop;
double dbytes = bytes*ppn;
double xbytes = dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
<<std::right<< xbytes/mean<<" "
<< "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
}
}
for(int d=0;d<8;d++){
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
Grid_finalize();
}

View File

@ -108,7 +108,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Drawing gauge field" << std::endl; std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl; std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0 #if 0
Umu=1.0; Umu=1.0;

View File

@ -0,0 +1,364 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=8;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

View File

@ -63,7 +63,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Drawing gauge field" << std::endl; std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid); LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl; std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
RealD mass=0.1; RealD mass=0.1;

View File

@ -30,7 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
int main (int argc, char ** argv) int main (int argc, char ** argv)
@ -53,7 +53,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Seeded"<<std::endl; std::cout << GridLogMessage << "Seeded"<<std::endl;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "made random gauge fields"<<std::endl; std::cout << GridLogMessage << "made random gauge fields"<<std::endl;

View File

@ -36,12 +36,12 @@ int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
#define LMAX (48) #define LMAX (40)
#define LMIN (8) #define LMIN (8)
#define LADD (8) #define LADD (8)
int64_t Nwarm=50; int64_t Nwarm=10;
int64_t Nloop=500; int64_t Nloop=100;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
@ -118,6 +118,41 @@ int main (int argc, char ** argv)
} }
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z=z+ x*y"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
for(int64_t i=0;i<Nwarm;i++){
z=z+x*y;
}
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
z=z+x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=4*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@ -143,7 +178,6 @@ int main (int argc, char ** argv)
double start=usecond(); double start=usecond();
for(int64_t i=0;i<Nloop;i++){ for(int64_t i=0;i<Nloop;i++){
mult(z,x,y); mult(z,x,y);
// mac(z,x,y);
} }
double stop=usecond(); double stop=usecond();
double time = (stop-start)/Nloop*1000.0; double time = (stop-start)/Nloop*1000.0;
@ -191,7 +225,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 CovShiftForward(z,x,y)"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 CovShiftForward(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl; std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){ for(int lat=LMIN;lat<=LMAX;lat+=LADD){
@ -216,16 +250,15 @@ int main (int argc, char ** argv)
double bytes=3*vol*Nc*Nc*sizeof(Complex); double bytes=3*vol*Nc*Nc*sizeof(Complex);
double ncbytes=5*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol; double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<ncbytes/time<<"\t\t" << flops/time<<std::endl; std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
} }
} }
#if 1 #if 1
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z= x * Cshift(y)"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z= x * Cshift(y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl; std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){ for(int lat=LMIN;lat<=LMAX;lat+=LADD){
@ -259,11 +292,10 @@ int main (int argc, char ** argv)
tmult = tmult /Nloop; tmult = tmult /Nloop;
double bytes=3*vol*Nc*Nc*sizeof(Complex); double bytes=3*vol*Nc*Nc*sizeof(Complex);
double ncbytes=5*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol; double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl; std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
time = time * 1000; // convert to NS for GB/s time = time * 1000; // convert to NS for GB/s
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" <<ncbytes/time<<"\t\t" << flops/time<<std::endl; std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
} }
} }
#endif #endif

View File

@ -187,7 +187,8 @@ int main (int argc, char ** argv)
auto xx = coalescedRead(x_v[ss]); auto xx = coalescedRead(x_v[ss]);
auto yy = coalescedRead(y_v[ss]); auto yy = coalescedRead(y_v[ss]);
auto zz = coalescedRead(z_v[ss]); auto zz = coalescedRead(z_v[ss]);
zz = zz+xx*yy; //zz = zz+xx*yy;
mac(&zz,&xx,&yy);
coalescedWrite(z_v[ss],zz); coalescedWrite(z_v[ss],zz);
}); });
} }

View File

@ -123,6 +123,24 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac esac
############### Nc
AC_ARG_ENABLE([Nc],
[AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
[ac_Nc=${enable_Nc}], [ac_Nc=3])
case ${ac_Nc} in
2)
AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
3)
AC_DEFINE([Config_Nc],[3],[Gauge group Nc]);;
4)
AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);;
5)
AC_DEFINE([Config_Nc],[5],[Gauge group Nc]);;
*)
AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
esac
############### FP16 conversions ############### FP16 conversions
AC_ARG_ENABLE([sfw-fp16], AC_ARG_ENABLE([sfw-fp16],
[AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])], [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
@ -135,18 +153,28 @@ case ${ac_SFW_FP16} in
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac esac
############### SUMMIT JSRUN ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
AC_ARG_ENABLE([summit], AC_ARG_ENABLE([accelerator-cshift],
[AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])], [AC_HELP_STRING([--enable-accelerator-cshift=yes|no], [run cshift on the device])],
[ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no]) [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
case ${ac_SUMMIT} in
no);; AC_ARG_ENABLE([ucx-buggy],
[AC_HELP_STRING([--enable-ucx-buggy=yes|no], [enable workaround for UCX device buffer bugs])],
[ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
case ${ac_UCXBUGGY} in
yes) yes)
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);; ac_ACC_CSHIFT=no;;
*) *);;
AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
esac esac
case ${ac_ACC_CSHIFT} in
yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
*);;
esac
############### SYCL/CUDA/HIP/none ############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator], AC_ARG_ENABLE([accelerator],
[AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])], [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
@ -163,8 +191,9 @@ case ${ac_ACCELERATOR} in
echo HIP acceleration echo HIP acceleration
AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);; AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);;
none) none)
echo NO acceleration echo NO acceleration ;;
;; no)
echo NO acceleration ;;
*) *)
AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);; AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);;
esac esac
@ -330,12 +359,18 @@ case ${CXXTEST} in
fi fi
;; ;;
hipcc) hipcc)
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" # CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
CXXLD=${CXX} CXXLD=${CXX}
if test $ac_openmp = yes; then if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
fi fi
;; ;;
dpcpp)
LDFLAGS="$LDFLAGS"
CXXFLAGS="$CXXFLAGS"
CXXLD=${CXX}
;;
*) *)
CXXLD=${CXX} CXXLD=${CXX}
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
@ -453,27 +488,26 @@ esac
AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS" AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS" AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
############### Precision selection ###### PRECISION ALWAYS DOUBLE
AC_ARG_ENABLE([precision], AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
[AC_HELP_STRING([--enable-precision=single|double],
[Select default word size of Real])],
[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
case ${ac_PRECISION} in #########################################################
single) ###################### set GPU device to rank in node ##
AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] ) #########################################################
;; AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice],
double) [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no])
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) case ${ac_SETDEVICE} in
;; yes);;
*) *)
AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] )
;; ;;
esac esac
###################### Shared memory allocation technique under MPI3 #########################################################
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone], ###################### Shared memory intranode #########
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) #########################################################
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])
case ${ac_SHM} in case ${ac_SHM} in
@ -492,8 +526,12 @@ case ${ac_SHM} in
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] ) AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;; ;;
shmnone) shmnone | no)
AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] ) AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
AC_DEFINE([GRID_SHM_DISABLE],[1],[USE MPI for intranode comms]);;
nvlink)
AC_DEFINE([GRID_MPI3_SHM_NVLINK],[1],[GRID_MPI3_SHM_NVLINK] )
;; ;;
hugetlbfs) hugetlbfs)
@ -512,10 +550,23 @@ AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
[ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/]) [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
############### communication type selection
AC_ARG_ENABLE([comms-threads],[AC_HELP_STRING([--enable-comms-threads | --disable-comms-threads],
[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
case ${ac_COMMS_THREADS} in
yes)
AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
;;
*) ;;
esac
############### communication type selection ############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto], AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto],
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
case ${ac_COMMS} in case ${ac_COMMS} in
none) none)
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
@ -650,6 +701,7 @@ os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor} compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version} compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS ----------------------------------- ----- BUILD OPTIONS -----------------------------------
Nc : ${ac_Nc}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG} SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp} Threading : ${ac_openmp}
Acceleration : ${ac_ACCELERATOR} Acceleration : ${ac_ACCELERATOR}

View File

@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging: This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Debug
#### 2. `Release` #### 2. `Release`
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation): Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Release
#### 3. `MPIDebug` #### 3. `MPIDebug`
Debug configuration with MPI: Debug configuration with MPI:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug
### 5.3 Build Grid ### 5.3 Build Grid

View File

@ -178,15 +178,10 @@ Then enter the cloned directory and set up the build system::
Now you can execute the `configure` script to generate makefiles (here from a build directory):: Now you can execute the `configure` script to generate makefiles (here from a build directory)::
mkdir build; cd build mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \ ../configure --enable-simd=AVX --enable-comms=mpi-auto \
--prefix=<path> --prefix=<path>
where:: ::
--enable-precision=single|double
sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations,
and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.::
--enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX
@ -236,7 +231,7 @@ Detailed build configuration options
--enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). --enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
--enable-simd=code setup Grid for the SIMD target `<code>`(default: `GEN`). A list of possible SIMD targets is detailed in a section below. --enable-simd=code setup Grid for the SIMD target `<code>`(default: `GEN`). A list of possible SIMD targets is detailed in a section below.
--enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes. --enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes.
--enable-precision=single|double set the default precision (default: `double`). --enable-precision=single|double set the default precision (default: `double`). **Deprecated option**
--enable-comms=mpi|none use `<comm>` for message passing (default: `none`). --enable-comms=mpi|none use `<comm>` for message passing (default: `none`).
--enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`). --enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`).
--disable-timers disable system dependent high-resolution timers. --disable-timers disable system dependent high-resolution timers.
@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform
The following configuration is recommended for the Intel Knights Landing platform:: The following configuration is recommended for the Intel Knights Landing platform::
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared
for interior communication. for interior communication.
We recommend four ranks per node for best performance, but optimum is local volume dependent. :: We recommend four ranks per node for best performance, but optimum is local volume dependent. ::
../configure --enable-precision=double\ ../configure --enable-simd=KNL \
--enable-simd=KNL \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CC=icpc MPICXX=mpiicpc CC=icpc MPICXX=mpiicpc
@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform
The following configuration is recommended for the Intel Haswell platform:: The following configuration is recommended for the Intel Haswell platform::
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-mkl \ --enable-mkl \
CXX=icpc MPICXX=mpiicpc CXX=icpc MPICXX=mpiicpc
@ -360,8 +351,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform
The following configuration is recommended for the Intel Skylake platform:: The following configuration is recommended for the Intel Skylake platform::
../configure --enable-precision=double\ ../configure --enable-simd=AVX512 \
--enable-simd=AVX512 \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=mpiicpc CXX=mpiicpc
@ -396,8 +385,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
../configure --enable-precision=double\ ../configure --enable-simd=AVX512 \
--enable-simd=AVX512 \
--enable-comms=mpi \ --enable-comms=mpi \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
@ -422,8 +410,7 @@ and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform:: The following configuration is recommended for the AMD EPYC platform::
../configure --enable-precision=double\ ../configure --enable-simd=AVX2 \
--enable-simd=AVX2 \
--enable-comms=mpi \ --enable-comms=mpi \
CXX=mpicxx CXX=mpicxx

View File

@ -69,7 +69,7 @@ int main (int argc, char ** argv)
std::vector<LatticeColourMatrix> U(4,&Fine); std::vector<LatticeColourMatrix> U(4,&Fine);
SU3::HotConfiguration(pRNGa,Umu); SU<Nc>::HotConfiguration(pRNGa,Umu);
FieldMetaData header; FieldMetaData header;

View File

@ -84,7 +84,7 @@ int main (int argc, char ** argv)
std::vector<LatticeColourMatrix> U(4,&Fine); std::vector<LatticeColourMatrix> U(4,&Fine);
SU3::HotConfiguration(pRNGa,Umu); SU<Nc>::HotConfiguration(pRNGa,Umu);
FieldMetaData header; FieldMetaData header;
std::string file("./ckpoint_lat.4000"); std::string file("./ckpoint_lat.4000");

View File

@ -80,7 +80,7 @@ int main (int argc, char ** argv)
GridParallelRNG sRNG5(sFGrid); sRNG5.SeedFixedIntegers(seeds5); GridParallelRNG sRNG5(sFGrid); sRNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
RealD mass=0.1; RealD mass=0.1;
RealD M5 =1.8; RealD M5 =1.8;

View File

@ -202,7 +202,7 @@ int main (int argc, char ** argv) {
std::vector<int> seeds4({1,2,3,4}); std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
// FieldMetaData header; // FieldMetaData header;
// NerscIO::readConfiguration(Umu,header,Params.config); // NerscIO::readConfiguration(Umu,header,Params.config);

View File

@ -71,7 +71,7 @@ int main (int argc, char ** argv)
LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldD Umu(UGrid);
LatticeGaugeFieldF Umu_f(UGrid_f); LatticeGaugeFieldF Umu_f(UGrid_f);
SU3::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(Umu_f,Umu); precisionChange(Umu_f,Umu);

View File

@ -69,7 +69,7 @@ int main (int argc, char ** argv)
LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldD Umu(UGrid);
LatticeGaugeFieldF Umu_f(UGrid_f); LatticeGaugeFieldF Umu_f(UGrid_f);
SU3::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(Umu_f,Umu); precisionChange(Umu_f,Umu);

View File

@ -64,7 +64,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid); LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid); LatticeFermion err(FGrid);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid); std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){

View File

@ -131,7 +131,7 @@ int main (int argc, char ** argv)
// LatticeFermion result(FGrid); result=Zero(); // LatticeFermion result(FGrid); result=Zero();
// LatticeGaugeField Umu(UGrid); // LatticeGaugeField Umu(UGrid);
// SU3::HotConfiguration(RNG4,Umu); // SU<Nc>::HotConfiguration(RNG4,Umu);
// std::vector<LatticeColourMatrix> U(4,UGrid); // std::vector<LatticeColourMatrix> U(4,UGrid);
// for(int mu=0;mu<Nd;mu++){ // for(int mu=0;mu<Nd;mu++){

View File

@ -69,7 +69,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid); std::vector<LatticeColourMatrix> U(4,UGrid);
RealD mass=0.1; RealD mass=0.1;

View File

@ -73,7 +73,7 @@ int main (int argc, char ** argv)
LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion ref (FGrid); ref = Zero();
LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion tmp (FGrid); tmp = Zero();
LatticeFermion err (FGrid); err = Zero(); LatticeFermion err (FGrid); err = Zero();
LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); LatticeGaugeField Umu (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid); std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y) // Only one non-zero (y)

View File

@ -72,7 +72,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid); tmp=Zero(); LatticeFermion tmp(FGrid); tmp=Zero();
LatticeFermion err(FGrid); tmp=Zero(); LatticeFermion err(FGrid); tmp=Zero();
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid); std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y) // Only one non-zero (y)

View File

@ -29,91 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid.h> #include <Grid/Grid.h>
using namespace Grid; using namespace Grid;
;
void MomentumSpacePropagatorTest(RealD mass,RealD M5, LatticePropagator &prop)
{
// what type LatticeComplex
GridBase *_grid = prop.Grid();
typedef LatticeFermion FermionField;
typedef LatticePropagator PropagatorField;
typedef typename FermionField::vector_type vector_type;
typedef typename FermionField::scalar_type ScalComplex;
typedef iSinglet<ScalComplex> Tcomplex;
typedef Lattice<iSinglet<vector_type> > LatComplex;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Coordinate latt_size = _grid->_fdimensions;
PropagatorField num (_grid); num = Zero();
LatComplex sk(_grid); sk = Zero();
LatComplex sk2(_grid); sk2= Zero();
LatComplex W(_grid); W= Zero();
LatComplex a(_grid); a= Zero();
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
LatComplex denom(_grid); denom= Zero();
LatComplex cosha(_grid);
LatComplex kmu(_grid);
LatComplex Wea(_grid);
LatComplex Wema(_grid);
ScalComplex ci(0.0,1.0);
SpinColourMatrixD identity = ComplexD(1.0);
for(int mu=0;mu<Nd;mu++) {
LatticeCoordinate(kmu,mu);
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
kmu = TwoPiL * kmu;
// kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
sk = sk + sin(kmu) *sin(kmu);
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*identity);
}
W = one - M5 + sk2;
////////////////////////////////////////////
// Cosh alpha -> alpha
////////////////////////////////////////////
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}}
Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) );
num = num + ( one - Wema ) * mass * identity;
denom= ( Wea - one ) + mass*mass * (one - Wema);
prop = num/denom;
}
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@ -222,7 +138,7 @@ int main (int argc, char ** argv)
LatticeGaugeFieldD Umu(&GRID); LatticeGaugeFieldD Umu(&GRID);
SU3::ColdConfiguration(pRNG,Umu); // Unit gauge SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
// Umu=Zero(); // Umu=Zero();
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Wilson test // Wilson test
@ -391,17 +307,6 @@ int main (int argc, char ** argv)
RealD M5 =0.8; RealD M5 =0.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5); DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5);
/////////////////// Test code for (1-m)^2 ///////////////
LatticePropagatorD prop1(&GRID);
LatticePropagatorD prop2(&GRID);
LatticeComplexD ratio(&GRID);
MomentumSpacePropagatorTest(0.0,M5,prop1);
MomentumSpacePropagatorTest(0.3,M5,prop2);
ratio=localNorm2(prop2);
ratio=ratio/localNorm2(prop1);
std::cout << ratio;
/////////////////// Test code for (1-m)^2 factor ///////////////
// Momentum space prop // Momentum space prop
std::cout << " Solving by FFT and Feynman rules" <<std::endl; std::cout << " Solving by FFT and Feynman rules" <<std::endl;
bool fiveD = false; //calculate 4d free propagator bool fiveD = false; //calculate 4d free propagator

View File

@ -73,11 +73,11 @@ int main (int argc, char ** argv)
LatticeColourMatrix xform2(&GRID); // Gauge xform LatticeColourMatrix xform2(&GRID); // Gauge xform
LatticeColourMatrix xform3(&GRID); // Gauge xform LatticeColourMatrix xform3(&GRID); // Gauge xform
SU3::ColdConfiguration(pRNG,Umu); // Unit gauge SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
Uorg=Umu; Uorg=Umu;
Urnd=Umu; Urnd=Umu;
SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge SU<Nc>::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge
Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl; std::cout << " Initial plaquette "<<plaq << std::endl;
@ -121,7 +121,7 @@ int main (int argc, char ** argv)
std::cout<< "* Testing non-unit configuration *" <<std::endl; std::cout<< "* Testing non-unit configuration *" <<std::endl;
std::cout<< "*****************************************************************" <<std::endl; std::cout<< "*****************************************************************" <<std::endl;
SU3::HotConfiguration(pRNG,Umu); // Unit gauge SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge
plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl; std::cout << " Initial plaquette "<<plaq << std::endl;
@ -136,7 +136,7 @@ int main (int argc, char ** argv)
std::cout<< "*****************************************************************" <<std::endl; std::cout<< "*****************************************************************" <<std::endl;
Umu=Urnd; Umu=Urnd;
SU3::HotConfiguration(pRNG,Umu); // Unit gauge SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge
plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
std::cout << " Initial plaquette "<<plaq << std::endl; std::cout << " Initial plaquette "<<plaq << std::endl;

View File

@ -114,7 +114,7 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4); GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4);
GparityGaugeField Umu_2f(UGrid_2f); GparityGaugeField Umu_2f(UGrid_2f);
SU3::HotConfiguration(RNG4_2f,Umu_2f); SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f);
StandardFermionField src (FGrid_2f); StandardFermionField src (FGrid_2f);
StandardFermionField tmpsrc(FGrid_2f); StandardFermionField tmpsrc(FGrid_2f);

View File

@ -61,7 +61,7 @@ int main (int argc, char ** argv)
FermionField ref(&Grid); ref=Zero(); FermionField ref(&Grid); ref=Zero();
FermionField tmp(&Grid); tmp=Zero(); FermionField tmp(&Grid); tmp=Zero();
FermionField err(&Grid); tmp=Zero(); FermionField err(&Grid); tmp=Zero();
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid); std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1; double volume=1;

View File

@ -66,7 +66,7 @@ int main(int argc, char** argv) {
std::cout << GridLogMessage << "*********************************************" std::cout << GridLogMessage << "*********************************************"
<< std::endl; << std::endl;
std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl; std::cout << GridLogMessage << "* Generators for SU(Nc" << std::endl;
std::cout << GridLogMessage << "*********************************************" std::cout << GridLogMessage << "*********************************************"
<< std::endl; << std::endl;
SU3::printGenerators(); SU3::printGenerators();
@ -114,8 +114,8 @@ int main(int argc, char** argv) {
LatticeGaugeField U(grid), V(grid); LatticeGaugeField U(grid), V(grid);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U); SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V); SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V);
// Adjoint representation // Adjoint representation
// Test group structure // Test group structure
@ -123,8 +123,8 @@ int main(int argc, char** argv) {
LatticeGaugeField UV(grid); LatticeGaugeField UV(grid);
UV = Zero(); UV = Zero();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
SU<Nc>::LatticeMatrix Umu = peekLorentz(U,mu); SU3::LatticeMatrix Umu = peekLorentz(U,mu);
SU<Nc>::LatticeMatrix Vmu = peekLorentz(V,mu); SU3::LatticeMatrix Vmu = peekLorentz(V,mu);
pokeLorentz(UV,Umu*Vmu, mu); pokeLorentz(UV,Umu*Vmu, mu);
} }
@ -151,16 +151,16 @@ int main(int argc, char** argv) {
// Check correspondence of algebra and group transformations // Check correspondence of algebra and group transformations
// Create a random vector // Create a random vector
SU<Nc>::LatticeAlgebraVector h_adj(grid); SU3::LatticeAlgebraVector h_adj(grid);
typename AdjointRep<Nc>::LatticeMatrix Ar(grid); typename AdjointRep<Nc>::LatticeMatrix Ar(grid);
random(gridRNG,h_adj); random(gridRNG,h_adj);
h_adj = real(h_adj); h_adj = real(h_adj);
SU_Adjoint<Nc>::AdjointLieAlgebraMatrix(h_adj,Ar); SU_Adjoint<Nc>::AdjointLieAlgebraMatrix(h_adj,Ar);
// Re-extract h_adj // Re-extract h_adj
SU<Nc>::LatticeAlgebraVector h_adj2(grid); SU3::LatticeAlgebraVector h_adj2(grid);
SU_Adjoint<Nc>::projectOnAlgebra(h_adj2, Ar); SU_Adjoint<Nc>::projectOnAlgebra(h_adj2, Ar);
SU<Nc>::LatticeAlgebraVector h_diff = h_adj - h_adj2; SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2;
std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl; std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl;
// Exponentiate // Exponentiate
@ -183,14 +183,14 @@ int main(int argc, char** argv) {
// Construct the fundamental matrix in the group // Construct the fundamental matrix in the group
SU<Nc>::LatticeMatrix Af(grid); SU3::LatticeMatrix Af(grid);
SU<Nc>::FundamentalLieAlgebraMatrix(h_adj,Af); SU3::FundamentalLieAlgebraMatrix(h_adj,Af);
SU<Nc>::LatticeMatrix Ufund(grid); SU3::LatticeMatrix Ufund(grid);
Ufund = expMat(Af, 1.0, 16); Ufund = expMat(Af, 1.0, 16);
// Check unitarity // Check unitarity
SU<Nc>::LatticeMatrix uno_f(grid); SU3::LatticeMatrix uno_f(grid);
uno_f = 1.0; uno_f = 1.0;
SU<Nc>::LatticeMatrix UnitCheck(grid); SU3::LatticeMatrix UnitCheck(grid);
UnitCheck = Ufund * adj(Ufund) - uno_f; UnitCheck = Ufund * adj(Ufund) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck) std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck)
<< std::endl; << std::endl;
@ -311,14 +311,14 @@ int main(int argc, char** argv) {
// Test group structure // Test group structure
// (U_f * V_f)_r = U_r * V_r // (U_f * V_f)_r = U_r * V_r
LatticeGaugeField U2(grid), V2(grid); LatticeGaugeField U2(grid), V2(grid);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U2); SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V2); SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2);
LatticeGaugeField UV2(grid); LatticeGaugeField UV2(grid);
UV2 = Zero(); UV2 = Zero();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
SU<Nc>::LatticeMatrix Umu2 = peekLorentz(U2,mu); SU3::LatticeMatrix Umu2 = peekLorentz(U2,mu);
SU<Nc>::LatticeMatrix Vmu2 = peekLorentz(V2,mu); SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu);
pokeLorentz(UV2,Umu2*Vmu2, mu); pokeLorentz(UV2,Umu2*Vmu2, mu);
} }
@ -345,16 +345,16 @@ int main(int argc, char** argv) {
// Check correspondence of algebra and group transformations // Check correspondence of algebra and group transformations
// Create a random vector // Create a random vector
SU<Nc>::LatticeAlgebraVector h_sym(grid); SU3::LatticeAlgebraVector h_sym(grid);
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid); typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid);
random(gridRNG,h_sym); random(gridRNG,h_sym);
h_sym = real(h_sym); h_sym = real(h_sym);
SU_TwoIndex<Nc,Symmetric>::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym); SU_TwoIndex<Nc,Symmetric>::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym);
// Re-extract h_sym // Re-extract h_sym
SU<Nc>::LatticeAlgebraVector h_sym2(grid); SU3::LatticeAlgebraVector h_sym2(grid);
SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym); SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym);
SU<Nc>::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2; SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl;
@ -379,11 +379,11 @@ int main(int argc, char** argv) {
// Construct the fundamental matrix in the group // Construct the fundamental matrix in the group
SU<Nc>::LatticeMatrix Af_sym(grid); SU3::LatticeMatrix Af_sym(grid);
SU<Nc>::FundamentalLieAlgebraMatrix(h_sym,Af_sym); SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
SU<Nc>::LatticeMatrix Ufund2(grid); SU3::LatticeMatrix Ufund2(grid);
Ufund2 = expMat(Af_sym, 1.0, 16); Ufund2 = expMat(Af_sym, 1.0, 16);
SU<Nc>::LatticeMatrix UnitCheck2(grid); SU3::LatticeMatrix UnitCheck2(grid);
UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f; UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2) std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2)
<< std::endl; << std::endl;
@ -421,14 +421,14 @@ int main(int argc, char** argv) {
// Test group structure // Test group structure
// (U_f * V_f)_r = U_r * V_r // (U_f * V_f)_r = U_r * V_r
LatticeGaugeField U2A(grid), V2A(grid); LatticeGaugeField U2A(grid), V2A(grid);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U2A); SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2A);
SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V2A); SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2A);
LatticeGaugeField UV2A(grid); LatticeGaugeField UV2A(grid);
UV2A = Zero(); UV2A = Zero();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
SU<Nc>::LatticeMatrix Umu2A = peekLorentz(U2,mu); SU3::LatticeMatrix Umu2A = peekLorentz(U2,mu);
SU<Nc>::LatticeMatrix Vmu2A = peekLorentz(V2,mu); SU3::LatticeMatrix Vmu2A = peekLorentz(V2,mu);
pokeLorentz(UV2A,Umu2A*Vmu2A, mu); pokeLorentz(UV2A,Umu2A*Vmu2A, mu);
} }
@ -455,16 +455,16 @@ int main(int argc, char** argv) {
// Check correspondence of algebra and group transformations // Check correspondence of algebra and group transformations
// Create a random vector // Create a random vector
SU<Nc>::LatticeAlgebraVector h_Asym(grid); SU3::LatticeAlgebraVector h_Asym(grid);
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid); typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid);
random(gridRNG,h_Asym); random(gridRNG,h_Asym);
h_Asym = real(h_Asym); h_Asym = real(h_Asym);
SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym); SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym);
// Re-extract h_sym // Re-extract h_sym
SU<Nc>::LatticeAlgebraVector h_Asym2(grid); SU3::LatticeAlgebraVector h_Asym2(grid);
SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym); SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym);
SU<Nc>::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2; SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2;
std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl;
@ -489,11 +489,11 @@ int main(int argc, char** argv) {
// Construct the fundamental matrix in the group // Construct the fundamental matrix in the group
SU<Nc>::LatticeMatrix Af_Asym(grid); SU3::LatticeMatrix Af_Asym(grid);
SU<Nc>::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym);
SU<Nc>::LatticeMatrix Ufund2A(grid); SU3::LatticeMatrix Ufund2A(grid);
Ufund2A = expMat(Af_Asym, 1.0, 16); Ufund2A = expMat(Af_Asym, 1.0, 16);
SU<Nc>::LatticeMatrix UnitCheck2A(grid); SU3::LatticeMatrix UnitCheck2A(grid);
UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f; UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A) std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A)
<< std::endl; << std::endl;

View File

@ -444,7 +444,7 @@ int main(int argc, char **argv) {
// Lattice 12x12 GEMM // Lattice 12x12 GEMM
scFooBar = scFoo * scBar; scFooBar = scFoo * scBar;
// Benchmark some simple operations LatticeSU3 * Lattice SU3. // Benchmark some simple operations LatticeSU<Nc> * Lattice SU<Nc>.
double t0, t1, flops; double t0, t1, flops;
double bytes; double bytes;
int ncall = 5000; int ncall = 5000;

View File

@ -73,7 +73,7 @@ int main (int argc, char ** argv)
LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion ref (FGrid); ref = Zero();
LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion tmp (FGrid); tmp = Zero();
LatticeFermion err (FGrid); err = Zero(); LatticeFermion err (FGrid); err = Zero();
LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); LatticeGaugeField Umu (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid); std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y) // Only one non-zero (y)

View File

@ -55,7 +55,7 @@ int main (int argc, char ** argv)
GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds);
// SU3 colour operatoions // SU<Nc> colour operatoions
LatticeColourMatrix link(grid); LatticeColourMatrix link(grid);
LatticeColourMatrix staple(grid); LatticeColourMatrix staple(grid);
@ -87,10 +87,10 @@ int main (int argc, char ** argv)
link = PeekIndex<LorentzIndex>(Umu,mu); link = PeekIndex<LorentzIndex>(Umu,mu);
for( int subgroup=0;subgroup<SU3::su2subgroups();subgroup++ ) { for( int subgroup=0;subgroup<SU<Nc>::su2subgroups();subgroup++ ) {
// update Even checkerboard // update Even checkerboard
SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); SU<Nc>::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask);
} }

View File

@ -64,7 +64,7 @@ int main (int argc, char ** argv)
FermionField err(&Grid); tmp=Zero(); FermionField err(&Grid); tmp=Zero();
FermionField phi (&Grid); random(pRNG,phi); FermionField phi (&Grid); random(pRNG,phi);
FermionField chi (&Grid); random(pRNG,chi); FermionField chi (&Grid); random(pRNG,chi);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid); std::vector<LatticeColourMatrix> U(4,&Grid);

View File

@ -75,7 +75,7 @@ int main (int argc, char ** argv)
FermionField phi (FGrid); random(pRNG5,phi); FermionField phi (FGrid); random(pRNG5,phi);
FermionField chi (FGrid); random(pRNG5,chi); FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu); LatticeGaugeField Umu(UGrid); SU<Nc>::ColdConfiguration(pRNG4,Umu);
LatticeGaugeField Umua(UGrid); Umua=Umu; LatticeGaugeField Umua(UGrid); Umua=Umu;
double volume=Ls; double volume=Ls;

View File

@ -84,7 +84,7 @@ int main (int argc, char ** argv)
FermionField chi (FGrid); random(pRNG5,chi); FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(pRNG4,Umu); SU<Nc>::HotConfiguration(pRNG4,Umu);
/* /*
for(int mu=1;mu<4;mu++){ for(int mu=1;mu<4;mu++){

View File

@ -83,7 +83,7 @@ int main (int argc, char ** argv)
FermionField chi (FGrid); random(pRNG5,chi); FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeFieldF Umu(UGrid); LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(pRNG4,Umu); SU<Nc>::HotConfiguration(pRNG4,Umu);
/* /*
for(int mu=1;mu<4;mu++){ for(int mu=1;mu<4;mu++){

View File

@ -64,7 +64,7 @@ int main (int argc, char ** argv)
FermionField err(&Grid); tmp=Zero(); FermionField err(&Grid); tmp=Zero();
FermionField phi (&Grid); random(pRNG,phi); FermionField phi (&Grid); random(pRNG,phi);
FermionField chi (&Grid); random(pRNG,chi); FermionField chi (&Grid); random(pRNG,chi);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid); std::vector<LatticeColourMatrix> U(4,&Grid);

106
tests/core/Test_unary.cc Normal file
View File

@ -0,0 +1,106 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_quenched_update.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt({8,8,8,8});
GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt,
GridDefaultSimd(Nd,vComplexD::Nsimd()),
GridDefaultMpi());
GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt,
GridDefaultSimd(Nd,vComplexF::Nsimd()),
GridDefaultMpi());
///////////////////////////////
// Configuration of known size
///////////////////////////////
LatticeColourMatrixD ident(grid);
LatticeColourMatrixD U(grid);
LatticeColourMatrixD tmp(grid);
LatticeColourMatrixD org(grid);
LatticeColourMatrixF UF(gridF);
LatticeGaugeField Umu(grid);
ident =1.0;
// RNG set up for test
std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive
std::vector<int> sseeds({6,7,8,9,10});// then i let it go again
GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds);
SU<Nc>::HotConfiguration(pRNG,Umu);
U = PeekIndex<LorentzIndex>(Umu,0);
org=U;
tmp= U*adj(U) - ident ;
RealD Def1 = norm2( tmp );
std::cout << " Defect1 "<<Def1<<std::endl;
tmp = U - org;
std::cout << "Diff1 "<<norm2(tmp)<<std::endl;
precisionChange(UF,U);
precisionChange(U,UF);
tmp= U*adj(U) - ident ;
RealD Def2 = norm2( tmp );
std::cout << " Defect2 "<<Def2<<std::endl;
tmp = U - org;
std::cout << "Diff2 "<<norm2(tmp)<<std::endl;
U = ProjectOnGroup(U);
tmp= U*adj(U) - ident ;
RealD Def3 = norm2( tmp);
std::cout << " Defect3 "<<Def3<<std::endl;
tmp = U - org;
std::cout << "Diff3 "<<norm2(tmp)<<std::endl;
Grid_finalize();
}

View File

@ -74,7 +74,7 @@ int main(int argc, char **argv)
FermionField chi(&Grid); FermionField chi(&Grid);
random(pRNG, chi); random(pRNG, chi);
LatticeGaugeField Umu(&Grid); LatticeGaugeField Umu(&Grid);
SU3::HotConfiguration(pRNG, Umu); SU<Nc>::HotConfiguration(pRNG, Umu);
std::vector<LatticeColourMatrix> U(4, &Grid); std::vector<LatticeColourMatrix> U(4, &Grid);
double volume = 1; double volume = 1;

View File

@ -70,7 +70,7 @@ int main (int argc, char ** argv)
LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion tmp(&Grid); tmp=Zero();
LatticeFermion err(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero();
LatticeGaugeField Umu(&Grid); LatticeGaugeField Umu(&Grid);
SU3::HotConfiguration(pRNG,Umu); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid); std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1; double volume=1;

View File

@ -71,7 +71,7 @@ int main (int argc, char ** argv)
LatticeFermion ref(&Grid); ref=Zero(); LatticeFermion ref(&Grid); ref=Zero();
LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion tmp(&Grid); tmp=Zero();
LatticeFermion err(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero();
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid); std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1; double volume=1;

Some files were not shown because too many files have changed in this diff Show More