From b562b50196a699587a3477e37fcedc4e3f86861c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 19 May 2015 21:29:07 +0100 Subject: [PATCH 01/27] Reworking to keep intel compiler happy --- benchmarks/Makefile.am | 2 +- lib/algorithms/LinearOperator.h | 2 +- lib/algorithms/approx/bigfloat.h | 2 +- lib/simd/Grid_vComplexD.h | 18 ++++++++++++++---- lib/simd/Grid_vComplexF.h | 18 +++++++++++++----- lib/simd/Grid_vRealD.h | 18 +++++++++++++----- lib/simd/Grid_vRealF.h | 20 +++++++++++++++----- tests/Grid_nersc_io.cc | 6 ++++-- 8 files changed, 62 insertions(+), 24 deletions(-) diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index 77ce7663..ae09a6db 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -16,7 +16,7 @@ Grid_wilson_cg_unprec_LDADD = -lGrid Grid_comms_SOURCES = Grid_comms.cc Grid_comms_LDADD = -lGrid -Grid_su3_SOURCES = Grid_su3.cc +Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc Grid_su3_LDADD = -lGrid Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index bdb6c387..167213ec 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -25,7 +25,7 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////// template class HermitianOperatorBase : public LinearOperatorBase { public: - virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2); + virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2)=0; void AdjOp(const Field &in, Field &out) { Op(in,out); }; diff --git a/lib/algorithms/approx/bigfloat.h b/lib/algorithms/approx/bigfloat.h index aee71d03..a749fa81 100755 --- a/lib/algorithms/approx/bigfloat.h +++ b/lib/algorithms/approx/bigfloat.h @@ -31,7 +31,7 @@ public: bigfloat(const double d) { mpf_init_set_d(x, d); } bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); } ~bigfloat(void) { mpf_clear(x); } - operator const double (void) const { return (double)mpf_get_d(x); } + operator double (void) const { return (double)mpf_get_d(x); } static void setDefaultPrecision(unsigned long dprec) { unsigned long bprec = (unsigned long)(3.321928094 * (double)dprec); mpf_set_default_prec(bprec); diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h index 2b03c825..116c2866 100644 --- a/lib/simd/Grid_vComplexD.h +++ b/lib/simd/Grid_vComplexD.h @@ -345,20 +345,30 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ // REDUCE FIXME must be a cleaner implementation friend inline ComplexD Reduce(const vComplexD & in) { + vComplexD v1,v2; + union { + zvec v; + double f[sizeof(zvec)/sizeof(double)]; + } conv; + #ifdef SSE4 - return ComplexD(in.v[0],in.v[1]); + v1=in; #endif #if defined(AVX1) || defined (AVX2) - vComplexD v1; permute(v1,in,0); // sse 128; paired complex single v1=v1+in; - return ComplexD(v1.v[0],v1.v[1]); #endif #ifdef AVX512 - return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + permute(v2,v1,1); // avx 256; quad complex single + v1=v1+v2; #endif #ifdef QPX +#error #endif + conv.v = v1.v; + return ComplexD(conv.f[0],conv.f[1]); } // Unary negation diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index 713bbdd8..cf74ff03 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -234,26 +234,34 @@ namespace Grid { } friend inline ComplexF Reduce(const vComplexF & in) { + vComplexF v1,v2; + union { + cvec v; + float f[sizeof(cvec)/sizeof(float)]; + } conv; #ifdef SSE4 - vComplexF v1; permute(v1,in,0); // sse 128; paired complex single v1=v1+in; - return ComplexF(v1.v[0],v1.v[1]); #endif #if defined(AVX1) || defined (AVX2) - vComplexF v1,v2; permute(v1,in,0); // sse 128; paired complex single v1=v1+in; permute(v2,v1,1); // avx 256; quad complex single v1=v1+v2; - return ComplexF(v1.v[0],v1.v[1]); #endif #ifdef AVX512 - return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); + permute(v1,in,0); // avx512 octo-complex single + v1=v1+in; + permute(v2,v1,1); + v1=v1+v2; + permute(v2,v1,2); + v1=v1+v2; #endif #ifdef QPX #error #endif + conv.v = v1.v; + return ComplexF(conv.f[0],conv.f[1]); } friend inline vComplexF operator * (const ComplexF &a, vComplexF b){ diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h index 83979d14..a32e579e 100644 --- a/lib/simd/Grid_vRealD.h +++ b/lib/simd/Grid_vRealD.h @@ -210,25 +210,33 @@ namespace Grid { friend inline RealD Reduce(const vRealD & in) { + vRealD v1,v2; + union { + dvec v; + double f[sizeof(dvec)/sizeof(double)]; + } conv; #ifdef SSE4 - vRealD v1; permute(v1,in,0); // sse 128; paired real double v1=v1+in; - return RealD(v1.v[0]); #endif #if defined(AVX1) || defined (AVX2) - vRealD v1,v2; permute(v1,in,0); // avx 256; quad double v1=v1+in; permute(v2,v1,1); v1=v1+v2; - return v1.v[0]; #endif #ifdef AVX512 - return _mm512_reduce_add_pd(in.v); + permute(v1,in,0); // avx 512; octo-double + v1=v1+in; + permute(v2,v1,1); + v1=v1+v2; + permute(v2,v1,2); + v1=v1+v2; #endif #ifdef QPX #endif + conv.v=v1.v; + return conv.f[0]; } // *=,+=,-= operators diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h index 754c5121..05bfa2f5 100644 --- a/lib/simd/Grid_vRealF.h +++ b/lib/simd/Grid_vRealF.h @@ -243,29 +243,39 @@ friend inline void vstore(const vRealF &ret, float *a){ } friend inline RealF Reduce(const vRealF & in) { -#ifdef SSE4 vRealF v1,v2; + union { + fvec v; + float f[sizeof(fvec)/sizeof(double)]; + } conv; +#ifdef SSE4 permute(v1,in,0); // sse 128; quad single v1=v1+in; permute(v2,v1,1); v1=v1+v2; - return v1.v[0]; #endif #if defined(AVX1) || defined (AVX2) - vRealF v1,v2; permute(v1,in,0); // avx 256; octo-double v1=v1+in; permute(v2,v1,1); v1=v1+v2; permute(v2,v1,2); v1=v1+v2; - return v1.v[0]; #endif #ifdef AVX512 - return _mm512_reduce_add_ps(in.v); + permute(v1,in,0); // avx 256; octo-double + v1=v1+in; + permute(v2,v1,1); + v1=v1+v2; + permute(v2,v1,2); + v1=v1+v2; + permute(v2,v1,3); + v1=v1+v2; #endif #ifdef QPX #endif + conv.v=v1.v; + return conv.f[0]; } // *=,+=,-= operators diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc index fbef3cb1..6fe587a6 100644 --- a/tests/Grid_nersc_io.cc +++ b/tests/Grid_nersc_io.cc @@ -13,7 +13,7 @@ int main (int argc, char ** argv) std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - std::vector latt_size ({16,16,16,32}); + std::vector latt_size = GridDefaultLatt(); std::vector clatt_size ({4,4,4,8}); int orthodir=3; int orthosz =latt_size[orthodir]; @@ -44,13 +44,15 @@ int main (int argc, char ** argv) // (1+2+3)=6 = N(N-1)/2 terms LatticeComplex Plaq(&Fine); LatticeComplex cPlaq(&Coarse); + Plaq = zero; +#if 1 for(int mu=1;mu Date: Tue, 19 May 2015 21:29:40 +0100 Subject: [PATCH 02/27] Build a simple kernel to compare intel compiler and clang in simple environment --- benchmarks/Grid_su3_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 benchmarks/Grid_su3_test.cc diff --git a/benchmarks/Grid_su3_test.cc b/benchmarks/Grid_su3_test.cc new file mode 100644 index 00000000..e4d4ec0a --- /dev/null +++ b/benchmarks/Grid_su3_test.cc @@ -0,0 +1,10 @@ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +void su3_test_mult(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y) +{ + z=x*y; +} From 35055ed5c1249c1391a64217a203fe5b3d410b8b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:34:33 +0100 Subject: [PATCH 03/27] Unroll pragma abstraction --- lib/Grid_threads.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Grid_threads.h b/lib/Grid_threads.h index 89a56d80..24581855 100644 --- a/lib/Grid_threads.h +++ b/lib/Grid_threads.h @@ -5,6 +5,8 @@ #define GRID_OMP #endif +#define UNROLL _Pragma("unroll") + #ifdef GRID_OMP #include #define PARALLEL_FOR_LOOP _Pragma("omp parallel for") From d8065816662c56de8741569686947e0b9c03fe06 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:35:46 +0100 Subject: [PATCH 04/27] better comms benchmarking --- benchmarks/Grid_comms.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/Grid_comms.cc b/benchmarks/Grid_comms.cc index e844c5d3..59d709a0 100644 --- a/benchmarks/Grid_comms.cc +++ b/benchmarks/Grid_comms.cc @@ -23,7 +23,7 @@ int main (int argc, char ** argv) - for(int lat=4;lat<=16;lat+=4){ + for(int lat=4;lat<=32;lat+=2){ for(int Ls=1;Ls<=16;Ls*=2){ std::vector latt_size ({lat,lat,lat,lat}); @@ -94,8 +94,7 @@ int main (int argc, char ** argv) std::cout << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"< latt_size ({lat,lat,lat,lat}); From 57a01e6bbb69f3b2d98fb0e060d58987d283f94a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:36:15 +0100 Subject: [PATCH 05/27] Didn't like a print statement --- lib/algorithms/iterative/ConjugateGradient.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index f7f8559e..85b196e9 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -15,7 +15,6 @@ public: Integer MaxIterations; ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { - std::cout << Tolerance< &Linop,const Field &src, Field &psi) {assert(0);}; From c96af471eedb11379b1700fd2667aa462526191a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:36:47 +0100 Subject: [PATCH 06/27] useful to dump assembler --- benchmarks/Grid_su3_test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/Grid_su3_test.cc b/benchmarks/Grid_su3_test.cc index e4d4ec0a..8a7f6246 100644 --- a/benchmarks/Grid_su3_test.cc +++ b/benchmarks/Grid_su3_test.cc @@ -4,7 +4,8 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; -void su3_test_mult(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y) + +void su3_test_mult_routine(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y) { - z=x*y; + mult(z,x,y); } From 3e1d1aff185c7917670893361dfb298e84b7581b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:37:20 +0100 Subject: [PATCH 07/27] Minor change --- benchmarks/Grid_wilson_cg_unprec.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Grid_wilson_cg_unprec.cc b/benchmarks/Grid_wilson_cg_unprec.cc index 2a91864b..93023534 100644 --- a/benchmarks/Grid_wilson_cg_unprec.cc +++ b/benchmarks/Grid_wilson_cg_unprec.cc @@ -30,7 +30,7 @@ int main (int argc, char ** argv) GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); LatticeFermion src(&Grid); random(pRNG,src); - std::cout << "src norm" << norm2(src)< Date: Thu, 21 May 2015 06:37:46 +0100 Subject: [PATCH 08/27] adding two routines containing only a single operation so I can easily see the assembly dump --- benchmarks/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index ae09a6db..fc513e86 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -16,7 +16,7 @@ Grid_wilson_cg_unprec_LDADD = -lGrid Grid_comms_SOURCES = Grid_comms.cc Grid_comms_LDADD = -lGrid -Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc +Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc Grid_su3_expr.cc Grid_su3_LDADD = -lGrid Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc From 874b2eb32de3334528e856d59cb6399cb849c7ae Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:39:00 +0100 Subject: [PATCH 09/27] Compile time select if we do the streaming store copy. Relies on Clang++ eliminating object copies, and other compliers do not necessarily cope. --- lib/lattice/Grid_lattice_arith.h | 61 ++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index f1e566a2..ff966578 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -12,10 +12,13 @@ namespace Grid { conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); - // mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#else + mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } @@ -24,9 +27,13 @@ PARALLEL_FOR_LOOP conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } @@ -35,9 +42,13 @@ PARALLEL_FOR_LOOP conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } template strong_inline @@ -45,9 +56,13 @@ PARALLEL_FOR_LOOP conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); +#endif } } @@ -81,9 +96,13 @@ PARALLEL_FOR_LOOP conformable(lhs,ret); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); +#else + sub(&ret._odata[ss],&lhs._odata[ss],&rhs); +#endif } } template strong_inline @@ -91,9 +110,13 @@ PARALLEL_FOR_LOOP conformable(lhs,ret); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs); vstream(ret._odata[ss],tmp); +#else + add(&ret._odata[ss],&lhs._odata[ss],&rhs); +#endif } } @@ -105,9 +128,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mult(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + mult(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } @@ -116,9 +143,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; mac(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + mac(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } @@ -127,9 +158,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; sub(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + sub(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } template strong_inline @@ -137,9 +172,13 @@ PARALLEL_FOR_LOOP conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES obj1 tmp; add(&tmp,&lhs,&rhs._odata[ss]); vstream(ret._odata[ss],tmp); +#else + add(&ret._odata[ss],&lhs,&rhs._odata[ss]); +#endif } } @@ -148,8 +187,12 @@ PARALLEL_FOR_LOOP conformable(x,y); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES vobj tmp = a*x._odata[ss]+y._odata[ss]; vstream(ret._odata[ss],tmp); +#else + ret._odata[ss]=a*x._odata[ss]+y._odata[ss]; +#endif } } template strong_inline @@ -157,29 +200,25 @@ PARALLEL_FOR_LOOP conformable(x,y); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ +#ifdef STREAMING_STORES vobj tmp = a*x._odata[ss]+b*y._odata[ss]; vstream(ret._odata[ss],tmp); +#else + ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss]; +#endif } } template strong_inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ conformable(x,y); -#pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - vobj tmp = a*x._odata[ss]+y._odata[ss]; - vstream(ret._odata[ss],tmp); - } + axpy(ret,a,x,y); return norm2(ret); } template strong_inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ conformable(x,y); -#pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - vobj tmp = a*x._odata[ss]+b*y._odata[ss]; - vstream(ret._odata[ss],tmp); - } + axpby(ret,a,b,x,y); return norm2(ret); // FIXME implement parallel norm in ss loop } From d8061afe2456dc8c46499bd803e1726e6391fc51 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2015 06:47:05 +0100 Subject: [PATCH 10/27] Streaming store option ifdef --- lib/lattice/Grid_lattice_base.h | 41 ++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index 0016bafa..41f349b7 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -1,6 +1,8 @@ #ifndef GRID_LATTICE_BASE_H #define GRID_LATTICE_BASE_H +#define STREAMING_STORES + namespace Grid { // TODO: @@ -66,8 +68,12 @@ public: { PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ - vobj tmp= eval(ss,expr); +#ifdef STREAMING_STORES + vobj tmp = eval(ss,expr); vstream(_odata[ss] ,tmp); +#else + _odata[ss]=eval(ss,expr); +#endif } return *this; } @@ -75,8 +81,12 @@ PARALLEL_FOR_LOOP { PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ - vobj tmp= eval(ss,expr); +#ifdef STREAMING_STORES + vobj tmp = eval(ss,expr); vstream(_odata[ss] ,tmp); +#else + _odata[ss]=eval(ss,expr); +#endif } return *this; } @@ -84,8 +94,12 @@ PARALLEL_FOR_LOOP { PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ - vobj tmp= eval(ss,expr); +#ifdef STREAMING_STORES + vobj tmp = eval(ss,expr); vstream(_odata[ss] ,tmp); +#else + _odata[ss] = eval(ss,expr); +#endif } return *this; } @@ -97,7 +111,12 @@ PARALLEL_FOR_LOOP _odata.resize(_grid->oSites()); PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ - _odata[ss] = eval(ss,expr); +#ifdef STREAMING_STORES + vobj tmp = eval(ss,expr); + vstream(_odata[ss] ,tmp); +#else + _odata[ss]=_eval(ss,expr); +#endif } }; template @@ -107,7 +126,12 @@ PARALLEL_FOR_LOOP _odata.resize(_grid->oSites()); PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ - _odata[ss] = eval(ss,expr); +#ifdef STREAMING_STORES + vobj tmp = eval(tmp,ss,expr); + vstream(_odata[ss] ,tmp); +#else + _odata[ss]=eval(ss,expr); +#endif } }; template @@ -117,7 +141,12 @@ PARALLEL_FOR_LOOP _odata.resize(_grid->oSites()); PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ - _odata[ss] = eval(ss,expr); +#ifdef STREAMING_STORES + vobj tmp = eval(ss,expr); + vstream(_odata[ss] ,tmp); +#else + _odata[ss]=eval(ss,expr); +#endif } }; From ae58a9ada21a6fb8e05bd254bd2c99861ae95ef9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:30:28 +0100 Subject: [PATCH 11/27] Iterator required --- lib/Grid_init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc index 6af0be84..572672eb 100644 --- a/lib/Grid_init.cc +++ b/lib/Grid_init.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include From 764732944f1501368d07f92fafd5f65901764a47 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:31:15 +0100 Subject: [PATCH 12/27] Cosmetic --- lib/math/Grid_math_arith_mul.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/math/Grid_math_arith_mul.h b/lib/math/Grid_math_arith_mul.h index 04b05bb4..fcdd60ee 100644 --- a/lib/math/Grid_math_arith_mul.h +++ b/lib/math/Grid_math_arith_mul.h @@ -16,19 +16,18 @@ strong_inline void mult(iScalar * __restrict__ ret,const iScalar * template strong_inline void mult(iMatrix * __restrict__ ret,const iMatrix * __restrict__ lhs,const iMatrix * __restrict__ rhs){ for(int c1=0;c1_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]); + mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]); } } - for(int c3=1;c3_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]); } } } - return; + return; } template From a2928321b6c9fa23686208fb18225916980698bc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:32:37 +0100 Subject: [PATCH 13/27] Better pragma use --- lib/lattice/Grid_lattice_arith.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index ff966578..a6f59b56 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -185,7 +185,7 @@ PARALLEL_FOR_LOOP template strong_inline void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ conformable(x,y); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES vobj tmp = a*x._odata[ss]+y._odata[ss]; @@ -198,7 +198,7 @@ PARALLEL_FOR_LOOP template strong_inline void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ conformable(x,y); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES vobj tmp = a*x._odata[ss]+b*y._odata[ss]; From d07a5c084d803450f2379721854b4e61e1faf79d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:33:42 +0100 Subject: [PATCH 14/27] Rely on default constructors --- lib/math/Grid_math_tensors.h | 120 +++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 54 deletions(-) diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h index a0424576..32ca78e5 100644 --- a/lib/math/Grid_math_tensors.h +++ b/lib/math/Grid_math_tensors.h @@ -34,62 +34,64 @@ public: // Scalar no action // template using tensor_reduce_level = typename iScalar::tensor_reduce_level >; - iScalar()=default; + iScalar() = default; + /* + iScalar(const iScalar ©me)=default; + iScalar(iScalar &©me)=default; + iScalar & operator= (const iScalar ©me) = default; + iScalar & operator= (iScalar &©me) = default; + */ iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type iScalar(const Zero &z){ *this = zero; }; - iScalar & operator= (const Zero &hero){ - zeroit(*this); - return *this; - } - friend strong_inline void vstream(iScalar &out,const iScalar &in){ - vstream(out._internal,in._internal); - } + iScalar & operator= (const Zero &hero){ + zeroit(*this); + return *this; + } + friend strong_inline void vstream(iScalar &out,const iScalar &in){ + vstream(out._internal,in._internal); + } + friend strong_inline void zeroit(iScalar &that){ + zeroit(that._internal); + } + friend strong_inline void prefetch(iScalar &that){ + prefetch(that._internal); + } + friend strong_inline void permute(iScalar &out,const iScalar &in,int permutetype){ + permute(out._internal,in._internal,permutetype); + } - - friend strong_inline void zeroit(iScalar &that){ - zeroit(that._internal); - } - friend strong_inline void prefetch(iScalar &that){ - prefetch(that._internal); - } - friend strong_inline void permute(iScalar &out,const iScalar &in,int permutetype){ - permute(out._internal,in._internal,permutetype); - } - - // Unary negation - friend strong_inline iScalar operator -(const iScalar &r) { - iScalar ret; - ret._internal= -r._internal; - return ret; - } - // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour - strong_inline iScalar &operator *=(const iScalar &r) { - *this = (*this)*r; - return *this; - } - strong_inline iScalar &operator -=(const iScalar &r) { - *this = (*this)-r; - return *this; - } - strong_inline iScalar &operator +=(const iScalar &r) { - *this = (*this)+r; - return *this; - } - - strong_inline vtype & operator ()(void) { - return _internal; - } - - strong_inline const vtype & operator ()(void) const { - return _internal; - } - - operator ComplexD () const { return(TensorRemove(_internal)); }; - operator RealD () const { return(real(TensorRemove(_internal))); } - - // convert from a something to a scalar - template::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar + // Unary negation + friend strong_inline iScalar operator -(const iScalar &r) { + iScalar ret; + ret._internal= -r._internal; + return ret; + } + // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour + strong_inline iScalar &operator *=(const iScalar &r) { + *this = (*this)*r; + return *this; + } + strong_inline iScalar &operator -=(const iScalar &r) { + *this = (*this)-r; + return *this; + } + strong_inline iScalar &operator +=(const iScalar &r) { + *this = (*this)+r; + return *this; + } + strong_inline vtype & operator ()(void) { + return _internal; + } + strong_inline const vtype & operator ()(void) const { + return _internal; + } + + operator ComplexD () const { return(TensorRemove(_internal)); }; + operator RealD () const { return(real(TensorRemove(_internal))); } + + // convert from a something to a scalar + template::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar { _internal = vtype(arg); return *this; @@ -125,6 +127,12 @@ public: enum { TensorLevel = GridTypeMapper::TensorLevel + 1}; iVector(const Zero &z){ *this = zero; }; iVector() =default; + /* + iVector(const iVector ©me)=default; + iVector(iVector &©me)=default; + iVector & operator= (const iVector ©me) = default; + iVector & operator= (iVector &©me) = default; + */ iVector & operator= (const Zero &hero){ zeroit(*this); @@ -205,8 +213,12 @@ public: iMatrix(const Zero &z){ *this = zero; }; iMatrix() =default; - - + /* + iMatrix(const iMatrix ©me)=default; + iMatrix(iMatrix &©me)=default; + iMatrix & operator= (const iMatrix ©me) = default; + iMatrix & operator= (iMatrix &©me) = default; + */ iMatrix & operator= (const Zero &hero){ zeroit(*this); return *this; From 65f2e6b2696394992656f8a6a01c4ffd80507488 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:34:16 +0100 Subject: [PATCH 15/27] Improving even odd sector; lot of work and through required cleaning this --- lib/qcd/Grid_qcd_wilson_dop.cc | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index 9ef4af0c..2020a9ec 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -97,45 +97,44 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out) { - Dhop(in,out,0); + this->Dhop(in,out,0); out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun return norm2(out); } RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out) { - Dhop(in,out,1); + this->Dhop(in,out,1); out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun return norm2(out); } void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out) { - Dhop(in,out,0); + this->Dhop(in,out,0); out = 0.5*out; // FIXME : scale factor in Dhop } void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out) { - Dhop(in,out,1); + this->Dhop(in,out,1); + out = 0.5*out; // FIXME : scale factor in Dhop } void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out) { out = (4.0+mass)*in; return ; } -void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out) -{ - out = (1.0/(4.0+mass))*in; - return ; -} void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out) +{ + this->Mooee(in,out); +} +void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out) { out = (1.0/(4.0+mass))*in; return ; } void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) { - out = (1.0/(4.0+mass))*in; - return ; + this->MooeeInv(in,out); } void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out) @@ -451,8 +450,6 @@ PARALLEL_FOR_LOOP } - - - -}} +} +} From b8fdb65fbf89273f8cfd7e7b8691bf16a8ea89b1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:34:50 +0100 Subject: [PATCH 16/27] More targets --- scripts/configure-all | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/configure-all b/scripts/configure-all index f42b1960..ad91034d 100755 --- a/scripts/configure-all +++ b/scripts/configure-all @@ -1,6 +1,6 @@ #!/bin/bash -DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse" +DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp" for D in $DIRS do From 73ee36c48d14d02c5be8937b84eb6eda1778b4e6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:35:37 +0100 Subject: [PATCH 17/27] Extra targets --- scripts/configure-commands | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/configure-commands b/scripts/configure-commands index c7c35e1c..538f3661 100755 --- a/scripts/configure-commands +++ b/scripts/configure-commands @@ -25,6 +25,12 @@ g++-avx) icpc-avx) CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none ;; +icpc-avx-openmp-mpi) +CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi +;; +icpc-avx-openmp) +CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi +;; icpc-avx2) CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none ;; From 280627334076bfafe30a77b30a5d1eab824f98cd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 23 May 2015 09:36:01 +0100 Subject: [PATCH 18/27] Added --- benchmarks/Grid_su3_expr.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 benchmarks/Grid_su3_expr.cc diff --git a/benchmarks/Grid_su3_expr.cc b/benchmarks/Grid_su3_expr.cc new file mode 100644 index 00000000..e20bcef4 --- /dev/null +++ b/benchmarks/Grid_su3_expr.cc @@ -0,0 +1,11 @@ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +void su3_test_mult_expression(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y) +{ + z=x*y; +} + From 17a06af1ff2c79edd662cc436ddf1ba061fe2f31 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 13:42:12 +0100 Subject: [PATCH 19/27] red black fix --- lib/Grid_stencil.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Grid_stencil.h b/lib/Grid_stencil.h index ce987746..fa22361b 100644 --- a/lib/Grid_stencil.h +++ b/lib/Grid_stencil.h @@ -120,8 +120,8 @@ namespace Grid { // Map to always positive shift modulo global full dimension. int shift = (displacement+fd)%fd; - int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (checkerboard== _checkerboard); + // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); + assert (source.checkerboard== _checkerboard); // the permute type int simd_layout = _grid->_simd_layout[dimension]; From 9b5633ff4f6b13bfd07cebffadbe7bd70177efea Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 13:42:36 +0100 Subject: [PATCH 20/27] Herm op --- lib/algorithms/LinearOperator.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 167213ec..cb6240ba 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -106,6 +106,10 @@ namespace Grid { public: virtual void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) = 0; }; + template class HermitianOperatorFunction { + public: + virtual void operator() (HermitianOperatorBase &Linop, const Field &in, Field &out) = 0; + }; // FIXME : To think about From 29f72292ba44ec5408b73dc94da2cedc5a03b3c0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 13:43:58 +0100 Subject: [PATCH 21/27] Updates now schur red black solver working --- lib/algorithms/SparseMatrix.h | 18 ++--- lib/algorithms/iterative/ConjugateGradient.h | 49 ++++++++----- lib/algorithms/iterative/SchurRedBlack.h | 73 +++++++++++--------- 3 files changed, 82 insertions(+), 58 deletions(-) diff --git a/lib/algorithms/SparseMatrix.h b/lib/algorithms/SparseMatrix.h index 6c54fc92..7bfe959b 100644 --- a/lib/algorithms/SparseMatrix.h +++ b/lib/algorithms/SparseMatrix.h @@ -10,6 +10,7 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////// template class SparseMatrixBase { public: + GridBase *_grid; // Full checkerboar operations virtual RealD M (const Field &in, Field &out)=0; virtual RealD Mdag (const Field &in, Field &out)=0; @@ -18,6 +19,7 @@ namespace Grid { ni=M(in,tmp); no=Mdag(tmp,out); } + SparseMatrixBase(GridBase *grid) : _grid(grid) {}; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -25,7 +27,7 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////// template class CheckerBoardedSparseMatrixBase : public SparseMatrixBase { public: - + GridBase *_cbgrid; // half checkerboard operaions virtual void Meooe (const Field &in, Field &out)=0; virtual void Mooee (const Field &in, Field &out)=0; @@ -44,9 +46,7 @@ namespace Grid { Meooe(out,tmp); Mooee(in,out); - out=out-tmp; // axpy_norm - RealD n=norm2(out); - return n; + return axpy_norm(out,-1.0,tmp,out); } virtual RealD MpcDag (const Field &in, Field &out){ Field tmp(in._grid); @@ -56,15 +56,15 @@ namespace Grid { MeooeDag(out,tmp); MooeeDag(in,out); - out=out-tmp; // axpy_norm - RealD n=norm2(out); - return n; + return axpy_norm(out,-1.0,tmp,out); } - virtual void MpcDagMpc(const Field &in, Field &out,RealD ni,RealD no) { + virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { Field tmp(in._grid); ni=Mpc(in,tmp); - no=Mpc(tmp,out); + no=MpcDag(tmp,out); + // std::cout<<"MpcDagMpc "<(grid), _cbgrid(cbgrid) {}; }; } diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index 85b196e9..d4f89662 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -9,17 +9,21 @@ namespace Grid { ///////////////////////////////////////////////////////////// template - class ConjugateGradient : public OperatorFunction { + class ConjugateGradient : public HermitianOperatorFunction { public: RealD Tolerance; Integer MaxIterations; - + int verbose; ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { + verbose=0; }; - void operator() (LinearOperatorBase &Linop,const Field &src, Field &psi) {assert(0);}; + void operator() (HermitianOperatorBase &Linop,const Field &src, Field &psi){ + psi.checkerboard = src.checkerboard; + conformable(psi,src); + RealD cp,c,a,d,b,ssq,qq,b_pred; Field p(src); @@ -37,14 +41,16 @@ public: a =norm2(p); cp =a; ssq=norm2(src); - - std::cout < sol_e = M_ee^-1 * ( src_e - Meo sol_o )... * */ - namespace Grid { /////////////////////////////////////////////////////////////////////////////////////////////////////// // Take a matrix and form a Red Black solver calling a Herm solver // Use of RB info prevents making SchurRedBlackSolve conform to standard interface /////////////////////////////////////////////////////////////////////////////////////////////////////// - template class SchurRedBlackSolve : public OperatorFunction{ + template class SchurRedBlackSolve { private: - SparseMatrixBase & _Matrix; - OperatorFunction & _HermitianRBSolver; + HermitianOperatorFunction & _HermitianRBSolver; int CBfactorise; public: ///////////////////////////////////////////////////// // Wrap the usual normal equations Schur trick ///////////////////////////////////////////////////// - SchurRedBlackSolve(SparseMatrixBase &Matrix, OperatorFunction &HermitianRBSolver) - : _Matrix(Matrix), _HermitianRBSolver(HermitianRBSolver) { + SchurRedBlackSolve(HermitianOperatorFunction &HermitianRBSolver) : + _HermitianRBSolver(HermitianRBSolver) + { CBfactorise=0; }; - void operator() (const Field &in, Field &out){ + template + void operator() (Matrix & _Matrix,const Field &in, Field &out){ // FIXME CGdiagonalMee not implemented virtual function - // FIXME need to make eo grid from full grid. // FIXME use CBfactorise to control schur decomp - const int Even=0; - const int Odd =1; - - // Make a cartesianRedBlack from full Grid - GridRedBlackCartesian grid(in._grid); + GridBase *grid = _Matrix._cbgrid; + GridBase *fgrid= _Matrix._grid; - Field src_e(&grid); - Field src_o(&grid); - Field sol_e(&grid); - Field sol_o(&grid); - Field tmp(&grid); - Field Mtmp(&grid); - + Field src_e(grid); + Field src_o(grid); + Field sol_e(grid); + Field sol_o(grid); + Field tmp(grid); + Field Mtmp(grid); + Field resid(fgrid); + pickCheckerboard(Even,src_e,in); pickCheckerboard(Odd ,src_o,in); ///////////////////////////////////////////////////// // src_o = Mdag * (source_o - Moe MeeInv source_e) ///////////////////////////////////////////////////// - _Matrix.MooeeInv(src_e,tmp); // MooeeInv(source[Even],tmp,DaggerNo,Even); - _Matrix.Meooe (tmp,Mtmp); // Meo (tmp,src,Odd,DaggerNo); - tmp=src_o-Mtmp; // axpy (tmp,src,source[Odd],-1.0); - _Matrix.MpcDag(tmp,src_o); // Mprec(tmp,src,Mtmp,DaggerYes); + _Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); + _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); + tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); + _Matrix.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd); ////////////////////////////////////////////////////////////// // Call the red-black solver ////////////////////////////////////////////////////////////// - _HermitianRBSolver(src_o,sol_o); // CGNE_prec_MdagM(solution[Odd],src); + HermitianCheckerBoardedOperator _HermOpEO(_Matrix); + std::cout << "SchurRedBlack solver calling the MpcDagMp solver" < Date: Mon, 25 May 2015 13:44:35 +0100 Subject: [PATCH 22/27] move constants into red black --- lib/cartesian/Grid_cartesian_red_black.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/cartesian/Grid_cartesian_red_black.h b/lib/cartesian/Grid_cartesian_red_black.h index 475b47c2..ff2d3ba8 100644 --- a/lib/cartesian/Grid_cartesian_red_black.h +++ b/lib/cartesian/Grid_cartesian_red_black.h @@ -4,6 +4,13 @@ namespace Grid { + static const int CbRed =0; + static const int CbBlack=1; + static const int Even =CbRed; + static const int Odd =CbBlack; + static const int DaggerNo=0; + static const int DaggerYes=1; + // Specialise this for red black grids storing half the data like a chess board. class GridRedBlackCartesian : public GridBase { @@ -44,6 +51,9 @@ public: return source_cb; } }; + + GridRedBlackCartesian(GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors) {}; + GridRedBlackCartesian(std::vector &dimensions, std::vector &simd_layout, std::vector &processor_grid ) : GridBase(processor_grid) From 3358a77c7a111193b6310ed7b68b2cee48d34775 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 13:45:08 +0100 Subject: [PATCH 23/27] Better checkerboard tracking. --- lib/lattice/Grid_lattice_ET.h | 41 ++++++++++++++++++++++++++ lib/lattice/Grid_lattice_arith.h | 28 ++++++++++++++++-- lib/lattice/Grid_lattice_base.h | 50 ++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 2 deletions(-) diff --git a/lib/lattice/Grid_lattice_ET.h b/lib/lattice/Grid_lattice_ET.h index b0205f2f..2a7172ea 100644 --- a/lib/lattice/Grid_lattice_ET.h +++ b/lib/lattice/Grid_lattice_ET.h @@ -91,6 +91,47 @@ inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression< GridFromExpression(grid,std::get<2>(expr.second)); } + +////////////////////////////////////////////////////////////////////////// +// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion +////////////////////////////////////////////////////////////////////////// +template::value, T1>::type * =nullptr > +inline void CBFromExpression(int &cb,const T1& lat) // Lattice leaf +{ + if ( (cb==Odd) || (cb==Even) ) { + assert(cb==lat.checkerboard); + } + cb=lat.checkerboard; + // std::cout<<"Lattice leaf cb "<::value, T1>::type * = nullptr > +inline void CBFromExpression(int &cb,const T1& notlat) // non-lattice leaf +{ + // std::cout<<"Non lattice leaf cb"< +inline void CBFromExpression(int &cb,const LatticeUnaryExpression &expr) +{ + CBFromExpression(cb,std::get<0>(expr.second));// recurse + // std::cout<<"Unary node cb "< +inline void CBFromExpression(int &cb,const LatticeBinaryExpression &expr) +{ + CBFromExpression(cb,std::get<0>(expr.second));// recurse + CBFromExpression(cb,std::get<1>(expr.second)); + // std::cout<<"Binary node cb "< +inline void CBFromExpression( int &cb,const LatticeTrinaryExpression &expr) +{ + CBFromExpression(cb,std::get<0>(expr.second));// recurse + CBFromExpression(cb,std::get<1>(expr.second)); + CBFromExpression(cb,std::get<2>(expr.second)); + // std::cout<<"Trinary node cb "< strong_inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + ret.checkerboard = lhs.checkerboard; + conformable(ret,rhs); conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -24,6 +26,8 @@ PARALLEL_FOR_LOOP template strong_inline void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + ret.checkerboard = lhs.checkerboard; + conformable(ret,rhs); conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -39,6 +43,8 @@ PARALLEL_FOR_LOOP template strong_inline void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + ret.checkerboard = lhs.checkerboard; + conformable(ret,rhs); conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -53,6 +59,8 @@ PARALLEL_FOR_LOOP } template strong_inline void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + ret.checkerboard = lhs.checkerboard; + conformable(ret,rhs); conformable(lhs,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -71,6 +79,7 @@ PARALLEL_FOR_LOOP ////////////////////////////////////////////////////////////////////////////////////////////////////// template strong_inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ + ret.checkerboard = lhs.checkerboard; conformable(lhs,ret); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -82,7 +91,8 @@ PARALLEL_FOR_LOOP template strong_inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ - conformable(lhs,ret); + ret.checkerboard = lhs.checkerboard; + conformable(ret,lhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; @@ -93,7 +103,8 @@ PARALLEL_FOR_LOOP template strong_inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ - conformable(lhs,ret); + ret.checkerboard = lhs.checkerboard; + conformable(ret,lhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ #ifdef STREAMING_STORES @@ -107,6 +118,7 @@ PARALLEL_FOR_LOOP } template strong_inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ + ret.checkerboard = lhs.checkerboard; conformable(lhs,ret); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -125,6 +137,7 @@ PARALLEL_FOR_LOOP ////////////////////////////////////////////////////////////////////////////////////////////////////// template strong_inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -140,6 +153,7 @@ PARALLEL_FOR_LOOP template strong_inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -155,6 +169,7 @@ PARALLEL_FOR_LOOP template strong_inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -169,6 +184,7 @@ PARALLEL_FOR_LOOP } template strong_inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -184,6 +200,8 @@ PARALLEL_FOR_LOOP template strong_inline void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + ret.checkerboard = x.checkerboard; + conformable(ret,x); conformable(x,y); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -197,6 +215,8 @@ PARALLEL_FOR_LOOP } template strong_inline void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + ret.checkerboard = x.checkerboard; + conformable(ret,x); conformable(x,y); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -211,12 +231,16 @@ PARALLEL_FOR_LOOP template strong_inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + ret.checkerboard = x.checkerboard; + conformable(ret,x); conformable(x,y); axpy(ret,a,x,y); return norm2(ret); } template strong_inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + ret.checkerboard = x.checkerboard; + conformable(ret,x); conformable(x,y); axpby(ret,a,b,x,y); return norm2(ret); // FIXME implement parallel norm in ss loop diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index 41f349b7..1d3b1efb 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -66,6 +66,16 @@ public: //////////////////////////////////////////////////////////////////////////////// template strong_inline Lattice & operator=(const LatticeUnaryExpression &expr) { + GridBase *egrid(nullptr); + GridFromExpression(egrid,expr); + assert(egrid!=nullptr); + conformable(_grid,egrid); + + int cb=-1; + CBFromExpression(cb,expr); + assert( (cb==Odd) || (cb==Even)); + checkerboard=cb; + PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ #ifdef STREAMING_STORES @@ -79,6 +89,16 @@ PARALLEL_FOR_LOOP } template strong_inline Lattice & operator=(const LatticeBinaryExpression &expr) { + GridBase *egrid(nullptr); + GridFromExpression(egrid,expr); + assert(egrid!=nullptr); + conformable(_grid,egrid); + + int cb=-1; + CBFromExpression(cb,expr); + assert( (cb==Odd) || (cb==Even)); + checkerboard=cb; + PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ #ifdef STREAMING_STORES @@ -92,6 +112,16 @@ PARALLEL_FOR_LOOP } template strong_inline Lattice & operator=(const LatticeTrinaryExpression &expr) { + GridBase *egrid(nullptr); + GridFromExpression(egrid,expr); + assert(egrid!=nullptr); + conformable(_grid,egrid); + + int cb=-1; + CBFromExpression(cb,expr); + assert( (cb==Odd) || (cb==Even)); + checkerboard=cb; + PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ #ifdef STREAMING_STORES @@ -106,8 +136,15 @@ PARALLEL_FOR_LOOP //GridFromExpression is tricky to do template Lattice(const LatticeUnaryExpression & expr): _grid(nullptr){ + GridFromExpression(_grid,expr); assert(_grid!=nullptr); + + int cb=-1; + CBFromExpression(cb,expr); + assert( (cb==Odd) || (cb==Even)); + checkerboard=cb; + _odata.resize(_grid->oSites()); PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ @@ -123,6 +160,12 @@ PARALLEL_FOR_LOOP Lattice(const LatticeBinaryExpression & expr): _grid(nullptr){ GridFromExpression(_grid,expr); assert(_grid!=nullptr); + + int cb=-1; + CBFromExpression(cb,expr); + assert( (cb==Odd) || (cb==Even)); + checkerboard=cb; + _odata.resize(_grid->oSites()); PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ @@ -138,6 +181,12 @@ PARALLEL_FOR_LOOP Lattice(const LatticeTrinaryExpression & expr): _grid(nullptr){ GridFromExpression(_grid,expr); assert(_grid!=nullptr); + + int cb=-1; + CBFromExpression(cb,expr); + assert( (cb==Odd) || (cb==Even)); + checkerboard=cb; + _odata.resize(_grid->oSites()); PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ @@ -169,6 +218,7 @@ PARALLEL_FOR_LOOP return *this; } template strong_inline Lattice & operator = (const Lattice & r){ + this->checkerboard = r.checkerboard; conformable(*this,r); std::cout<<"Lattice operator ="< Date: Mon, 25 May 2015 13:45:32 +0100 Subject: [PATCH 24/27] Most cosmetic --- lib/math/Grid_math_tensors.h | 113 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h index 32ca78e5..4e6752e9 100644 --- a/lib/math/Grid_math_tensors.h +++ b/lib/math/Grid_math_tensors.h @@ -134,66 +134,65 @@ public: iVector & operator= (iVector &©me) = default; */ - iVector & operator= (const Zero &hero){ - zeroit(*this); - return *this; + iVector & operator= (const Zero &hero){ + zeroit(*this); + return *this; + } + friend strong_inline void zeroit(iVector &that){ + for(int i=0;i &that){ - for(int i=0;i &that){ + for(int i=0;i &out,const iVector &in){ + for(int i=0;i &that){ - for(int i=0;i &out,const iVector &in,int permutetype){ + for(int i=0;i &out,const iVector &in){ - for(int i=0;i operator -(const iVector &r) { + iVector ret; + for(int i=0;i &operator *=(const iScalar &r) { + *this = (*this)*r; + return *this; + } + strong_inline iVector &operator -=(const iVector &r) { + *this = (*this)-r; + return *this; + } + strong_inline iVector &operator +=(const iVector &r) { + *this = (*this)+r; + return *this; + } + strong_inline vtype & operator ()(int i) { + return _internal[i]; + } + strong_inline const vtype & operator ()(int i) const { + return _internal[i]; + } + friend std::ostream& operator<< (std::ostream& stream, const iVector &o){ + stream<< "V<"<{"; + for(int i=0;i &out,const iVector &in,int permutetype){ - for(int i=0;i operator -(const iVector &r) { - iVector ret; - for(int i=0;i &operator *=(const iScalar &r) { - *this = (*this)*r; - return *this; - } - strong_inline iVector &operator -=(const iVector &r) { - *this = (*this)-r; - return *this; - } - strong_inline iVector &operator +=(const iVector &r) { - *this = (*this)+r; - return *this; - } - strong_inline vtype & operator ()(int i) { - return _internal[i]; - } - strong_inline const vtype & operator ()(int i) const { - return _internal[i]; - } - friend std::ostream& operator<< (std::ostream& stream, const iVector &o){ - stream<< "V<"<{"; - for(int i=0;i class iMatrix @@ -213,6 +212,8 @@ public: iMatrix(const Zero &z){ *this = zero; }; iMatrix() =default; + iMatrix(scalar_type s) { (*this) = s ;};// recurse down and hit the constructor for vector_type + /* iMatrix(const iMatrix ©me)=default; iMatrix(iMatrix &©me)=default; From 1a9841a0f1991b0e4d58c0361936bea74bbc6c3f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 13:46:28 +0100 Subject: [PATCH 25/27] Better EO support letting Schur solver work --- lib/qcd/Grid_qcd.h | 3 - lib/qcd/Grid_qcd_wilson_dop.cc | 317 +++++++++++++++++++-------------- lib/qcd/Grid_qcd_wilson_dop.h | 36 ++-- 3 files changed, 210 insertions(+), 146 deletions(-) diff --git a/lib/qcd/Grid_qcd.h b/lib/qcd/Grid_qcd.h index 0bf68632..959f3529 100644 --- a/lib/qcd/Grid_qcd.h +++ b/lib/qcd/Grid_qcd.h @@ -10,9 +10,6 @@ namespace QCD { static const int Nhs=2; // half spinor static const int Nds=8; // double stored gauge field - static const int CbRed =0; - static const int CbBlack=1; - ////////////////////////////////////////////////////////////////////////////// // QCD iMatrix types // Index conventions: Lorentz x Spin x Colour diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index 2020a9ec..318e18df 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -4,8 +4,8 @@ namespace Grid { namespace QCD { -const std::vector WilsonMatrix::directions ({0,1,2,3, 0, 1, 2, 3,0}); -const std::vector WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1,0}); +const std::vector WilsonMatrix::directions ({0,1,2,3, 0, 1, 2, 3}); +const std::vector WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1}); // Should be in header? const int WilsonMatrix::Xp = 0; @@ -16,7 +16,6 @@ const int WilsonMatrix::Xm = 4; const int WilsonMatrix::Ym = 5; const int WilsonMatrix::Zm = 6; const int WilsonMatrix::Tm = 7; - //const int WilsonMatrix::X0 = 8; class WilsonCompressor { public: @@ -72,20 +71,27 @@ const int WilsonMatrix::Tm = 7; } }; - WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,double _mass) - : Stencil(Umu._grid,npoint,0,directions,displacements), + WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid, double _mass) : + CheckerBoardedSparseMatrixBase(&Fgrid,&Hgrid), + Stencil ( _grid,npoint,Even,directions,displacements), + StencilEven(_cbgrid,npoint,Even,directions,displacements), // source is Even + StencilOdd (_cbgrid,npoint,Odd ,directions,displacements), // source is Odd mass(_mass), - Umu(_Umu._grid) -{ - // Allocate the required comms buffer - grid = _Umu._grid; - comm_buf.resize(Stencil._unified_buffer_size); - DoubleStore(Umu,_Umu); -} + Umu(_grid), + UmuEven(_cbgrid), + UmuOdd (_cbgrid) + { + // Allocate the required comms buffer + comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO + + DoubleStore(Umu,_Umu); + pickCheckerboard(Even,UmuEven,Umu); + pickCheckerboard(Odd ,UmuOdd,Umu); + } void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu) { - LatticeColourMatrix U(grid); + LatticeColourMatrix U(_grid); for(int mu=0;mu(Umu,mu); @@ -97,47 +103,63 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out) { - this->Dhop(in,out,0); + out.checkerboard=in.checkerboard; + Dhop(in,out,DaggerNo); out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun return norm2(out); } RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out) { - this->Dhop(in,out,1); + out.checkerboard=in.checkerboard; + Dhop(in,out,DaggerYes); out = (4+mass)*in - 0.5*out ; // FIXME : axpby_norm! fusion fun return norm2(out); } void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out) { - this->Dhop(in,out,0); - out = 0.5*out; // FIXME : scale factor in Dhop + if ( in.checkerboard == Odd ) { + DhopEO(in,out,DaggerNo); + } else { + DhopOE(in,out,DaggerNo); + } + out = (-0.5)*out; // FIXME : scale factor in Dhop } void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out) { - this->Dhop(in,out,1); - out = 0.5*out; // FIXME : scale factor in Dhop + if ( in.checkerboard == Odd ) { + DhopEO(in,out,DaggerYes); + } else { + DhopOE(in,out,DaggerYes); + } + out = (-0.5)*out; // FIXME : scale factor in Dhop } void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out) { + out.checkerboard = in.checkerboard; out = (4.0+mass)*in; return ; } void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out) { - this->Mooee(in,out); + out.checkerboard = in.checkerboard; + Mooee(in,out); } void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out) { + out.checkerboard = in.checkerboard; out = (1.0/(4.0+mass))*in; return ; } void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) { - this->MooeeInv(in,out); + out.checkerboard = in.checkerboard; + MooeeInv(in,out); } -void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out) +void WilsonMatrix::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) { vHalfSpinColourVector tmp; vHalfSpinColourVector chi; @@ -145,79 +167,78 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out vHalfSpinColourVector Uchi; int offset,local,perm, ptype; - // int ss = Stencil._LebesgueReorder[sss]; - int ssu= ss; + //#define VERBOSE( A) if ( ss<10 ) { std::cout << "site " < > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) { vHalfSpinColourVector tmp; vHalfSpinColourVector chi; @@ -291,78 +315,76 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion & vHalfSpinColourVector Uchi; int offset,local,perm, ptype; - int ssu= ss; - // Xp - offset = Stencil._offsets [Xm][ss]; - local = Stencil._is_local[Xm][ss]; - perm = Stencil._permute[Xm][ss]; + offset = st._offsets [Xm][ss]; + local = st._is_local[Xm][ss]; + perm = st._permute[Xm][ss]; - ptype = Stencil._permute_type[Xm]; + ptype = st._permute_type[Xm]; if ( local && perm ) { spProjXp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { spProjXp(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); + mult(&Uchi(),&U._odata[ss](Xm),&chi()); spReconXp(result,Uchi); // Yp - offset = Stencil._offsets [Ym][ss]; - local = Stencil._is_local[Ym][ss]; - perm = Stencil._permute[Ym][ss]; - ptype = Stencil._permute_type[Ym]; + offset = st._offsets [Ym][ss]; + local = st._is_local[Ym][ss]; + perm = st._permute[Ym][ss]; + ptype = st._permute_type[Ym]; if ( local && perm ) { spProjYp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { spProjYp(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); + mult(&Uchi(),&U._odata[ss](Ym),&chi()); accumReconYp(result,Uchi); // Zp - offset = Stencil._offsets [Zm][ss]; - local = Stencil._is_local[Zm][ss]; - perm = Stencil._permute[Zm][ss]; - ptype = Stencil._permute_type[Zm]; + offset = st._offsets [Zm][ss]; + local = st._is_local[Zm][ss]; + perm = st._permute[Zm][ss]; + ptype = st._permute_type[Zm]; if ( local && perm ) { spProjZp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { spProjZp(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); + mult(&Uchi(),&U._odata[ss](Zm),&chi()); accumReconZp(result,Uchi); // Tp - offset = Stencil._offsets [Tm][ss]; - local = Stencil._is_local[Tm][ss]; - perm = Stencil._permute[Tm][ss]; - ptype = Stencil._permute_type[Tm]; + offset = st._offsets [Tm][ss]; + local = st._is_local[Tm][ss]; + perm = st._permute[Tm][ss]; + ptype = st._permute_type[Tm]; if ( local && perm ) { spProjTp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { spProjTp(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); + mult(&Uchi(),&U._odata[ss](Tm),&chi()); accumReconTp(result,Uchi); // Xm - offset = Stencil._offsets [Xp][ss]; - local = Stencil._is_local[Xp][ss]; - perm = Stencil._permute[Xp][ss]; - ptype = Stencil._permute_type[Xp]; + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; if ( local && perm ) { @@ -371,17 +393,16 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion & } else if ( local ) { spProjXm(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); + mult(&Uchi(),&U._odata[ss](Xp),&chi()); accumReconXm(result,Uchi); - // Ym - offset = Stencil._offsets [Yp][ss]; - local = Stencil._is_local[Yp][ss]; - perm = Stencil._permute[Yp][ss]; - ptype = Stencil._permute_type[Yp]; + offset = st._offsets [Yp][ss]; + local = st._is_local[Yp][ss]; + perm = st._permute[Yp][ss]; + ptype = st._permute_type[Yp]; if ( local && perm ) { spProjYm(tmp,in._odata[offset]); @@ -389,67 +410,97 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion & } else if ( local ) { spProjYm(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); + mult(&Uchi(),&U._odata[ss](Yp),&chi()); accumReconYm(result,Uchi); // Zm - offset = Stencil._offsets [Zp][ss]; - local = Stencil._is_local[Zp][ss]; - perm = Stencil._permute[Zp][ss]; - ptype = Stencil._permute_type[Zp]; + offset = st._offsets [Zp][ss]; + local = st._is_local[Zp][ss]; + perm = st._permute[Zp][ss]; + ptype = st._permute_type[Zp]; if ( local && perm ) { spProjZm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { spProjZm(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); + mult(&Uchi(),&U._odata[ss](Zp),&chi()); accumReconZm(result,Uchi); // Tm - offset = Stencil._offsets [Tp][ss]; - local = Stencil._is_local[Tp][ss]; - perm = Stencil._permute[Tp][ss]; - ptype = Stencil._permute_type[Tp]; + offset = st._offsets [Tp][ss]; + local = st._is_local[Tp][ss]; + perm = st._permute[Tp][ss]; + ptype = st._permute_type[Tp]; if ( local && perm ) { spProjTm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { spProjTm(chi,in._odata[offset]); } else { - chi=comm_buf[offset]; + chi=buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); + mult(&Uchi(),&U._odata[ss](Tp),&chi()); accumReconTm(result,Uchi); vstream(out._odata[ss],result); } -void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) +void WilsonMatrix::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U, + const LatticeFermion &in, LatticeFermion &out,int dag) { - assert((dag==0) ||(dag==1)); - + assert((dag==DaggerNo) ||(dag==DaggerYes)); WilsonCompressor compressor(dag); - Stencil.HaloExchange(in,comm_buf,compressor); - if ( dag ) { + st.HaloExchange(in,comm_buf,compressor); + + if ( dag == DaggerYes ) { PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - DhopSiteDag(sss,in,out); + for(int sss=0;sssoSites();sss++){ + DhopSiteDag(st,U,comm_buf,sss,in,out); } } else { PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - DhopSite(sss,in,out); + for(int sss=0;sssoSites();sss++){ + DhopSite(st,U,comm_buf,sss,in,out); } } } +void WilsonMatrix::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag) +{ + conformable(in._grid,_cbgrid); // verifies half grid + conformable(in._grid,out._grid); // drops the cb check + assert(in.checkerboard==Even); + out.checkerboard = Odd; + DhopInternal(StencilEven,UmuOdd,in,out,dag); } +void WilsonMatrix::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag) +{ + conformable(in._grid,_cbgrid); // verifies half grid + conformable(in._grid,out._grid); // drops the cb check + + assert(in.checkerboard==Odd); + out.checkerboard = Even; + + DhopInternal(StencilOdd,UmuEven,in,out,dag); +} +void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) +{ + conformable(in._grid,_grid); // verifies full grid + conformable(in._grid,out._grid); + + out.checkerboard = in.checkerboard; + + DhopInternal(Stencil,Umu,in,out,dag); } +}} + + + diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index 8d93a926..96b29cd0 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -11,14 +11,20 @@ namespace Grid { //NB r=1; public: double mass; - GridBase *grid; + // GridBase * grid; // Inherited + // GridBase * cbgrid; - // Copy of the gauge field - LatticeDoubledGaugeField Umu; + //Defines the stencils for even and odd + CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; - //Defines the stencil - CartesianStencil Stencil; - static const int npoint=9; + // Copy of the gauge field , with even and odd subsets + LatticeDoubledGaugeField Umu; + LatticeDoubledGaugeField UmuEven; + LatticeDoubledGaugeField UmuOdd; + + static const int npoint=8; static const std::vector directions ; static const std::vector displacements; static const int Xp,Xm,Yp,Ym,Zp,Zm,Tp,Tm; @@ -27,7 +33,7 @@ namespace Grid { std::vector > comm_buf; // Constructor - WilsonMatrix(LatticeGaugeField &Umu,double mass); + WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid,double _mass); // DoubleStore void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu); @@ -45,9 +51,19 @@ namespace Grid { virtual void MooeeInvDag (const LatticeFermion &in, LatticeFermion &out); // non-hermitian hopping term; half cb or both - void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); - void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out); - void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out); + void Dhop (const LatticeFermion &in, LatticeFermion &out,int dag); + void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag); + void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag); + void DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField &U, + const LatticeFermion &in, LatticeFermion &out,int dag); + // These ones will need to be package intelligently. WilsonType base class + // for use by DWF etc.. + void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out); + void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out); typedef iScalar > matrix; From 2ae6214104f945ebcb4d6876e5052bb4dcb24878 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 13:47:12 +0100 Subject: [PATCH 26/27] Schur complement based red-black inversion working --- benchmarks/Grid_wilson.cc | 17 ++- benchmarks/Grid_wilson_cg_prec.cc | 61 +++++++++ benchmarks/Grid_wilson_cg_schur.cc | 48 +++++++ benchmarks/Grid_wilson_cg_unprec.cc | 7 +- benchmarks/Grid_wilson_evenodd.cc | 201 ++++++++++++++++++++++++++++ lib/simd/Grid_sse4.h | 1 + lib/simd/Grid_vector_types.h | 32 ++--- tests/Grid_main.cc | 3 +- 8 files changed, 336 insertions(+), 34 deletions(-) create mode 100644 benchmarks/Grid_wilson_cg_prec.cc create mode 100644 benchmarks/Grid_wilson_cg_schur.cc create mode 100644 benchmarks/Grid_wilson_evenodd.cc diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc index adfb6fd4..32255b3e 100644 --- a/benchmarks/Grid_wilson.cc +++ b/benchmarks/Grid_wilson.cc @@ -24,7 +24,8 @@ int main (int argc, char ** argv) std::vector latt_size = GridDefaultLatt(); std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); int threads = GridThread::GetThreads(); std::cout << "Grid is setup to use "< U(4,&Grid); @@ -51,8 +52,11 @@ int main (int argc, char ** argv) // Only one non-zero (y) Umu=zero; + Complex cone(1.0,0.0); for(int nn=0;nn(Umu,U[nn],nn); } @@ -78,7 +82,7 @@ int main (int argc, char ** argv) } RealD mass=0.1; - WilsonMatrix Dw(Umu,mass); + WilsonMatrix Dw(Umu,Grid,RBGrid,mass); std::cout << "Calling Dw"< + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + LatticeFermion src(&Grid); random(pRNG,src); + RealD nrm = norm2(src); + LatticeFermion result(&Grid); result=zero; + LatticeGaugeField Umu(&Grid); random(pRNG,Umu); + + std::vector U(4,&Grid); + + for(int mu=0;mu(Umu,mu); + } + + RealD mass=0.5; + WilsonMatrix Dw(Umu,Grid,RBGrid,mass); + + // HermitianOperator HermOp(Dw); + // ConjugateGradient CG(1.0e-8,10000); + // CG(HermOp,src,result); + + LatticeFermion src_o(&RBGrid); + LatticeFermion result_o(&RBGrid); + pickCheckerboard(Odd,src_o,src); + result_o=zero; + + HermitianCheckerBoardedOperator HermOpEO(Dw); + ConjugateGradient CG(1.0e-8,10000); + CG(HermOpEO,src_o,result_o); + + Grid_finalize(); +} diff --git a/benchmarks/Grid_wilson_cg_schur.cc b/benchmarks/Grid_wilson_cg_schur.cc new file mode 100644 index 00000000..af630ae1 --- /dev/null +++ b/benchmarks/Grid_wilson_cg_schur.cc @@ -0,0 +1,48 @@ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(&Grid); random(pRNG,Umu); + + LatticeFermion src(&Grid); random(pRNG,src); + LatticeFermion result(&Grid); result=zero; + LatticeFermion resid(&Grid); + + RealD mass=0.5; + WilsonMatrix Dw(Umu,Grid,RBGrid,mass); + + ConjugateGradient CG(1.0e-8,10000); + SchurRedBlackSolve SchurSolver(CG); + + SchurSolver(Dw,src,result); + + Grid_finalize(); +} diff --git a/benchmarks/Grid_wilson_cg_unprec.cc b/benchmarks/Grid_wilson_cg_unprec.cc index 93023534..15302aab 100644 --- a/benchmarks/Grid_wilson_cg_unprec.cc +++ b/benchmarks/Grid_wilson_cg_unprec.cc @@ -24,7 +24,8 @@ int main (int argc, char ** argv) std::vector latt_size = GridDefaultLatt(); std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); @@ -46,11 +47,11 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonMatrix Dw(Umu,mass); + WilsonMatrix Dw(Umu,Grid,RBGrid,mass); + HermitianOperator HermOp(Dw); ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); - Grid_finalize(); } diff --git a/benchmarks/Grid_wilson_evenodd.cc b/benchmarks/Grid_wilson_evenodd.cc new file mode 100644 index 00000000..4bc8c357 --- /dev/null +++ b/benchmarks/Grid_wilson_evenodd.cc @@ -0,0 +1,201 @@ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + int threads = GridThread::GetThreads(); + std::cout << "Grid is setup to use "< seeds({1,2,3,4}); + + GridParallelRNG pRNG(&Grid); + // std::vector seeds({1,2,3,4}); + // pRNG.SeedFixedIntegers(seeds); + pRNG.SeedRandomDevice(); + + LatticeFermion src (&Grid); random(pRNG,src); + LatticeFermion phi (&Grid); random(pRNG,phi); + LatticeFermion chi (&Grid); random(pRNG,chi); + LatticeFermion result(&Grid); result=zero; + LatticeFermion ref(&Grid); ref=zero; + LatticeFermion tmp(&Grid); tmp=zero; + LatticeFermion err(&Grid); tmp=zero; + LatticeGaugeField Umu(&Grid); random(pRNG,Umu); + std::vector U(4,&Grid); + + double volume=1; + for(int mu=0;mu(Umu,U[nn],nn); + } + + RealD mass=0.1; + + WilsonMatrix Dw(Umu,Grid,RBGrid,mass); + + LatticeFermion src_e (&RBGrid); + LatticeFermion src_o (&RBGrid); + LatticeFermion r_e (&RBGrid); + LatticeFermion r_o (&RBGrid); + LatticeFermion r_eo (&Grid); + + const int Even=0; + const int Odd=1; + std::cout<<"=========================================================="< * = < chi | Deo^dag| phi> "< - struct RealPart { - typedef T type; - }; - template - struct RealPart< std::complex >{ + template struct RealPart { + typedef T type; + }; + template struct RealPart< std::complex >{ typedef T type; }; // type alias used to simplify the syntax of std::enable_if - template using Invoke = - typename T::type; - template using EnableIf = - Invoke>; - template using NotEnableIf = - Invoke>; + template using Invoke = typename T::type; + template using EnableIf = Invoke>; + template using NotEnableIf= Invoke>; //////////////////////////////////////////////////////// // Check for complexity with type traits - template - struct is_complex : std::false_type {}; - template < typename T > - struct is_complex< std::complex >: std::true_type {}; + template struct is_complex : std::false_type {}; + template < typename T > struct is_complex< std::complex >: std::true_type {}; //////////////////////////////////////////////////////// // Define the operation templates functors // general forms to allow for vsplat syntax @@ -86,8 +79,6 @@ namespace Grid { Grid_simd(Real a){ vsplat(*this,Scalar_type(a)); }; - - /////////////////////////////////////////////// // mac, mult, sub, add, adj @@ -126,10 +117,6 @@ namespace Grid { friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);} template < class S = Scalar_type, EnableIf, int> = 0 > friend inline void vfalse(vInteger &ret){vsplat(ret,0);} - - - - //////////////////////////////////// // Arithmetic operator overloads +,-,* @@ -165,7 +152,6 @@ namespace Grid { ret.v = binary(a.v,b.v, MultSIMD()); return ret; }; - //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index 10e74099..1a761042 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -1,8 +1,9 @@ #include "Grid.h" //DEBUG +#ifdef SSE4 #include "simd/Grid_vector_types.h" - +#endif using namespace std; using namespace Grid; From 3a6ff2d7b86233ae1da83119c3f97b5cb3177ed7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 25 May 2015 14:43:08 +0100 Subject: [PATCH 27/27] Makefile update --- Makefile.in | 44 +++++++++++++++++++----------- aclocal.m4 | 61 ++++++++++++++++++++++-------------------- benchmarks/Makefile.am | 11 +++++++- config.guess | 2 +- config.sub | 2 +- configure | 13 ++++----- 6 files changed, 79 insertions(+), 54 deletions(-) diff --git a/Makefile.in b/Makefile.in index b6894ef6..b9fba168 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.14.1 from Makefile.am. +# Makefile.in generated by automake 1.15 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2013 Free Software Foundation, Inc. +# Copyright (C) 1994-2014 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -14,7 +14,17 @@ @SET_MAKE@ VPATH = @srcdir@ -am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} am__make_running_with_option = \ case $${target_option-} in \ ?) ;; \ @@ -79,14 +89,12 @@ build_triplet = @build@ host_triplet = @host@ target_triplet = @target@ subdir = . -DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \ - $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ - $(top_srcdir)/configure $(am__configure_deps) COPYING TODO \ - compile config.guess config.sub depcomp install-sh missing ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \ + $(am__configure_deps) $(am__DIST_COMMON) am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ configure.lineno config.status.lineno mkinstalldirs = $(install_sh) -d @@ -149,6 +157,9 @@ ETAGS = etags CTAGS = ctags CSCOPE = cscope DIST_SUBDIRS = $(SUBDIRS) +am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \ + INSTALL NEWS README TODO compile config.guess config.sub \ + depcomp install-sh missing DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) @@ -314,7 +325,6 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu Makefile -.PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ @@ -521,15 +531,15 @@ dist-xz: distdir $(am__post_remove_distdir) dist-tarZ: distdir - @echo WARNING: "Support for shar distribution archives is" \ - "deprecated." >&2 + @echo WARNING: "Support for distribution archives compressed with" \ + "legacy program 'compress' is deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z $(am__post_remove_distdir) dist-shar: distdir - @echo WARNING: "Support for distribution archives compressed with" \ - "legacy program 'compress' is deprecated." >&2 + @echo WARNING: "Support for shar distribution archives is" \ + "deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz $(am__post_remove_distdir) @@ -565,17 +575,17 @@ distcheck: dist esac chmod -R a-w $(distdir) chmod u+w $(distdir) - mkdir $(distdir)/_build $(distdir)/_inst + mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst chmod a-w $(distdir) test -d $(distdir)/_build || exit 0; \ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ && am__cwd=`pwd` \ - && $(am__cd) $(distdir)/_build \ - && ../configure \ + && $(am__cd) $(distdir)/_build/sub \ + && ../../configure \ $(AM_DISTCHECK_CONFIGURE_FLAGS) \ $(DISTCHECK_CONFIGURE_FLAGS) \ - --srcdir=.. --prefix="$$dc_install_base" \ + --srcdir=../.. --prefix="$$dc_install_base" \ && $(MAKE) $(AM_MAKEFLAGS) \ && $(MAKE) $(AM_MAKEFLAGS) dvi \ && $(MAKE) $(AM_MAKEFLAGS) check \ @@ -749,6 +759,8 @@ uninstall-am: maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ pdf-am ps ps-am tags tags-am uninstall uninstall-am +.PRECIOUS: Makefile + # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/aclocal.m4 b/aclocal.m4 index 389763bf..bf79d078 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,6 +1,6 @@ -# generated automatically by aclocal 1.14.1 -*- Autoconf -*- +# generated automatically by aclocal 1.15 -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to. If you have problems, you may need to regenerate the build system entirely. To do so, use the procedure documented by the package, typically 'autoreconf'.])]) -# Copyright (C) 2002-2013 Free Software Foundation, Inc. +# Copyright (C) 2002-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.]) # generated from the m4 files accompanying Automake X.Y. # (This private macro should not be called outside this file.) AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.14' +[am__api_version='1.15' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.14.1], [], +m4_if([$1], [1.15], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], []) # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.14.1])dnl +[AM_AUTOMAKE_VERSION([1.15])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # configured tree to be moved without reconfiguration. AC_DEFUN([AM_AUX_DIR_EXPAND], -[dnl Rely on autoconf to set up CDPATH properly. -AC_PREREQ([2.50])dnl -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` +[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl +# Expand $ac_aux_dir to an absolute path. +am_aux_dir=`cd "$ac_aux_dir" && pwd` ]) # AM_CONDITIONAL -*- Autoconf -*- -# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# Copyright (C) 1997-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE( Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl # Generate code to set up dependency tracking. -*- Autoconf -*- -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], # Do all the work for Automake. -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl # # AC_SUBST([mkdir_p], ['$(MKDIR_P)']) -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. +# We need awk for the "check" target (and possibly the TAP driver). The +# system "awk" is bad on some platforms. AC_REQUIRE([AC_PROG_AWK])dnl AC_REQUIRE([AC_PROG_MAKE_SET])dnl AC_REQUIRE([AM_SET_LEADING_DOT])dnl @@ -573,7 +572,11 @@ to "yes", and re-run configure. END AC_MSG_ERROR([Your 'rm' program is bad, sorry.]) fi -fi]) +fi +dnl The trailing newline in this macro's definition is deliberate, for +dnl backward compatibility and to allow trailing 'dnl'-style comments +dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841. +]) dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further @@ -602,7 +605,7 @@ for _am_header in $config_headers :; do done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co # Define $install_sh. AC_DEFUN([AM_PROG_INSTALL_SH], [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -if test x"${install_sh}" != xset; then +if test x"${install_sh+set}" != xset; then case $am_aux_dir in *\ * | *\ *) install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; @@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then fi AC_SUBST([install_sh])]) -# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# Copyright (C) 2003-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])]) # Check to see how 'make' treats includes. -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -694,7 +697,7 @@ rm -f confinc confmf # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- -# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# Copyright (C) 1997-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -733,7 +736,7 @@ fi # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION], # Check to make sure that the build environment is sane. -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE( rm -f conftest.file ]) -# Copyright (C) 2009-2013 Free Software Foundation, Inc. +# Copyright (C) 2009-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl ]) -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -933,7 +936,7 @@ fi INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006-2013 Free Software Foundation, Inc. +# Copyright (C) 2006-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004-2013 Free Software Foundation, Inc. +# Copyright (C) 2004-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index fc513e86..95ae5eca 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -5,14 +5,23 @@ AM_LDFLAGS = -L$(top_builddir)/lib # # Test code # -bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec +bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec Grid_wilson_evenodd Grid_wilson_cg_prec Grid_wilson_cg_schur Grid_wilson_SOURCES = Grid_wilson.cc Grid_wilson_LDADD = -lGrid +Grid_wilson_evenodd_SOURCES = Grid_wilson_evenodd.cc +Grid_wilson_evenodd_LDADD = -lGrid + Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc Grid_wilson_cg_unprec_LDADD = -lGrid +Grid_wilson_cg_prec_SOURCES = Grid_wilson_cg_prec.cc +Grid_wilson_cg_prec_LDADD = -lGrid + +Grid_wilson_cg_schur_SOURCES = Grid_wilson_cg_schur.cc +Grid_wilson_cg_schur_LDADD = -lGrid + Grid_comms_SOURCES = Grid_comms.cc Grid_comms_LDADD = -lGrid diff --git a/config.guess b/config.guess index 5f6aa02d..a12faba2 120000 --- a/config.guess +++ b/config.guess @@ -1 +1 @@ -/usr/share/automake-1.14/config.guess \ No newline at end of file +/opt/local/share/automake-1.15/config.guess \ No newline at end of file diff --git a/config.sub b/config.sub index 0abfe18c..e3c9b5ca 120000 --- a/config.sub +++ b/config.sub @@ -1 +1 @@ -/usr/share/automake-1.14/config.sub \ No newline at end of file +/opt/local/share/automake-1.15/config.sub \ No newline at end of file diff --git a/configure b/configure index 8c9e8c59..6e27ab11 100755 --- a/configure +++ b/configure @@ -2466,7 +2466,7 @@ test -n "$target_alias" && NONENONEs,x,x, && program_prefix=${target_alias}- -am__api_version='1.14' +am__api_version='1.15' # Find a good install program. We prefer a C program (faster), # so one script is as good as another. But avoid the broken or @@ -2638,8 +2638,8 @@ test "$program_suffix" != NONE && ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` +# Expand $ac_aux_dir to an absolute path. +am_aux_dir=`cd "$ac_aux_dir" && pwd` if test x"${MISSING+set}" != xset; then case $am_aux_dir in @@ -2658,7 +2658,7 @@ else $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;} fi -if test x"${install_sh}" != xset; then +if test x"${install_sh+set}" != xset; then case $am_aux_dir in *\ * | *\ *) install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; @@ -2986,8 +2986,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} # mkdir_p='$(MKDIR_P)' -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. +# We need awk for the "check" target (and possibly the TAP driver). The +# system "awk" is bad on some platforms. # Always define AMTAR for backward compatibility. Yes, it's still used # in the wild :-( We should find a proper way to deprecate it ... AMTAR='$${TAR-tar}' @@ -3046,6 +3046,7 @@ END fi + ac_config_headers="$ac_config_headers lib/Grid_config.h" # Check whether --enable-silent-rules was given.