From 6cec662ac50ae75f9b43f29e1bac40214ff6cf93 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 May 2015 20:41:44 +0100 Subject: [PATCH] Enhanced SIMD interfacing --- benchmarks/Grid_comms.cc | 2 +- benchmarks/Grid_memory_bandwidth.cc | 4 +--- benchmarks/Grid_wilson.cc | 2 +- lib/Grid.h | 2 +- lib/Grid_init.cc | 34 ++++++++++++++--------------- lib/lattice/Grid_lattice_ET.h | 8 +++---- lib/lattice/Grid_lattice_arith.h | 2 +- lib/lattice/Grid_lattice_base.h | 14 +++++++----- lib/simd/Grid_vComplexF.h | 6 ++--- tests/Grid_cshift.cc | 2 +- tests/Grid_gamma.cc | 2 +- tests/Grid_main.cc | 2 +- tests/Grid_nersc_io.cc | 2 +- tests/Grid_simd.cc | 2 +- tests/Grid_stencil.cc | 2 +- 15 files changed, 43 insertions(+), 43 deletions(-) diff --git a/benchmarks/Grid_comms.cc b/benchmarks/Grid_comms.cc index 3dedfb5b..810cfd06 100644 --- a/benchmarks/Grid_comms.cc +++ b/benchmarks/Grid_comms.cc @@ -8,7 +8,7 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); int Nloop=10; diff --git a/benchmarks/Grid_memory_bandwidth.cc b/benchmarks/Grid_memory_bandwidth.cc index cb444128..5a4d1f18 100644 --- a/benchmarks/Grid_memory_bandwidth.cc +++ b/benchmarks/Grid_memory_bandwidth.cc @@ -13,7 +13,7 @@ int main (int argc, char ** argv) int Nloop=1000; - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); std::cout << "===================================================================================================="< latt_size = GridDefaultLatt(); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); GridCartesian Grid(latt_size,simd_layout,mpi_layout); diff --git a/lib/Grid.h b/lib/Grid.h index 4512f443..bd7b371a 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -71,7 +71,7 @@ namespace Grid { // C++11 time facilities better? double usecond(void); - const std::vector &GridDefaultSimd(void); + const std::vector GridDefaultSimd(int dims,int nsimd); const std::vector &GridDefaultLatt(void); const std::vector &GridDefaultMpi(void); const int &GridThreads(void) ; diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc index 9e977064..c804e810 100644 --- a/lib/Grid_init.cc +++ b/lib/Grid_init.cc @@ -27,14 +27,28 @@ namespace Grid { // Convenience functions to access stadard command line arg // driven parallelism controls ////////////////////////////////////////////////////// - static std::vector Grid_default_simd; static std::vector Grid_default_latt; static std::vector Grid_default_mpi; int GridThread::_threads; + const std::vector GridDefaultSimd(int dims,int nsimd) + { + std::vector layout(dims); + int nn=nsimd; + for(int d=dims-1;d>=0;d--){ + if ( nn>=2) { + layout[d]=2; + nn/=2; + } else { + layout[d]=1; + } + } + assert(nn==1); + return layout; + } - const std::vector &GridDefaultSimd(void) {return Grid_default_simd;}; + const std::vector &GridDefaultLatt(void) {return Grid_default_latt;}; const std::vector &GridDefaultMpi(void) {return Grid_default_mpi;}; @@ -71,22 +85,11 @@ void GridCmdOptionIntVector(std::string &str,std::vector & vec) void GridParseLayout(char **argv,int argc, std::vector &latt, - std::vector &simd, std::vector &mpi) { mpi =std::vector({1,1,1,1}); latt=std::vector({8,8,8,8}); -#if defined(SSE4) - simd=std::vector({1,1,1,2}); -#endif -#if defined(AVX1) || defined (AVX2) - simd=std::vector({1,1,2,2}); -#endif -#if defined(AVX512) - simd=std::vector({1,2,2,2}); -#endif - GridThread::SetMaxThreads(); std::string arg; @@ -94,10 +97,6 @@ void GridParseLayout(char **argv,int argc, arg = GridCmdOptionPayload(argv,argv+argc,"--mpi"); GridCmdOptionIntVector(arg,mpi); } - if( GridCmdOptionExists(argv,argv+argc,"--simd") ){ - arg= GridCmdOptionPayload(argv,argv+argc,"--simd"); - GridCmdOptionIntVector(arg,simd); - } if( GridCmdOptionExists(argv,argv+argc,"--grid") ){ arg= GridCmdOptionPayload(argv,argv+argc,"--grid"); GridCmdOptionIntVector(arg,latt); @@ -129,7 +128,6 @@ void Grid_init(int *argc,char ***argv) } GridParseLayout(*argv,*argc, Grid_default_latt, - Grid_default_simd, Grid_default_mpi); } diff --git a/lib/lattice/Grid_lattice_ET.h b/lib/lattice/Grid_lattice_ET.h index d4fd7d05..c0b5f97d 100644 --- a/lib/lattice/Grid_lattice_ET.h +++ b/lib/lattice/Grid_lattice_ET.h @@ -67,6 +67,10 @@ inline void GridFromExpression(GridBase * &grid,const T1& lat) // Lattice leaf } grid=lat._grid; } +template::value, T1>::type * = nullptr > +inline void GridFromExpression(GridBase * &grid,const T1& notlat) // non-lattice leaf +{ +} template inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression &expr) { @@ -86,10 +90,6 @@ inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression< GridFromExpression(grid,std::get<1>(expr.second)); GridFromExpression(grid,std::get<2>(expr.second)); } -template::value, T1>::type * = nullptr > -inline void GridFromExpression(GridBase * &grid,const T1& notlat) // non-lattice leaf -{ -} //////////////////////////////////////////// // Unary operators and funcs diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index c1d58b86..10f31025 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -145,7 +145,7 @@ PARALLEL_FOR_LOOP template inline void axpy(Lattice &ret,sobj a,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); -PARALLEL_FOR_LOOP +#pragma omp parallel for for(int ss=0;ssoSites();ss++){ vobj tmp = a*lhs._odata[ss]; vstream(ret._odata[ss],tmp+rhs._odata[ss]); diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index cd376b32..ace7565d 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -64,7 +64,8 @@ public: //////////////////////////////////////////////////////////////////////////////// template inline Lattice & operator=(const LatticeUnaryExpression &expr) { -PARALLEL_FOR_LOOP + //PARALLEL_FOR_LOOP +#pragma omp parallel for for(int ss=0;ss<_grid->oSites();ss++){ vobj tmp= eval(ss,expr); vstream(_odata[ss] ,tmp); @@ -73,7 +74,8 @@ PARALLEL_FOR_LOOP } template inline Lattice & operator=(const LatticeBinaryExpression &expr) { -PARALLEL_FOR_LOOP + // PARALLEL_FOR_LOOP +#pragma omp parallel for for(int ss=0;ss<_grid->oSites();ss++){ vobj tmp= eval(ss,expr); vstream(_odata[ss] ,tmp); @@ -82,7 +84,8 @@ PARALLEL_FOR_LOOP } template inline Lattice & operator=(const LatticeTrinaryExpression &expr) { -PARALLEL_FOR_LOOP + //PARALLEL_FOR_LOOP +#pragma omp parallel for for(int ss=0;ss<_grid->oSites();ss++){ vobj tmp= eval(ss,expr); vstream(_odata[ss] ,tmp); @@ -176,15 +179,16 @@ PARALLEL_FOR_LOOP }; // class Lattice } -#undef GRID_LATTICE_EXPRESSION_TEMPLATES #include -#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES +#define GRID_LATTICE_EXPRESSION_TEMPLATES +#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES #include #else #include #endif + #include #include diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index 202dce43..cd757916 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -28,9 +28,9 @@ namespace Grid { vzero(*this); return (*this); } - vComplexF( Zero & z){ - vzero(*this); - } + // vComplexF( Zero & z){ + // vzero(*this); + // } vComplexF()=default; vComplexF(ComplexF a){ vsplat(*this,a); diff --git a/tests/Grid_cshift.cc b/tests/Grid_cshift.cc index e92ff793..d0c83e30 100644 --- a/tests/Grid_cshift.cc +++ b/tests/Grid_cshift.cc @@ -9,7 +9,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector latt_size = GridDefaultLatt(); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); GridCartesian Fine(latt_size,simd_layout,mpi_layout); diff --git a/tests/Grid_gamma.cc b/tests/Grid_gamma.cc index f5582955..e803029b 100644 --- a/tests/Grid_gamma.cc +++ b/tests/Grid_gamma.cc @@ -15,7 +15,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector latt_size = GridDefaultLatt(); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); GridCartesian Grid(latt_size,simd_layout,mpi_layout); diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index c3454dfc..7515ebd1 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -26,7 +26,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector latt_size = GridDefaultLatt(); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); latt_size.resize(4); diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc index 73d7edfe..fbef3cb1 100644 --- a/tests/Grid_nersc_io.cc +++ b/tests/Grid_nersc_io.cc @@ -11,7 +11,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); std::vector latt_size ({16,16,16,32}); std::vector clatt_size ({4,4,4,8}); diff --git a/tests/Grid_simd.cc b/tests/Grid_simd.cc index 6a957d7e..32f4cdfb 100644 --- a/tests/Grid_simd.cc +++ b/tests/Grid_simd.cc @@ -107,7 +107,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector latt_size = GridDefaultLatt(); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); GridCartesian Grid(latt_size,simd_layout,mpi_layout); diff --git a/tests/Grid_stencil.cc b/tests/Grid_stencil.cc index d9e779bf..1fdb7265 100644 --- a/tests/Grid_stencil.cc +++ b/tests/Grid_stencil.cc @@ -10,7 +10,7 @@ int main (int argc, char ** argv) std::vector latt_size = GridDefaultLatt(); - std::vector simd_layout = GridDefaultSimd(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];