Mirror of https://github.com/paboyle/Grid.git, synced 2025-10-30 11:34:32 +00:00

	Merge branch 'develop' into feature/json-fix
		
							
								
								
									
.travis.yml (68 changed lines)
@@ -9,68 +9,6 @@ matrix:
     - os:        osx
       osx_image: xcode8.3
       compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 
 before_install:
     - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
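The hunk above drops the Linux gcc and clang entries from the Travis build matrix, including the clang re-configure against MPI that used to follow the serial build. A rough sketch of reproducing that dropped check by hand, reusing only the configure and run lines deleted above (the separate build directory and its location are assumptions of this sketch):

    # Hypothetical local equivalent of the removed Linux/clang CI step
    mkdir -p build && cd build
    ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
    make -j4
    mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1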
							
								
								
									
README.md (31 changed lines)
@@ -1,18 +1,4 @@
-# Grid
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>
+# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [](https://travis-ci.org/paboyle/Grid)
 
 **Data parallel C++ mathematical object library.**
 
@@ -324,6 +310,13 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default. 
 
+#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): 
+
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
+
+TBA
+
+
 ### Build setup for AMD EPYC / RYZEN
 
 The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
@@ -378,6 +371,14 @@ echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
 $@
 ```
 
+Performance:
+
+#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): 
+
+mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+TBA
+
 ### Build setup for BlueGene/Q
 
 To be written...
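The EPYC command added above launches every rank through an ./omp_bind.sh wrapper, of which only the tail (echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY followed by $@) is visible as diff context. A minimal sketch of such a wrapper, assuming one MPI rank per NUMA domain, 8 cores per domain, and OpenMPI's OMPI_COMM_WORLD_LOCAL_RANK variable; the script actually referenced in the README may differ:

    #!/bin/bash
    # Hypothetical per-rank binding wrapper: derive a core range from the
    # local MPI rank, pin this rank's OpenMP threads to it, then run the
    # real command passed in as "$@".
    numanode=$(( ${OMPI_COMM_WORLD_LOCAL_RANK:-0} % 8 ))   # 8 NUMA domains per node (assumption)
    cores_per_numa=8                                       # assumption
    lo=$(( numanode * cores_per_numa ))
    hi=$(( lo + cores_per_numa - 1 ))
    export GOMP_CPU_AFFINITY="$lo-$hi"
    echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
    $@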
							
								
								
									
TODO (16 changed lines)
@@ -2,18 +2,20 @@ TODO:
 ---------------
 
 Large item work list:
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
 
+1)- BG/Q port and check
 2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
-4)- Precision conversion and sort out localConvert      <-- partial
+3)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume
 
 Recent DONE 
 
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
							
								
								
									
benchmarks/Benchmark_ITT.cc (new file, 797 lines)
							| @@ -0,0 +1,797 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./benchmarks/Benchmark_memory_bandwidth.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Grid::QCD; | ||||||
|  |  | ||||||
|  | typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR; | ||||||
|  | typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF; | ||||||
|  | typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | std::vector<int> L_list; | ||||||
|  | std::vector<int> Ls_list; | ||||||
|  | std::vector<double> mflop_list; | ||||||
|  |  | ||||||
|  | double mflop_ref; | ||||||
|  | double mflop_ref_err; | ||||||
|  |  | ||||||
|  | int NN_global; | ||||||
|  |  | ||||||
|  | struct time_statistics{ | ||||||
|  |   double mean; | ||||||
|  |   double err; | ||||||
|  |   double min; | ||||||
|  |   double max; | ||||||
|  |  | ||||||
|  |   void statistics(std::vector<double> v){ | ||||||
|  |       double sum = std::accumulate(v.begin(), v.end(), 0.0); | ||||||
|  |       mean = sum / v.size(); | ||||||
|  |  | ||||||
|  |       std::vector<double> diff(v.size()); | ||||||
|  |       std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); | ||||||
|  |       double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); | ||||||
|  |       err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); | ||||||
|  |  | ||||||
|  |       auto result = std::minmax_element(v.begin(), v.end()); | ||||||
|  |       min = *result.first; | ||||||
|  |       max = *result.second; | ||||||
|  | } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | void comms_header(){ | ||||||
|  |   std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t" | ||||||
|  |             <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | Gamma::Algebra Gmu [] = { | ||||||
|  |   Gamma::Algebra::GammaX, | ||||||
|  |   Gamma::Algebra::GammaY, | ||||||
|  |   Gamma::Algebra::GammaZ, | ||||||
|  |   Gamma::Algebra::GammaT | ||||||
|  | }; | ||||||
|  | struct controls { | ||||||
|  |   int Opt; | ||||||
|  |   int CommsOverlap; | ||||||
|  |   Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; | ||||||
|  |   //  int HugePages; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | class Benchmark { | ||||||
|  | public: | ||||||
|  |   static void Decomposition (void ) { | ||||||
|  |  | ||||||
|  |     int threads = GridThread::GetThreads(); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"; | ||||||
|  |     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static void Comms(void) | ||||||
|  |   { | ||||||
|  |     int Nloop=200; | ||||||
|  |     int nmu=0; | ||||||
|  |     int maxlat=32; | ||||||
|  |  | ||||||
|  |     std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); | ||||||
|  |     std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |  | ||||||
|  |     for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; | ||||||
|  |  | ||||||
|  |     std::vector<double> t_time(Nloop); | ||||||
|  |     time_statistics timestat; | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |     comms_header(); | ||||||
|  |  | ||||||
|  |     for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|  |       for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|  | 	std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|  | 	      lat*mpi_layout[1], | ||||||
|  | 	      lat*mpi_layout[2], | ||||||
|  | 	      lat*mpi_layout[3]}); | ||||||
|  |  | ||||||
|  | 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  | 	RealD Nrank = Grid._Nprocessors; | ||||||
|  | 	RealD Nnode = Grid.NodeCount(); | ||||||
|  | 	RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|  | 	std::vector<HalfSpinColourVectorD *> xbuf(8); | ||||||
|  | 	std::vector<HalfSpinColourVectorD *> rbuf(8); | ||||||
|  | 	Grid.ShmBufferFreeAll(); | ||||||
|  | 	for(int d=0;d<8;d++){ | ||||||
|  | 	  xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	  rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  | 	int ncomm; | ||||||
|  | 	double dbytes; | ||||||
|  | 	std::vector<double> times(Nloop); | ||||||
|  | 	for(int i=0;i<Nloop;i++){ | ||||||
|  |  | ||||||
|  | 	  double start=usecond(); | ||||||
|  |  | ||||||
|  | 	  dbytes=0; | ||||||
|  | 	  ncomm=0; | ||||||
|  |  | ||||||
|  | 	  parallel_for(int dir=0;dir<8;dir++){ | ||||||
|  |  | ||||||
|  | 	    double tbytes; | ||||||
|  | 	    int mu =dir % 4; | ||||||
|  |  | ||||||
|  | 	    if (mpi_layout[mu]>1 ) { | ||||||
|  | 	         | ||||||
|  | 	      int xmit_to_rank; | ||||||
|  | 	      int recv_from_rank; | ||||||
|  | 	      if ( dir == mu ) {  | ||||||
|  | 		int comm_proc=1; | ||||||
|  | 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	      } else {  | ||||||
|  | 		int comm_proc = mpi_layout[mu]-1; | ||||||
|  | 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	      } | ||||||
|  | 	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
|  | 						 (void *)&rbuf[dir][0], recv_from_rank, | ||||||
|  | 						 bytes,dir); | ||||||
|  | 	   | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp atomic | ||||||
|  | #endif | ||||||
|  | 	      ncomm++; | ||||||
|  |  | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp atomic | ||||||
|  | #endif | ||||||
|  | 	      dbytes+=tbytes; | ||||||
|  | 	    } | ||||||
|  | 	  } | ||||||
|  | 	  Grid.Barrier(); | ||||||
|  | 	  double stop=usecond(); | ||||||
|  | 	  t_time[i] = stop-start; // microseconds | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	timestat.statistics(t_time); | ||||||
|  | 	//	for(int i=0;i<t_time.size();i++){ | ||||||
|  | 	//	  std::cout << i<<" "<<t_time[i]<<std::endl; | ||||||
|  | 	//	} | ||||||
|  |  | ||||||
|  | 	dbytes=dbytes*ppn; | ||||||
|  | 	double xbytes    = dbytes*0.5; | ||||||
|  | 	double rbytes    = dbytes*0.5; | ||||||
|  | 	double bidibytes = dbytes; | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" | ||||||
|  | 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) | ||||||
|  | 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " | ||||||
|  | 		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min   | ||||||
|  | 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " | ||||||
|  | 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; | ||||||
|  |  | ||||||
|  |   | ||||||
|  | 	 | ||||||
|  | 	    } | ||||||
|  |     }     | ||||||
|  |  | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static void Memory(void) | ||||||
|  |   { | ||||||
|  |     const int Nvec=8; | ||||||
|  |     typedef Lattice< iVector< vReal,Nvec> > LatticeVec; | ||||||
|  |     typedef iVector<vReal,Nvec> Vec; | ||||||
|  |  | ||||||
|  |     std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); | ||||||
|  |     std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |    | ||||||
|  |     uint64_t NP; | ||||||
|  |     uint64_t NN; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   uint64_t lmax=48; | ||||||
|  | #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat) | ||||||
|  |  | ||||||
|  |     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |     for(int lat=8;lat<=lmax;lat+=4){ | ||||||
|  |  | ||||||
|  |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|  |       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|  |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |  | ||||||
|  |       NP= Grid.RankCount(); | ||||||
|  |       NN =Grid.NodeCount(); | ||||||
|  |  | ||||||
|  |       Vec rn ; random(sRNG,rn); | ||||||
|  |  | ||||||
|  |       LatticeVec z(&Grid); z=rn; | ||||||
|  |       LatticeVec x(&Grid); x=rn; | ||||||
|  |       LatticeVec y(&Grid); y=rn; | ||||||
|  |       double a=2.0; | ||||||
|  |  | ||||||
|  |       uint64_t Nloop=NLOOP; | ||||||
|  |  | ||||||
|  |       double start=usecond(); | ||||||
|  |       for(int i=0;i<Nloop;i++){ | ||||||
|  | 	z=a*x-y; | ||||||
|  |         x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away | ||||||
|  |         y._odata[4]=z._odata[4]; | ||||||
|  |       } | ||||||
|  |       double stop=usecond(); | ||||||
|  |       double time = (stop-start)/Nloop*1000; | ||||||
|  |       | ||||||
|  |       double flops=vol*Nvec*2;// mul,add | ||||||
|  |       double bytes=3.0*vol*Nvec*sizeof(Real); | ||||||
|  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
|  | 	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000. | ||||||
|  | 	       << "\t\t"<< bytes/time/NN <<std::endl; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   static double DWF5(int Ls,int L) | ||||||
|  |   { | ||||||
|  |     RealD mass=0.1; | ||||||
|  |     RealD M5  =1.8; | ||||||
|  |  | ||||||
|  |     double mflops; | ||||||
|  |     double mflops_best = 0; | ||||||
|  |     double mflops_worst= 0; | ||||||
|  |     std::vector<double> mflops_all; | ||||||
|  |  | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     // Set/Get the layout & grid size | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     int threads = GridThread::GetThreads(); | ||||||
|  |     std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4); | ||||||
|  |     std::vector<int> local({L,L,L,L}); | ||||||
|  |  | ||||||
|  |     GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),  | ||||||
|  | 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |     uint64_t NP = TmpGrid->RankCount(); | ||||||
|  |     uint64_t NN = TmpGrid->NodeCount(); | ||||||
|  |     NN_global=NN; | ||||||
|  |     uint64_t SHM=NP/NN; | ||||||
|  |  | ||||||
|  |     std::vector<int> internal; | ||||||
|  |     if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1}); | ||||||
|  |     else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1}); | ||||||
|  |     else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1}); | ||||||
|  |     else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1}); | ||||||
|  |     else assert(0); | ||||||
|  |  | ||||||
|  |     std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); | ||||||
|  |     std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); | ||||||
|  |  | ||||||
|  |     ///////// Welcome message //////////// | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     ///////// Lattice Init //////////// | ||||||
|  |     GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |     GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |     GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi()); | ||||||
|  |     GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); | ||||||
|  |     GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); | ||||||
|  |     GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |     ///////// RNG Init //////////// | ||||||
|  |     std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |     std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |     GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |     GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||||
|  |  | ||||||
|  |     ///////// Source preparation //////////// | ||||||
|  |     LatticeFermion src   (sFGrid); random(RNG5,src); | ||||||
|  |     LatticeFermion tmp   (sFGrid); | ||||||
|  |  | ||||||
|  |     RealD N2 = 1.0/::sqrt(norm2(src)); | ||||||
|  |     src = src*N2; | ||||||
|  |      | ||||||
|  |     LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);  | ||||||
|  |  | ||||||
|  |     WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); | ||||||
|  |     LatticeFermion src_e (sFrbGrid); | ||||||
|  |     LatticeFermion src_o (sFrbGrid); | ||||||
|  |     LatticeFermion r_e   (sFrbGrid); | ||||||
|  |     LatticeFermion r_o   (sFrbGrid); | ||||||
|  |     LatticeFermion r_eo  (sFGrid); | ||||||
|  |     LatticeFermion err   (sFGrid); | ||||||
|  |     { | ||||||
|  |  | ||||||
|  |       pickCheckerboard(Even,src_e,src); | ||||||
|  |       pickCheckerboard(Odd,src_o,src); | ||||||
|  |  | ||||||
|  | #if defined(AVX512)  | ||||||
|  |       const int num_cases = 6; | ||||||
|  |       std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); | ||||||
|  | #else | ||||||
|  |       const int num_cases = 4; | ||||||
|  |       std::string fmt("U/S ; U/O ; G/S ; G/O "); | ||||||
|  | #endif | ||||||
|  |       controls Cases [] = { | ||||||
|  | #ifdef AVX512 | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | #endif | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  } | ||||||
|  |       };  | ||||||
|  |  | ||||||
|  |       for(int c=0;c<num_cases;c++) { | ||||||
|  |  | ||||||
|  | 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; | ||||||
|  | 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt; | ||||||
|  | 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||||
|  | 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
|  | 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  | 	int nwarm = 100; | ||||||
|  | 	uint64_t ncall = 1000; | ||||||
|  |  | ||||||
|  | 	double t0=usecond(); | ||||||
|  | 	sFGrid->Barrier(); | ||||||
|  | 	for(int i=0;i<nwarm;i++){ | ||||||
|  | 	  sDw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	} | ||||||
|  | 	sFGrid->Barrier(); | ||||||
|  | 	double t1=usecond(); | ||||||
|  |  | ||||||
|  | 	sDw.ZeroCounters(); | ||||||
|  | 	time_statistics timestat; | ||||||
|  | 	std::vector<double> t_time(ncall); | ||||||
|  | 	for(uint64_t i=0;i<ncall;i++){ | ||||||
|  | 	  t0=usecond(); | ||||||
|  | 	  sDw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	  t1=usecond(); | ||||||
|  | 	  t_time[i] = t1-t0; | ||||||
|  | 	} | ||||||
|  | 	sFGrid->Barrier(); | ||||||
|  | 	 | ||||||
|  | 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|  | 	double flops=(1344.0*volume)/2; | ||||||
|  | 	double mf_hi, mf_lo, mf_err; | ||||||
|  |  | ||||||
|  | 	timestat.statistics(t_time); | ||||||
|  | 	mf_hi = flops/timestat.min; | ||||||
|  | 	mf_lo = flops/timestat.max; | ||||||
|  | 	mf_err= flops/timestat.min * timestat.err/timestat.mean; | ||||||
|  |  | ||||||
|  | 	mflops = flops/timestat.mean; | ||||||
|  | 	mflops_all.push_back(mflops); | ||||||
|  | 	if ( mflops_best == 0   ) mflops_best = mflops; | ||||||
|  | 	if ( mflops_worst== 0   ) mflops_worst= mflops; | ||||||
|  | 	if ( mflops>mflops_best ) mflops_best = mflops; | ||||||
|  | 	if ( mflops<mflops_worst) mflops_worst= mflops; | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl; | ||||||
|  |  | ||||||
|  | 	sDw.Report(); | ||||||
|  |  | ||||||
|  |       } | ||||||
|  |       double robust = mflops_worst/mflops_best;; | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; | ||||||
|  |  | ||||||
|  |       std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage <<fmt << std::endl; | ||||||
|  |       std::cout<<GridLogMessage; | ||||||
|  |  | ||||||
|  |       for(int i=0;i<mflops_all.size();i++){ | ||||||
|  | 	std::cout<<mflops_all[i]/NN<<" ; " ; | ||||||
|  |       } | ||||||
|  |       std::cout<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     return mflops_best; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static double DWF(int Ls,int L, double & robust) | ||||||
|  |   { | ||||||
|  |     RealD mass=0.1; | ||||||
|  |     RealD M5  =1.8; | ||||||
|  |  | ||||||
|  |     double mflops; | ||||||
|  |     double mflops_best = 0; | ||||||
|  |     double mflops_worst= 0; | ||||||
|  |     std::vector<double> mflops_all; | ||||||
|  |  | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     // Set/Get the layout & grid size | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     int threads = GridThread::GetThreads(); | ||||||
|  |     std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4); | ||||||
|  |     std::vector<int> local({L,L,L,L}); | ||||||
|  |  | ||||||
|  |     GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),  | ||||||
|  | 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |     uint64_t NP = TmpGrid->RankCount(); | ||||||
|  |     uint64_t NN = TmpGrid->NodeCount(); | ||||||
|  |     NN_global=NN; | ||||||
|  |     uint64_t SHM=NP/NN; | ||||||
|  |  | ||||||
|  |     std::vector<int> internal; | ||||||
|  |     if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1}); | ||||||
|  |     else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1}); | ||||||
|  |     else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1}); | ||||||
|  |     else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1}); | ||||||
|  |     else assert(0); | ||||||
|  |  | ||||||
|  |     std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); | ||||||
|  |     std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); | ||||||
|  |  | ||||||
|  |     ///////// Welcome message //////////// | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     ///////// Lattice Init //////////// | ||||||
|  |     GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |     GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |     GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |      | ||||||
|  |     ///////// RNG Init //////////// | ||||||
|  |     std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |     std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |     GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |     GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||||
|  |  | ||||||
|  |     ///////// Source preparation //////////// | ||||||
|  |     LatticeFermion src   (FGrid); random(RNG5,src); | ||||||
|  |     LatticeFermion ref   (FGrid); | ||||||
|  |     LatticeFermion tmp   (FGrid); | ||||||
|  |  | ||||||
|  |     RealD N2 = 1.0/::sqrt(norm2(src)); | ||||||
|  |     src = src*N2; | ||||||
|  |      | ||||||
|  |     LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);  | ||||||
|  |  | ||||||
|  |     DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||||
|  |  | ||||||
|  |     //////////////////////////////////// | ||||||
|  |     // Naive wilson implementation | ||||||
|  |     //////////////////////////////////// | ||||||
|  |     { | ||||||
|  |       LatticeGaugeField Umu5d(FGrid);  | ||||||
|  |       std::vector<LatticeColourMatrix> U(4,FGrid); | ||||||
|  |       for(int ss=0;ss<Umu._grid->oSites();ss++){ | ||||||
|  | 	for(int s=0;s<Ls;s++){ | ||||||
|  | 	  Umu5d._odata[Ls*ss+s] = Umu._odata[ss]; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       ref = zero; | ||||||
|  |       for(int mu=0;mu<Nd;mu++){ | ||||||
|  | 	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu); | ||||||
|  |       } | ||||||
|  |       for(int mu=0;mu<Nd;mu++){ | ||||||
|  | 	 | ||||||
|  | 	tmp = U[mu]*Cshift(src,mu+1,1); | ||||||
|  | 	ref=ref + tmp - Gamma(Gmu[mu])*tmp; | ||||||
|  | 	 | ||||||
|  | 	tmp =adj(U[mu])*src; | ||||||
|  | 	tmp =Cshift(tmp,mu+1,-1); | ||||||
|  | 	ref=ref + tmp + Gamma(Gmu[mu])*tmp; | ||||||
|  |       } | ||||||
|  |       ref = -0.5*ref; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     LatticeFermion src_e (FrbGrid); | ||||||
|  |     LatticeFermion src_o (FrbGrid); | ||||||
|  |     LatticeFermion r_e   (FrbGrid); | ||||||
|  |     LatticeFermion r_o   (FrbGrid); | ||||||
|  |     LatticeFermion r_eo  (FGrid); | ||||||
|  |     LatticeFermion err   (FGrid); | ||||||
|  |     { | ||||||
|  |  | ||||||
|  |       pickCheckerboard(Even,src_e,src); | ||||||
|  |       pickCheckerboard(Odd,src_o,src); | ||||||
|  |  | ||||||
|  | #if defined(AVX512)  | ||||||
|  |       const int num_cases = 6; | ||||||
|  |       std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); | ||||||
|  | #else | ||||||
|  |       const int num_cases = 4; | ||||||
|  |       std::string fmt("U/S ; U/O ; G/S ; G/O "); | ||||||
|  | #endif | ||||||
|  |       controls Cases [] = { | ||||||
|  | #ifdef AVX512 | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | #endif | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  } | ||||||
|  |       };  | ||||||
|  |  | ||||||
|  |       for(int c=0;c<num_cases;c++) { | ||||||
|  |  | ||||||
|  | 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; | ||||||
|  | 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt; | ||||||
|  | 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||||
|  | 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
|  | 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  | 	int nwarm = 200; | ||||||
|  | 	double t0=usecond(); | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  | 	for(int i=0;i<nwarm;i++){ | ||||||
|  | 	  Dw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	} | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  | 	double t1=usecond(); | ||||||
|  | 	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); | ||||||
|  | 	//	if (ncall < 500) ncall = 500; | ||||||
|  | 	uint64_t ncall = 1000; | ||||||
|  |  | ||||||
|  | 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); | ||||||
|  |  | ||||||
|  | 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; | ||||||
|  | 	Dw.ZeroCounters(); | ||||||
|  |  | ||||||
|  | 	time_statistics timestat; | ||||||
|  | 	std::vector<double> t_time(ncall); | ||||||
|  | 	for(uint64_t i=0;i<ncall;i++){ | ||||||
|  | 	  t0=usecond(); | ||||||
|  | 	  Dw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	  t1=usecond(); | ||||||
|  | 	  t_time[i] = t1-t0; | ||||||
|  | 	} | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  | 	 | ||||||
|  | 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|  | 	double flops=(1344.0*volume)/2; | ||||||
|  | 	double mf_hi, mf_lo, mf_err; | ||||||
|  |  | ||||||
|  | 	timestat.statistics(t_time); | ||||||
|  | 	mf_hi = flops/timestat.min; | ||||||
|  | 	mf_lo = flops/timestat.max; | ||||||
|  | 	mf_err= flops/timestat.min * timestat.err/timestat.mean; | ||||||
|  |  | ||||||
|  | 	mflops = flops/timestat.mean; | ||||||
|  | 	mflops_all.push_back(mflops); | ||||||
|  | 	if ( mflops_best == 0   ) mflops_best = mflops; | ||||||
|  | 	if ( mflops_worst== 0   ) mflops_worst= mflops; | ||||||
|  | 	if ( mflops>mflops_best ) mflops_best = mflops; | ||||||
|  | 	if ( mflops<mflops_worst) mflops_worst= mflops; | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl; | ||||||
|  |  | ||||||
|  | 	Dw.Report(); | ||||||
|  |  | ||||||
|  | 	Dw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	Dw.DhopOE(src_e,r_o,DaggerNo); | ||||||
|  | 	setCheckerboard(r_eo,r_o); | ||||||
|  | 	setCheckerboard(r_eo,r_e); | ||||||
|  | 	err = r_eo-ref;  | ||||||
|  | 	std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||||
|  | 	assert((norm2(err)<1.0e-4)); | ||||||
|  |  | ||||||
|  |       } | ||||||
|  |       robust = mflops_worst/mflops_best; | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage <<fmt << std::endl; | ||||||
|  |       std::cout<<GridLogMessage ; | ||||||
|  |  | ||||||
|  |       for(int i=0;i<mflops_all.size();i++){ | ||||||
|  | 	std::cout<<mflops_all[i]/NN<<" ; " ; | ||||||
|  |       } | ||||||
|  |       std::cout<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     return mflops_best; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); | ||||||
|  | #ifdef KNL | ||||||
|  |   LebesgueOrder::Block = std::vector<int>({8,2,2,2}); | ||||||
|  | #else | ||||||
|  |   LebesgueOrder::Block = std::vector<int>({2,2,2,2}); | ||||||
|  | #endif | ||||||
|  |   Benchmark::Decomposition(); | ||||||
|  |  | ||||||
|  |   int do_memory=1; | ||||||
|  |   int do_comms =1; | ||||||
|  |   int do_su3   =0; | ||||||
|  |   int do_wilson=1; | ||||||
|  |   int do_dwf   =1; | ||||||
|  |  | ||||||
|  |   if ( do_su3 ) { | ||||||
|  |     // empty for now | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   int sel=2; | ||||||
|  |   std::vector<int> L_list({8,12,16,24}); | ||||||
|  |  | ||||||
|  |   //int sel=1; | ||||||
|  |   //  std::vector<int> L_list({8,12}); | ||||||
|  |   std::vector<double> robust_list; | ||||||
|  |  | ||||||
|  |   std::vector<double> wilson; | ||||||
|  |   std::vector<double> dwf4; | ||||||
|  |   std::vector<double> dwf5; | ||||||
|  |  | ||||||
|  |   if ( do_wilson ) { | ||||||
|  |     int Ls=1; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       double robust; | ||||||
|  |       wilson.push_back(Benchmark::DWF(1,L_list[l],robust)); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   int Ls=16; | ||||||
|  |   if ( do_dwf ) { | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       double robust; | ||||||
|  |       double result = Benchmark::DWF(Ls,L_list[l],robust) ; | ||||||
|  |       dwf4.push_back(result); | ||||||
|  |       robust_list.push_back(robust); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( do_dwf ) { | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       dwf5.push_back(Benchmark::DWF5(Ls,L_list[l])); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( do_dwf ) { | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl; | ||||||
|  |   for(int l=0;l<L_list.size();l++){ | ||||||
|  |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl; | ||||||
|  |   } | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   int NN=NN_global; | ||||||
|  |   if ( do_memory ) { | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     Benchmark::Memory(); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( do_comms && (NN>1) ) { | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Communications benchmark " <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     Benchmark::Comms(); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( do_dwf ) { | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl; | ||||||
|  |   for(int l=0;l<L_list.size();l++){ | ||||||
|  |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl; | ||||||
|  |   } | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Comparison point     result: "  << dwf4[sel]/NN << " Mflop/s per node"<<std::endl; | ||||||
|  |   std::cout<<std::setprecision(3); | ||||||
|  |   std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  | } | ||||||
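The new Benchmark_ITT.cc above sweeps the Wilson and domain wall dslash, memory-bandwidth and (on more than one node) comms benchmarks over the built-in local volumes L_list = {8,12,16,24}, taking the MPI decomposition and thread count from the command line. A hypothetical single-node invocation, assuming the standard Grid --mpi and --threads flags and the benchmarks/ build location:

    # Two ranks sharing a node, 8 OpenMP threads each (illustrative values)
    mpirun -np 2 ./benchmarks/Benchmark_ITT --mpi 2.1.1.1 --threads 8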
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
 
   int Nloop=100;
   int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 
   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;
 
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }
 
       for(int i=0;i<Nloop;i++){
       double start=usecond();
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
-	     
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				   (void *)&xbuf[mu][0],
@@ -163,7 +167,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});
 
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;
 
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }
 
       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -249,7 +258,7 @@ int main (int argc, char ** argv)
   header();
 
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
 	
 	    comm_proc = mpi_layout[mu]-1;
 	  
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
 	  
 	  }
 	}
-	Grid.StencilSendToRecvFromComplete(requests);
+	Grid.StencilSendToRecvFromComplete(requests,0);
 	Grid.Barrier();
| 	double stop=usecond(); | 	double stop=usecond(); | ||||||
| 	t_time[i] = stop-start; // microseconds | 	t_time[i] = stop-start; // microseconds | ||||||
| @@ -346,7 +355,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -393,8 +402,8 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu][0], | 					      (void *)&rbuf[mu][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu); | ||||||
| 	    Grid.StencilSendToRecvFromComplete(requests); | 	    Grid.StencilSendToRecvFromComplete(requests,mu); | ||||||
| 	    requests.resize(0); | 	    requests.resize(0); | ||||||
|  |  | ||||||
| 	    comm_proc = mpi_layout[mu]-1; | 	    comm_proc = mpi_layout[mu]-1; | ||||||
| @@ -406,8 +415,8 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu+4][0], | 					      (void *)&rbuf[mu+4][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu+4); | ||||||
| 	    Grid.StencilSendToRecvFromComplete(requests); | 	    Grid.StencilSendToRecvFromComplete(requests,mu+4); | ||||||
| 	    requests.resize(0); | 	    requests.resize(0); | ||||||
| 	   | 	   | ||||||
| 	  } | 	  } | ||||||
| @@ -436,5 +445,97 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|   }     |   }     | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   header(); | ||||||
|  |  | ||||||
|  |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|  |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|  |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|  |       				    lat*mpi_layout[1], | ||||||
|  |       				    lat*mpi_layout[2], | ||||||
|  |       				    lat*mpi_layout[3]}); | ||||||
|  |  | ||||||
|  |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |       RealD Nrank = Grid._Nprocessors; | ||||||
|  |       RealD Nnode = Grid.NodeCount(); | ||||||
|  |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|  |       std::vector<HalfSpinColourVectorD *> xbuf(8); | ||||||
|  |       std::vector<HalfSpinColourVectorD *> rbuf(8); | ||||||
|  |       Grid.ShmBufferFreeAll(); | ||||||
|  |       for(int d=0;d<8;d++){ | ||||||
|  | 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       int ncomm; | ||||||
|  |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  |       double dbytes; | ||||||
|  |       for(int i=0;i<Nloop;i++){ | ||||||
|  | 	double start=usecond(); | ||||||
|  |  | ||||||
|  | 	std::vector<CartesianCommunicator::CommsRequest_t> requests; | ||||||
|  | 	dbytes=0; | ||||||
|  | 	ncomm=0; | ||||||
|  |  | ||||||
|  | 	parallel_for(int dir=0;dir<8;dir++){ | ||||||
|  |  | ||||||
|  | 	  double tbytes; | ||||||
|  | 	  int mu =dir % 4; | ||||||
|  |  | ||||||
|  | 	  if (mpi_layout[mu]>1 ) { | ||||||
|  | 	   | ||||||
|  | 	    ncomm++; | ||||||
|  | 	    int xmit_to_rank; | ||||||
|  | 	    int recv_from_rank; | ||||||
|  | 	    if ( dir == mu ) {  | ||||||
|  | 	      int comm_proc=1; | ||||||
|  | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	    } else {  | ||||||
|  | 	      int comm_proc = mpi_layout[mu]-1; | ||||||
|  | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	    } | ||||||
|  |  | ||||||
|  | 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
|  | 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir); | ||||||
|  |  | ||||||
|  | #pragma omp atomic | ||||||
|  | 	    dbytes+=tbytes; | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  | 	Grid.Barrier(); | ||||||
|  | 	double stop=usecond(); | ||||||
|  | 	t_time[i] = stop-start; // microseconds | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       timestat.statistics(t_time); | ||||||
|  |  | ||||||
|  |       dbytes=dbytes*ppn; | ||||||
|  |       double xbytes    = dbytes*0.5; | ||||||
|  |       double rbytes    = dbytes*0.5; | ||||||
|  |       double bidibytes = dbytes; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" | ||||||
|  |                <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) | ||||||
|  |                <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " | ||||||
|  |                <<xbytes/timestat.max <<" "<< xbytes/timestat.min   | ||||||
|  |                << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " | ||||||
|  |                << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; | ||||||
|  |   | ||||||
|  |     } | ||||||
|  |   }     | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
| } | } | ||||||
|   | |||||||
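The threaded halo-exchange benchmark added above issues one StencilSendToRecvFrom per direction from inside a parallel_for and sums the per-thread byte counts into a shared total, which is why the #pragma omp atomic is needed. A minimal, self-contained sketch of that accumulation pattern in plain OpenMP (no Grid types; the per-direction payload is a made-up stand-in, not the real transfer size):

#include <cstdio>

int main() {
  const int ndir = 8;                  // 2*Nd communication directions
  const double bytes_per_dir = 1.5e6;  // stand-in for what StencilSendToRecvFrom returns
  double dbytes = 0.0;                 // shared accumulator, as in the benchmark

#pragma omp parallel for
  for (int dir = 0; dir < ndir; dir++) {
    double tbytes = bytes_per_dir;     // per-thread contribution
#pragma omp atomic
    dbytes += tbytes;                  // race-free update from every thread
  }
  printf("moved %.0f bytes across %d directions\n", dbytes, ndir);
  return 0;
}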
| @@ -503,9 +503,9 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; |   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; |   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||||
|  |  | ||||||
|   //assert(norm2(src_e)<1.0e-4); |   assert(norm2(src_e)<1.0e-4); | ||||||
|   //assert(norm2(src_o)<1.0e-4); |   assert(norm2(src_o)<1.0e-4); | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|  |   exit(0); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
54	configure.ac
							| @@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) | |||||||
| ################ Get git info | ################ Get git info | ||||||
| #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])]) | #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])]) | ||||||
|  |  | ||||||
|  | ################ Set flags | ||||||
|  | # do not move! | ||||||
|  | CXXFLAGS="-O3 $CXXFLAGS" | ||||||
|  |  | ||||||
| ############### Checks for programs | ############### Checks for programs | ||||||
| AC_PROG_CXX | AC_PROG_CXX | ||||||
| AC_PROG_RANLIB | AC_PROG_RANLIB | ||||||
| @@ -27,7 +31,6 @@ AX_GXX_VERSION | |||||||
| AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], | AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], | ||||||
|       [version of g++ that will compile the code]) |       [version of g++ that will compile the code]) | ||||||
|  |  | ||||||
| CXXFLAGS="-O3 $CXXFLAGS" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ############### Checks for typedefs, structures, and compiler characteristics | ############### Checks for typedefs, structures, and compiler characteristics | ||||||
| @@ -55,6 +58,10 @@ AC_CHECK_HEADERS(numaif.h) | |||||||
| AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) | AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) | ||||||
| AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) | AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) | ||||||
|  |  | ||||||
|  | ############## Standard libraries | ||||||
|  | AC_CHECK_LIB([m],[cos]) | ||||||
|  | AC_CHECK_LIB([stdc++],[abort]) | ||||||
|  |  | ||||||
| ############### GMP and MPFR | ############### GMP and MPFR | ||||||
| AC_ARG_WITH([gmp], | AC_ARG_WITH([gmp], | ||||||
|     [AS_HELP_STRING([--with-gmp=prefix], |     [AS_HELP_STRING([--with-gmp=prefix], | ||||||
| @@ -324,8 +331,41 @@ case ${ac_PRECISION} in | |||||||
|      double) |      double) | ||||||
|        AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) |        AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) | ||||||
|      ;; |      ;; | ||||||
|  |      *) | ||||||
|  |      AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); | ||||||
|  |      ;; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|  | ######################  Shared memory allocation technique under MPI3 | ||||||
|  | AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs], | ||||||
|  |               [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) | ||||||
|  |  | ||||||
|  | case ${ac_SHM} in | ||||||
|  |  | ||||||
|  |      shmget) | ||||||
|  |      AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] ) | ||||||
|  |      ;; | ||||||
|  |  | ||||||
|  |      shmopen) | ||||||
|  |      AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] ) | ||||||
|  |      ;; | ||||||
|  |  | ||||||
|  |      hugetlbfs) | ||||||
|  |      AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] ) | ||||||
|  |      ;; | ||||||
|  |  | ||||||
|  |      *) | ||||||
|  |      AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]); | ||||||
|  |      ;; | ||||||
|  | esac | ||||||
|  |  | ||||||
|  | ######################  Shared base path for SHMMMAP | ||||||
|  | AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path], | ||||||
|  |               [Select SHM mmap base path for hugetlbfs])], | ||||||
|  | 	      [ac_SHMPATH=${enable_shmpath}], | ||||||
|  | 	      [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/]) | ||||||
|  | AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) | ||||||
|  |  | ||||||
| ############### communication type selection | ############### communication type selection | ||||||
| AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem], | AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem], | ||||||
|               [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) |               [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) | ||||||
| @@ -335,14 +375,14 @@ case ${ac_COMMS} in | |||||||
|         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) |         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) | ||||||
|         comms_type='none' |         comms_type='none' | ||||||
|      ;; |      ;; | ||||||
|      mpi3l*) |  | ||||||
|        AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] ) |  | ||||||
|        comms_type='mpi3l' |  | ||||||
|      ;; |  | ||||||
|      mpi3*) |      mpi3*) | ||||||
|         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) |         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) | ||||||
|         comms_type='mpi3' |         comms_type='mpi3' | ||||||
|      ;; |      ;; | ||||||
|  |      mpit) | ||||||
|  |         AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] ) | ||||||
|  |         comms_type='mpit' | ||||||
|  |      ;; | ||||||
|      mpi*) |      mpi*) | ||||||
|         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) |         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) | ||||||
|         comms_type='mpi' |         comms_type='mpi' | ||||||
| @@ -370,7 +410,7 @@ esac | |||||||
| AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ]) | AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ]) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ]) | AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ]) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] ) | AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] ) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] ) | AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] ) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | ||||||
|  |  | ||||||
| ############### RNG selection | ############### RNG selection | ||||||
| @@ -475,6 +515,8 @@ compiler version            : ${ax_cv_gxx_version} | |||||||
| SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG} | SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG} | ||||||
| Threading                   : ${ac_openmp} | Threading                   : ${ac_openmp} | ||||||
| Communications type         : ${comms_type} | Communications type         : ${comms_type} | ||||||
|  | Shared memory allocator     : ${ac_SHM} | ||||||
|  | Shared memory mmap path     : ${ac_SHMPATH} | ||||||
| Default precision           : ${ac_PRECISION} | Default precision           : ${ac_PRECISION} | ||||||
| Software FP16 conversion    : ${ac_SFW_FP16} | Software FP16 conversion    : ${ac_SFW_FP16} | ||||||
| RNG choice                  : ${ac_RNG} | RNG choice                  : ${ac_RNG} | ||||||
|   | |||||||
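The configure.ac changes above add three user-facing options: --enable-shm=shmget|shmopen|hugetlbfs selects how the MPI3 shared-memory segment is allocated, --enable-shmpath=<path> points the hugetlbfs variant at a mounted hugetlbfs filesystem (defaulting to /var/lib/hugetlbfs/pagesize-2MB/), and --enable-comms=mpit selects the new multi-communicator MPI transport that replaces mpi3l. A sketch of an invocation exercising them follows; the compiler and paths are illustrative, not taken from this commit:

    ./configure --enable-comms=mpit --enable-shm=hugetlbfs \
                --enable-shmpath=/var/lib/hugetlbfs/pagesize-2MB/ \
                --enable-precision=double CXX=mpicxx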
| @@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3 | |||||||
|   extra_sources+=communicator/Communicator_base.cc |   extra_sources+=communicator/Communicator_base.cc | ||||||
| endif | endif | ||||||
|  |  | ||||||
| if BUILD_COMMS_MPI3L | if BUILD_COMMS_MPIT | ||||||
|   extra_sources+=communicator/Communicator_mpi3_leader.cc |   extra_sources+=communicator/Communicator_mpit.cc | ||||||
|   extra_sources+=communicator/Communicator_base.cc |   extra_sources+=communicator/Communicator_base.cc | ||||||
| endif | endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr, | |||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   sliceInnerProductMatrix(m_rr,R,R,Orthog); |   sliceInnerProductMatrix(m_rr,R,R,Orthog); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// |   // Force manifest hermitian to avoid rounding related | ||||||
|   // Cholesky from Eigen |   m_rr = 0.5*(m_rr+m_rr.adjoint()); | ||||||
|   // There exists a ldlt that is documented as more stable |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   Eigen::MatrixXcd L    = m_rr.llt().matrixL();  |  | ||||||
|  |  | ||||||
|  | #if 0 | ||||||
|  |   std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl; | ||||||
|  |   Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();  | ||||||
|  |   std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl; | ||||||
|  |   auto  D_ldlt = m_rr.ldlt().vectorD();  | ||||||
|  |   std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl; | ||||||
|  |   Eigen::MatrixXcd L    = m_rr.llt().matrixL();  | ||||||
|  |   //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl; | ||||||
|   C    = L.adjoint(); |   C    = L.adjoint(); | ||||||
|   Cinv = C.inverse(); |   Cinv = C.inverse(); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Q = R C^{-1} |   // Q = R C^{-1} | ||||||
|   // |   // | ||||||
| @@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr, | |||||||
|   // |   // | ||||||
|   // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already |   // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // FIXME:: make a sliceMulMatrix to avoid zero vector |  | ||||||
|   sliceMulMatrix(Q,Cinv,R,Orthog); |   sliceMulMatrix(Q,Cinv,R,Orthog); | ||||||
| } | } | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) | |||||||
|  |  | ||||||
|   Linop.HermOp(X, AD); |   Linop.HermOp(X, AD); | ||||||
|   tmp = B - AD;   |   tmp = B - AD;   | ||||||
|  |   //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl; | ||||||
|   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); |   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); | ||||||
|  |   //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl; | ||||||
|  |   //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl; | ||||||
|  |   //std::cout << GridLogMessage << " m_C " << m_C<<std::endl; | ||||||
|  |   //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl; | ||||||
|   D=Q; |   D=Q; | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl; |   std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl; | ||||||
| @@ -221,12 +232,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) | |||||||
|     MatrixTimer.Start(); |     MatrixTimer.Start(); | ||||||
|     Linop.HermOp(D, Z);       |     Linop.HermOp(D, Z);       | ||||||
|     MatrixTimer.Stop(); |     MatrixTimer.Stop(); | ||||||
|  |     //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl; | ||||||
|  |  | ||||||
|     //4. M  = [D^dag Z]^{-1} |     //4. M  = [D^dag Z]^{-1} | ||||||
|     sliceInnerTimer.Start(); |     sliceInnerTimer.Start(); | ||||||
|     sliceInnerProductMatrix(m_DZ,D,Z,Orthog); |     sliceInnerProductMatrix(m_DZ,D,Z,Orthog); | ||||||
|     sliceInnerTimer.Stop(); |     sliceInnerTimer.Stop(); | ||||||
|     m_M       = m_DZ.inverse(); |     m_M       = m_DZ.inverse(); | ||||||
|  |     //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl; | ||||||
|      |      | ||||||
|     //5. X  = X + D MC |     //5. X  = X + D MC | ||||||
|     m_tmp     = m_M * m_C; |     m_tmp     = m_M * m_C; | ||||||
|   | |||||||
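The ThinQRfact change above symmetrises the slice inner-product matrix before the Cholesky factorisation, so rounding noise cannot push m_rr off the hermitian manifold and make llt() fail. A standalone Eigen sketch of the same sequence, using arbitrary test data in place of the real inner-product matrix:

#include <Eigen/Dense>
#include <iostream>

int main() {
  const int Nblock = 4;
  // A nearly-hermitian positive-definite matrix standing in for m_rr.
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(Nblock, Nblock);
  Eigen::MatrixXcd m_rr = A * A.adjoint()
                        + Eigen::MatrixXcd::Identity(Nblock, Nblock);

  // Force manifest hermiticity, exactly as the patch does, then Cholesky.
  m_rr = 0.5 * (m_rr + m_rr.adjoint());
  Eigen::MatrixXcd L    = m_rr.llt().matrixL();
  Eigen::MatrixXcd C    = L.adjoint();
  Eigen::MatrixXcd Cinv = C.inverse();

  // Sanity check: C^dag C should reproduce m_rr up to rounding.
  std::cout << (C.adjoint() * C - m_rr).norm() << std::endl;
  return 0;
}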
| @@ -1,7 +1,5 @@ | |||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
|  | #include <fcntl.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -11,7 +9,7 @@ int PointerCache::victim; | |||||||
|  |  | ||||||
| void *PointerCache::Insert(void *ptr,size_t bytes) { | void *PointerCache::Insert(void *ptr,size_t bytes) { | ||||||
|  |  | ||||||
|   if (bytes < 4096 ) return NULL; |   if (bytes < 4096 ) return ptr; | ||||||
|  |  | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   assert(omp_in_parallel()==0); |   assert(omp_in_parallel()==0); | ||||||
| @@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) { | |||||||
|   return NULL; |   return NULL; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | void check_huge_pages(void *Buf,uint64_t BYTES) | ||||||
|  | { | ||||||
|  | #ifdef __linux__ | ||||||
|  |   int fd = open("/proc/self/pagemap", O_RDONLY); | ||||||
|  |   assert(fd >= 0); | ||||||
|  |   const int page_size = 4096; | ||||||
|  |   uint64_t virt_pfn = (uint64_t)Buf / page_size; | ||||||
|  |   off_t offset = sizeof(uint64_t) * virt_pfn; | ||||||
|  |   uint64_t npages = (BYTES + page_size-1) / page_size; | ||||||
|  |   uint64_t pagedata[npages]; | ||||||
|  |   uint64_t ret = lseek(fd, offset, SEEK_SET); | ||||||
|  |   assert(ret == offset); | ||||||
|  |   ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); | ||||||
|  |   assert(ret == sizeof(uint64_t) * npages); | ||||||
|  |   int nhugepages = npages / 512; | ||||||
|  |   int n4ktotal, nnothuge; | ||||||
|  |   n4ktotal = 0; | ||||||
|  |   nnothuge = 0; | ||||||
|  |   for (int i = 0; i < nhugepages; ++i) { | ||||||
|  |     uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; | ||||||
|  |     for (int j = 0; j < 512; ++j) { | ||||||
|  |       uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; | ||||||
|  |       ++n4ktotal; | ||||||
|  |       if (pageaddr != baseaddr + j * page_size) | ||||||
|  | 	++nnothuge; | ||||||
|  |       } | ||||||
|  |   } | ||||||
|  |   int rank = CartesianCommunicator::RankWorld(); | ||||||
|  |   printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
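The new check_huge_pages helper walks /proc/self/pagemap and reports, per rank, how many 4k pages of a buffer are not backed by 2 MB huge pages. A hedged usage sketch; the 32 MB buffer size is an arbitrary choice, and Grid is initialised first so the world rank used in the report is defined:

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  // Allocate through Grid's aligned allocator, then ask the kernel whether
  // the pages ended up huge (Linux only; the check is a no-op elsewhere).
  Vector<double> buf(32 * 1024 * 1024 / sizeof(double));   // 32 MB buffer
  check_huge_pages((void *)&buf[0], buf.size() * sizeof(double));
  Grid_finalize();
  return 0;
}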
| @@ -64,6 +64,8 @@ namespace Grid { | |||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   void check_huge_pages(void *Buf,uint64_t BYTES); | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////// | ||||||
| // A lattice of something, but assume the something is SIMDized. | // A lattice of something, but assume the something is SIMDized. | ||||||
| //////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////// | ||||||
| @@ -92,12 +94,20 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|  |  | ||||||
|     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); |     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); | ||||||
|  |     //    if ( ptr != NULL )  | ||||||
|  |     //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl; | ||||||
|  |  | ||||||
|  |     ////////////////// | ||||||
|  |     // Hack: 2 MB alignment; could be made a configure option, but probably doesn't need to be
|  |     ////////////////// | ||||||
|  | //define GRID_ALLOC_ALIGN (128) | ||||||
|  | #define GRID_ALLOC_ALIGN (2*1024*1024) | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); |     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); | ||||||
| #else | #else | ||||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); |     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); | ||||||
| #endif | #endif | ||||||
|  |     //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl; | ||||||
|     // First touch optimise in threaded loop |     // First touch optimise in threaded loop | ||||||
|     uint8_t *cp = (uint8_t *)ptr; |     uint8_t *cp = (uint8_t *)ptr; | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
| @@ -111,6 +121,7 @@ public: | |||||||
|  |  | ||||||
|   void deallocate(pointer __p, size_type __n) {  |   void deallocate(pointer __p, size_type __n) {  | ||||||
|     size_type bytes = __n * sizeof(_Tp); |     size_type bytes = __n * sizeof(_Tp); | ||||||
|  |  | ||||||
|     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); |     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); | ||||||
|  |  | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
| @@ -189,17 +200,19 @@ public: | |||||||
|   pointer allocate(size_type __n, const void* _p= 0)  |   pointer allocate(size_type __n, const void* _p= 0)  | ||||||
|   { |   { | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); |     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN); | ||||||
| #else | #else | ||||||
|     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); |     _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp)); | ||||||
| #endif | #endif | ||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     uint8_t *cp = (uint8_t *)ptr; |     uint8_t *cp = (uint8_t *)ptr; | ||||||
|  |     if ( ptr ) {  | ||||||
|     // One touch per 4k page, static OMP loop to catch same loop order |     // One touch per 4k page, static OMP loop to catch same loop order | ||||||
| #pragma omp parallel for schedule(static) | #pragma omp parallel for schedule(static) | ||||||
|       for(size_type n=0;n<bytes;n+=4096){ |       for(size_type n=0;n<bytes;n+=4096){ | ||||||
| 	cp[n]=0; | 	cp[n]=0; | ||||||
|       } |       } | ||||||
|  |     } | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
|   void deallocate(pointer __p, size_type) {  |   void deallocate(pointer __p, size_type) {  | ||||||
|   | |||||||
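Raising the allocator alignment from 128 bytes to GRID_ALLOC_ALIGN = 2 MB means every allocation starts on a huge-page boundary, which is what lets the mmap/madvise machinery elsewhere in this commit actually back Grid buffers with huge pages. A small standalone check of that invariant, using the same _mm_malloc call the allocator makes when HAVE_MM_MALLOC_H is defined (the 1 MB request size is arbitrary):

#include <mm_malloc.h>
#include <cassert>
#include <cstdint>

int main() {
  const size_t align = 2 * 1024 * 1024;             // GRID_ALLOC_ALIGN
  void *p = _mm_malloc(1 << 20, align);             // 1 MB request, 2 MB aligned
  assert(p != NULL);
  assert(((std::uintptr_t)p & (align - 1)) == 0);   // starts on a 2 MB boundary
  _mm_free(p);
  return 0;
}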
| @@ -185,17 +185,18 @@ public: | |||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|     void show_decomposition(){ |     void show_decomposition(){ | ||||||
|       std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl; |       std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl; |       std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl; | ||||||
|       std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl; |       std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl; |       std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl; |       std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl; |       std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl; | ||||||
|       std::cout << GridLogMessage << "iSites             : " << _isites << std::endl; |       std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl; | ||||||
|       std::cout << GridLogMessage << "oSites             : " << _osites << std::endl; |       std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl; | ||||||
|       std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;         |       std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl; | ||||||
|       std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl; |       std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;         | ||||||
|       std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;              |       std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;              | ||||||
|     }  |     }  | ||||||
|  |  | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -63,8 +63,7 @@ public: | |||||||
|     } |     } | ||||||
|     GridCartesian(const std::vector<int> &dimensions, |     GridCartesian(const std::vector<int> &dimensions, | ||||||
|                   const std::vector<int> &simd_layout, |                   const std::vector<int> &simd_layout, | ||||||
| 		  const std::vector<int> &processor_grid |                   const std::vector<int> &processor_grid) : GridBase(processor_grid) | ||||||
| 		  ) : GridBase(processor_grid) |  | ||||||
|     { |     { | ||||||
|       /////////////////////// |       /////////////////////// | ||||||
|       // Grid information |       // Grid information | ||||||
| @@ -84,28 +83,34 @@ public: | |||||||
|  |  | ||||||
|       _fsites = _gsites = _osites = _isites = 1; |       _fsites = _gsites = _osites = _isites = 1; | ||||||
|  |  | ||||||
|         for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |       { | ||||||
|         _fdimensions[d] = dimensions[d];   // Global dimensions |         _fdimensions[d] = dimensions[d];   // Global dimensions | ||||||
|         _gdimensions[d] = _fdimensions[d]; // Global dimensions |         _gdimensions[d] = _fdimensions[d]; // Global dimensions | ||||||
|         _simd_layout[d] = simd_layout[d]; |         _simd_layout[d] = simd_layout[d]; | ||||||
|         _fsites = _fsites * _fdimensions[d]; |         _fsites = _fsites * _fdimensions[d]; | ||||||
|         _gsites = _gsites * _gdimensions[d]; |         _gsites = _gsites * _gdimensions[d]; | ||||||
|  |  | ||||||
| 	  //FIXME check for exact division |  | ||||||
|  |  | ||||||
|         // Use a reduced simd grid |         // Use a reduced simd grid | ||||||
|         _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions |         _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions | ||||||
|  |         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); | ||||||
|  |  | ||||||
|         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition |         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition | ||||||
|  |         assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); | ||||||
|  |  | ||||||
|         _lstart[d] = _processor_coor[d] * _ldimensions[d]; |         _lstart[d] = _processor_coor[d] * _ldimensions[d]; | ||||||
|         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; |         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; | ||||||
|         _osites *= _rdimensions[d]; |         _osites *= _rdimensions[d]; | ||||||
|         _isites *= _simd_layout[d]; |         _isites *= _simd_layout[d]; | ||||||
|  |  | ||||||
|         // Addressing support |         // Addressing support | ||||||
| 	  if ( d==0 ) { |         if (d == 0) | ||||||
|  |         { | ||||||
|           _ostride[d] = 1; |           _ostride[d] = 1; | ||||||
|           _istride[d] = 1; |           _istride[d] = 1; | ||||||
| 	  } else { |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; |           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||||
|           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; |           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||||
|         } |         } | ||||||
| @@ -120,19 +125,18 @@ public: | |||||||
|  |  | ||||||
|       int block = 1; |       int block = 1; | ||||||
|       int nblock = 1; |       int nblock = 1; | ||||||
|         for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |         nblock *= _rdimensions[d]; | ||||||
|  |  | ||||||
|         for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |       { | ||||||
|         nblock /= _rdimensions[d]; |         nblock /= _rdimensions[d]; | ||||||
|         _slice_block[d] = block; |         _slice_block[d] = block; | ||||||
|         _slice_stride[d] = _ostride[d] * _rdimensions[d]; |         _slice_stride[d] = _ostride[d] * _rdimensions[d]; | ||||||
|         _slice_nblock[d] = nblock; |         _slice_nblock[d] = nblock; | ||||||
|         block = block * _rdimensions[d]; |         block = block * _rdimensions[d]; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|     }; |     }; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
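The asserts added to the GridCartesian constructor make the previously implicit requirement explicit: each global extent must divide exactly by the processor grid, and each local extent by the SIMD layout. A tiny worked example of the arithmetic now being checked, with illustrative extents:

#include <cassert>

int main() {
  // One direction with global extent 16, 2 MPI ranks, 2 SIMD lanes.
  int gdim = 16, procs = 2, simd = 2;
  int ldim = gdim / procs;             // 8: local sites per rank
  assert(ldim * procs == gdim);        // exact division over ranks
  int rdim = ldim / simd;              // 4: "outer" sites after SIMD folding
  assert(rdim * simd == ldim);         // exact division over SIMD lanes
  return rdim == 4 ? 0 : 1;
}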
| @@ -161,28 +161,35 @@ public: | |||||||
|  |  | ||||||
|       _checker_dim_mask = checker_dim_mask; |       _checker_dim_mask = checker_dim_mask; | ||||||
|  |  | ||||||
|       for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |       { | ||||||
|         _fdimensions[d] = dimensions[d]; |         _fdimensions[d] = dimensions[d]; | ||||||
|         _gdimensions[d] = _fdimensions[d]; |         _gdimensions[d] = _fdimensions[d]; | ||||||
|         _fsites = _fsites * _fdimensions[d]; |         _fsites = _fsites * _fdimensions[d]; | ||||||
|         _gsites = _gsites * _gdimensions[d]; |         _gsites = _gsites * _gdimensions[d]; | ||||||
|  |  | ||||||
| 	if (d==_checker_dim) { |         if (d == _checker_dim) | ||||||
|  |         { | ||||||
|  |           assert((_gdimensions[d] & 0x1) == 0); | ||||||
|           _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard |           _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard | ||||||
|         } |         } | ||||||
|         _ldimensions[d] = _gdimensions[d] / _processors[d]; |         _ldimensions[d] = _gdimensions[d] / _processors[d]; | ||||||
|  |         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); | ||||||
|         _lstart[d] = _processor_coor[d] * _ldimensions[d]; |         _lstart[d] = _processor_coor[d] * _ldimensions[d]; | ||||||
|         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; |         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; | ||||||
|  |  | ||||||
|         // Use a reduced simd grid |         // Use a reduced simd grid | ||||||
|         _simd_layout[d] = simd_layout[d]; |         _simd_layout[d] = simd_layout[d]; | ||||||
| 	_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; |         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // integer division; exactness asserted on the next line
|  |         assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); | ||||||
|         assert(_rdimensions[d] > 0); |         assert(_rdimensions[d] > 0); | ||||||
|  |  | ||||||
|         // all elements of a simd vector must have same checkerboard. |         // all elements of a simd vector must have same checkerboard. | ||||||
|         // If Ls vectorised, this must still be the case; e.g. dwf rb5d |         // If Ls vectorised, this must still be the case; e.g. dwf rb5d | ||||||
| 	if ( _simd_layout[d]>1 ) { |         if (_simd_layout[d] > 1) | ||||||
| 	  if ( checker_dim_mask[d] ) {  |         { | ||||||
|  |           if (checker_dim_mask[d]) | ||||||
|  |           { | ||||||
|             assert((_rdimensions[d] & 0x1) == 0); |             assert((_rdimensions[d] & 0x1) == 0); | ||||||
|           } |           } | ||||||
|         } |         } | ||||||
| @@ -191,15 +198,16 @@ public: | |||||||
|         _isites *= _simd_layout[d]; |         _isites *= _simd_layout[d]; | ||||||
|  |  | ||||||
|         // Addressing support |         // Addressing support | ||||||
| 	if ( d==0 ) { |         if (d == 0) | ||||||
|  |         { | ||||||
|           _ostride[d] = 1; |           _ostride[d] = 1; | ||||||
|           _istride[d] = 1; |           _istride[d] = 1; | ||||||
| 	} else { |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; |           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||||
|           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; |           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |  | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       //////////////////////////////////////////////////////////////////////////////////////////// |       //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -211,9 +219,11 @@ public: | |||||||
|  |  | ||||||
|       int block = 1; |       int block = 1; | ||||||
|       int nblock = 1; |       int nblock = 1; | ||||||
|       for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |         nblock *= _rdimensions[d]; | ||||||
|  |  | ||||||
|       for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |       { | ||||||
|         nblock /= _rdimensions[d]; |         nblock /= _rdimensions[d]; | ||||||
|         _slice_block[d] = block; |         _slice_block[d] = block; | ||||||
|         _slice_stride[d] = _ostride[d] * _rdimensions[d]; |         _slice_stride[d] = _ostride[d] * _rdimensions[d]; | ||||||
| @@ -225,23 +235,29 @@ public: | |||||||
|       // Create a checkerboard lookup table |       // Create a checkerboard lookup table | ||||||
|       //////////////////////////////////////////////// |       //////////////////////////////////////////////// | ||||||
|       int rvol = 1; |       int rvol = 1; | ||||||
|       for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |       { | ||||||
|         rvol = rvol * _rdimensions[d]; |         rvol = rvol * _rdimensions[d]; | ||||||
|       } |       } | ||||||
|       _checker_board.resize(rvol); |       _checker_board.resize(rvol); | ||||||
|       for(int osite=0;osite<_osites;osite++){ |       for (int osite = 0; osite < _osites; osite++) | ||||||
|  |       { | ||||||
|         _checker_board[osite] = CheckerBoardFromOindex(osite); |         _checker_board[osite] = CheckerBoardFromOindex(osite); | ||||||
|       } |       } | ||||||
|        |  | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|   protected: |   protected: | ||||||
|     virtual int oIndex(std::vector<int> &coor) |     virtual int oIndex(std::vector<int> &coor) | ||||||
|     { |     { | ||||||
|       int idx = 0; |       int idx = 0; | ||||||
|       for(int d=0;d<_ndimension;d++) { |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	if( d==_checker_dim ) { |       { | ||||||
|  |         if (d == _checker_dim) | ||||||
|  |         { | ||||||
|           idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); |           idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); | ||||||
| 	} else { |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|           idx += _ostride[d] * (coor[d] % _rdimensions[d]); |           idx += _ostride[d] * (coor[d] % _rdimensions[d]); | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
| @@ -251,16 +267,19 @@ protected: | |||||||
|     virtual int iIndex(std::vector<int> &lcoor) |     virtual int iIndex(std::vector<int> &lcoor) | ||||||
|     { |     { | ||||||
|       int idx = 0; |       int idx = 0; | ||||||
|         for(int d=0;d<_ndimension;d++) { |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	  if( d==_checker_dim ) { |       { | ||||||
|  |         if (d == _checker_dim) | ||||||
|  |         { | ||||||
|           idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); |           idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); | ||||||
| 	  } else {  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|           idx += _istride[d] * (lcoor[d] / _rdimensions[d]); |           idx += _istride[d] * (lcoor[d] / _rdimensions[d]); | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|       return idx; |       return idx; | ||||||
|     } |     } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
|  | #include <fcntl.h> | ||||||
|  | #include <unistd.h> | ||||||
|  | #include <limits.h> | ||||||
|  | #include <sys/mman.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -33,8 +37,11 @@ namespace Grid { | |||||||
| // Info that is set up once and independent of cartesian layout | // Info that is set up once and independent of cartesian layout
| /////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////// | ||||||
| void *              CartesianCommunicator::ShmCommBuf; | void *              CartesianCommunicator::ShmCommBuf; | ||||||
| uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL;  | ||||||
| CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | CartesianCommunicator::CommunicatorPolicy_t   | ||||||
|  | CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | ||||||
|  | int CartesianCommunicator::nCommThreads = -1; | ||||||
|  | int CartesianCommunicator::Hugepages = 0; | ||||||
|  |  | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| @@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | |||||||
|   GlobalSumVector((double *)c,2*N); |   GlobalSumVector((double *)c,2*N); | ||||||
| } | } | ||||||
|  |  | ||||||
| #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) | #if !defined( GRID_COMMS_MPI3)  | ||||||
|  |  | ||||||
| int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | ||||||
| int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | ||||||
|  | #endif | ||||||
|  | #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
|  | 						     int xmit_to_rank, | ||||||
|  | 						     void *recv, | ||||||
|  | 						     int recv_from_rank, | ||||||
|  | 						     int bytes, int dir) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> list; | ||||||
|  |   // Discard the "dir" | ||||||
|  |   SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|  |   SendToRecvFromComplete(list); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int xmit_to_rank, | 							 int xmit_to_rank, | ||||||
| 							 void *recv, | 							 void *recv, | ||||||
| 							 int recv_from_rank, | 							 int recv_from_rank, | ||||||
| 						       int bytes) | 							 int bytes, int dir) | ||||||
| { | { | ||||||
|  |   // Discard the "dir" | ||||||
|   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); |   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|   return 2.0*bytes; |   return 2.0*bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(waitall); | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #if !defined( GRID_COMMS_MPI3)  | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilBarrier(void){}; | void CartesianCommunicator::StencilBarrier(void){}; | ||||||
|  |  | ||||||
| commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | ||||||
| @@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { | |||||||
|   return NULL; |   return NULL; | ||||||
| } | } | ||||||
| void CartesianCommunicator::ShmInitGeneric(void){ | void CartesianCommunicator::ShmInitGeneric(void){ | ||||||
|  | #if 1 | ||||||
|  |  | ||||||
|  |   int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |   if ( Hugepages ) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |   ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);  | ||||||
|  |   if (ShmCommBuf == (void *)MAP_FAILED) { | ||||||
|  |     perror("mmap failed "); | ||||||
|  |     exit(EXIT_FAILURE);   | ||||||
|  |   } | ||||||
|  | #ifdef MADV_HUGEPAGE | ||||||
|  |   if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE); | ||||||
|  | #endif | ||||||
|  | #else  | ||||||
|   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); |   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); | ||||||
|   ShmCommBuf=(void *)&ShmBufStorageVector[0]; |   ShmCommBuf=(void *)&ShmBufStorageVector[0]; | ||||||
|  | #endif | ||||||
|  |   bzero(ShmCommBuf,MAX_MPI_SHM_BYTES); | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
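ShmInitGeneric now reserves the communication buffer with an anonymous mmap, adding MAP_HUGETLB when explicit huge pages are requested and otherwise advising the kernel towards transparent huge pages with MADV_HUGEPAGE. A standalone sketch of the same pattern outside Grid, using 64 MB instead of the 1 GB default and leaving the explicit huge-page flag commented out:

#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const size_t bytes = 64UL * 1024UL * 1024UL;      // 64 MB stand-in for MAX_MPI_SHM_BYTES
  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
#ifdef MAP_HUGETLB
  // mmap_flag |= MAP_HUGETLB;                       // uncomment for explicit huge pages
#endif
  void *buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
  if (buf == MAP_FAILED) { perror("mmap failed"); exit(EXIT_FAILURE); }
#ifdef MADV_HUGEPAGE
  madvise(buf, bytes, MADV_HUGEPAGE);                // hint for transparent huge pages
#endif
  memset(buf, 0, bytes);                             // touch/zero, as the patch does with bzero
  munmap(buf, bytes);
  return 0;
}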
| @@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef GRID_COMMS_MPI3 | #ifdef GRID_COMMS_MPI3 | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_MPI3L | #ifdef GRID_COMMS_MPIT | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
| @@ -50,12 +50,24 @@ namespace Grid { | |||||||
| class CartesianCommunicator { | class CartesianCommunicator { | ||||||
|   public:     |   public:     | ||||||
|  |  | ||||||
|   // 65536 ranks per node adequate for now |  | ||||||
|  |   //////////////////////////////////////////// | ||||||
|  |   // Isend/Irecv/Wait, or Sendrecv blocking | ||||||
|  |   //////////////////////////////////////////// | ||||||
|  |   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; | ||||||
|  |   static CommunicatorPolicy_t CommunicatorPolicy; | ||||||
|  |   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////// | ||||||
|  |   // Up to 65536 ranks per node adequate for now | ||||||
|   // 128MB shared memory for comms enough for 48^4 local vol comms |   // 128MB shared memory for comms enough for 48^4 local vol comms
|   // Give external control (command line override?) of this |   // Give external control (command line override?) of this | ||||||
|  |   /////////////////////////////////////////// | ||||||
|   static const int MAXLOG2RANKSPERNODE = 16;             |   static const int MAXLOG2RANKSPERNODE = 16;             | ||||||
|   static uint64_t  MAX_MPI_SHM_BYTES; |   static uint64_t  MAX_MPI_SHM_BYTES; | ||||||
|  |   static int       nCommThreads; | ||||||
|  |   // use explicit huge pages | ||||||
|  |   static int       Hugepages; | ||||||
|  |  | ||||||
|   // Communicator should know nothing of the physics grid, only processor grid. |   // Communicator should know nothing of the physics grid, only processor grid. | ||||||
|   int              _Nprocessors;     // How many in all |   int              _Nprocessors;     // How many in all | ||||||
| @@ -64,14 +76,18 @@ class CartesianCommunicator { | |||||||
|   std::vector<int> _processor_coor;  // linear processor coordinate |   std::vector<int> _processor_coor;  // linear processor coordinate | ||||||
|   unsigned long _ndimension; |   unsigned long _ndimension; | ||||||
|  |  | ||||||
| #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) | ||||||
|   static MPI_Comm communicator_world; |   static MPI_Comm communicator_world; | ||||||
|  |  | ||||||
|   MPI_Comm              communicator; |   MPI_Comm              communicator; | ||||||
|  |   std::vector<MPI_Comm> communicator_halo; | ||||||
|  |  | ||||||
|   typedef MPI_Request CommsRequest_t; |   typedef MPI_Request CommsRequest_t; | ||||||
| #else  | #else  | ||||||
|   typedef int CommsRequest_t; |   typedef int CommsRequest_t; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////// | ||||||
|   // Helper functionality for SHM Windows common to all other impls |   // Helper functionality for SHM Windows common to all other impls | ||||||
|   //////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////// | ||||||
| @@ -117,10 +133,6 @@ class CartesianCommunicator { | |||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|   static void * ShmCommBuf; |   static void * ShmCommBuf; | ||||||
|  |  | ||||||
|   // Isend/Irecv/Wait, or Sendrecv blocking |  | ||||||
|   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; |  | ||||||
|   static CommunicatorPolicy_t CommunicatorPolicy; |  | ||||||
|   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } |  | ||||||
|    |    | ||||||
|   size_t heap_top; |   size_t heap_top; | ||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
| @@ -211,14 +223,21 @@ class CartesianCommunicator { | |||||||
|    |    | ||||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |  | ||||||
|  |   double StencilSendToRecvFrom(void *xmit, | ||||||
|  | 			       int xmit_to_rank, | ||||||
|  | 			       void *recv, | ||||||
|  | 			       int recv_from_rank, | ||||||
|  | 			       int bytes,int dir); | ||||||
|  |  | ||||||
|   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, |   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 				    void *xmit, | 				    void *xmit, | ||||||
| 				    int xmit_to_rank, | 				    int xmit_to_rank, | ||||||
| 				    void *recv, | 				    void *recv, | ||||||
| 				    int recv_from_rank, | 				    int recv_from_rank, | ||||||
| 				  int bytes); | 				    int bytes,int dir); | ||||||
|    |    | ||||||
|   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |    | ||||||
|  |   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); | ||||||
|   void StencilBarrier(void); |   void StencilBarrier(void); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef HAVE_NUMAIF_H | #ifdef HAVE_NUMAIF_H | ||||||
| #include <numaif.h> | #include <numaif.h> | ||||||
| #endif | #endif | ||||||
| #ifndef SHM_HUGETLB |  | ||||||
| #define SHM_HUGETLB 04000 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -200,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|   ShmCommBuf = 0; |   ShmCommBuf = 0; | ||||||
|   ShmCommBufs.resize(ShmSize); |   ShmCommBufs.resize(ShmSize); | ||||||
|  |  | ||||||
| #if 1 |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // hugetlbfs and similar filesystems expose files as mappable huge pages | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef GRID_MPI3_SHMMMAP | ||||||
|  |   char shm_name [NAME_MAX]; | ||||||
|  |   for(int r=0;r<ShmSize;r++){ | ||||||
|  |      | ||||||
|  |     size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; | ||||||
|  |     sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r); | ||||||
|  |     //sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r); | ||||||
|  |     //    printf("Opening file %s \n",shm_name); | ||||||
|  |     int fd=open(shm_name,O_RDWR|O_CREAT,0666); | ||||||
|  |     if ( fd == -1) {  | ||||||
|  |       printf("open %s failed\n",shm_name); | ||||||
|  |       perror("open hugetlbfs"); | ||||||
|  |       exit(0); | ||||||
|  |     } | ||||||
|  |     int mmap_flag = MAP_SHARED ; | ||||||
|  | #ifdef MAP_POPULATE     | ||||||
|  |     mmap_flag|=MAP_POPULATE; | ||||||
|  | #endif | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |     if ( Hugepages ) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |     void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);  | ||||||
|  |     if ( ptr == (void *)MAP_FAILED ) {     | ||||||
|  |       printf("mmap %s failed\n",shm_name); | ||||||
|  |       perror("failed mmap");      assert(0);     | ||||||
|  |     } | ||||||
|  |     assert(((uint64_t)ptr&0x3F)==0); | ||||||
|  |     ShmCommBufs[r] =ptr; | ||||||
|  |      | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // POSIX shm_open; as far as I know Linux does not allow explicit huge pages in this case. | ||||||
|  |   // tmpfs (according to Larry Meadows) does not support explicit huge pages, and it backs | ||||||
|  |   // the POSIX shm virtual file system. | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef GRID_MPI3_SHMOPEN | ||||||
|   char shm_name [NAME_MAX]; |   char shm_name [NAME_MAX]; | ||||||
|   if ( ShmRank == 0 ) { |   if ( ShmRank == 0 ) { | ||||||
|     for(int r=0;r<ShmSize;r++){ |     for(int r=0;r<ShmSize;r++){ | ||||||
| @@ -214,12 +251,21 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } |       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } | ||||||
|       ftruncate(fd, size); |       ftruncate(fd, size); | ||||||
|        |        | ||||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |       int mmap_flag = MAP_SHARED; | ||||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } | #ifdef MAP_POPULATE  | ||||||
|  |       mmap_flag |= MAP_POPULATE; | ||||||
|  | #endif | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |       if (Hugepages) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); | ||||||
|  |  | ||||||
|  |       if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    } | ||||||
|       assert(((uint64_t)ptr&0x3F)==0); |       assert(((uint64_t)ptr&0x3F)==0); | ||||||
|  |  | ||||||
|       // Try to force numa domain on the shm segment if we have numaif.h | // Experiments: try to force numa domain on the shm segment if we have numaif.h | ||||||
| #ifdef HAVE_NUMAIF_H | #if 0 | ||||||
|  | //#ifdef HAVE_NUMAIF_H | ||||||
| 	int status; | 	int status; | ||||||
| 	int flags=MPOL_MF_MOVE; | 	int flags=MPOL_MF_MOVE; | ||||||
| #ifdef KNL | #ifdef KNL | ||||||
| @@ -258,23 +304,34 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       ShmCommBufs[r] =ptr; |       ShmCommBufs[r] =ptr; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  | #endif | ||||||
| #else |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // SHMGET SHMAT and SHM_HUGETLB flag | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef GRID_MPI3_SHMGET | ||||||
|   std::vector<int> shmids(ShmSize); |   std::vector<int> shmids(ShmSize); | ||||||
|  |  | ||||||
|   if ( ShmRank == 0 ) { |   if ( ShmRank == 0 ) { | ||||||
|     for(int r=0;r<ShmSize;r++){ |     for(int r=0;r<ShmSize;r++){ | ||||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; |       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; | ||||||
|       key_t key   = 0x4545 + r; |       key_t key   = IPC_PRIVATE; | ||||||
|       if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { |       int flags = IPC_CREAT | SHM_R | SHM_W; | ||||||
|  | #ifdef SHM_HUGETLB | ||||||
|  |       if (Hugepages) flags|=SHM_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       if ((shmids[r]= shmget(key,size, flags)) ==-1) { | ||||||
| 	int errsv = errno; | 	int errsv = errno; | ||||||
| 	printf("Errno %d\n",errsv); | 	printf("Errno %d\n",errsv); | ||||||
|  | 	printf("key   %d\n",key); | ||||||
|  | 	printf("size  %zu\n",size); | ||||||
|  | 	printf("flags %d\n",flags); | ||||||
| 	perror("shmget"); | 	perror("shmget"); | ||||||
| 	exit(1); | 	exit(1); | ||||||
|       } |       } else {  | ||||||
| 	printf("shmid: 0x%x\n", shmids[r]); | 	printf("shmid: 0x%x\n", shmids[r]); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|   MPI_Barrier(ShmComm); |   MPI_Barrier(ShmComm); | ||||||
|   MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm); |   MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm); | ||||||
|   MPI_Barrier(ShmComm); |   MPI_Barrier(ShmComm); | ||||||
| @@ -397,8 +454,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
| {  | {  | ||||||
|   int ierr; |   int ierr; | ||||||
|   communicator=communicator_world; |   communicator=communicator_world; | ||||||
|  |  | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
|  |  | ||||||
|  |   communicator_halo.resize (2*_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension*2;i++){ | ||||||
|  |     MPI_Comm_dup(communicator,&communicator_halo[i]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   // Assert power of two shm_size. |   // Assert power of two shm_size. | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -621,13 +684,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
|  | 						     int dest, | ||||||
|  | 						     void *recv, | ||||||
|  | 						     int from, | ||||||
|  | 						     int bytes,int dir) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> list; | ||||||
|  |   double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); | ||||||
|  |   StencilSendToRecvFromComplete(list,dir); | ||||||
|  |   return offbytes; | ||||||
|  | } | ||||||
|  |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int dest, | 							 int dest, | ||||||
| 							 void *recv, | 							 void *recv, | ||||||
| 							 int from, | 							 int from, | ||||||
| 						       int bytes) | 							 int bytes,int dir) | ||||||
| { | { | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |  | ||||||
|   MPI_Request xrq; |   MPI_Request xrq; | ||||||
|   MPI_Request rrq; |   MPI_Request rrq; | ||||||
|  |  | ||||||
| @@ -646,26 +723,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|   gfrom = MPI_UNDEFINED; |   gfrom = MPI_UNDEFINED; | ||||||
| #endif | #endif | ||||||
|   if ( gfrom ==MPI_UNDEFINED) { |   if ( gfrom ==MPI_UNDEFINED) { | ||||||
|     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(rrq); |     list.push_back(rrq); | ||||||
|     off_node_bytes+=bytes; |     off_node_bytes+=bytes; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( gdest == MPI_UNDEFINED ) { |   if ( gdest == MPI_UNDEFINED ) { | ||||||
|     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(xrq); |     list.push_back(xrq); | ||||||
|     off_node_bytes+=bytes; |     off_node_bytes+=bytes; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  |   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  | ||||||
|     this->StencilSendToRecvFromComplete(list); |     this->StencilSendToRecvFromComplete(list,dir); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return off_node_bytes; |   return off_node_bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(waitall); | ||||||
| } | } | ||||||
|   | |||||||
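The Init() changes above provide three alternative shared-memory allocation paths (hugetlbfs mmap, POSIX shm_open, SysV shmget), each optionally requesting explicit huge pages. A self-contained sketch of the mmap path under stated assumptions (illustrative helper name and arguments, abort-on-error only):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <fcntl.h>
    #include <sys/mman.h>

    // Map a file (e.g. on a hugetlbfs mount) as shared memory, optionally with
    // explicit huge pages. Illustrative helper, not Grid's actual code path.
    void *map_shared_segment(const char *path, size_t bytes, bool hugepages) {
      int fd = open(path, O_RDWR | O_CREAT, 0666);
      if (fd == -1) { perror("open"); exit(EXIT_FAILURE); }
      int flags = MAP_SHARED;
    #ifdef MAP_POPULATE
      flags |= MAP_POPULATE;                 // pre-fault the pages
    #endif
    #ifdef MAP_HUGETLB
      if (hugepages) flags |= MAP_HUGETLB;   // needs a huge-page-backed file
    #endif
      void *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, flags, fd, 0);
      if (ptr == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); }
      assert(((uintptr_t)ptr & 0x3F) == 0);  // expect at least 64-byte alignment
      return ptr;
    }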
							
								
								
									
286
lib/communicator/Communicator_mpit.cc
Normal file
							| @@ -0,0 +1,286 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_mpit.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/GridCore.h> | ||||||
|  | #include <Grid/GridQCDcore.h> | ||||||
|  | #include <Grid/qcd/action/ActionCore.h> | ||||||
|  | #include <mpi.h> | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and is independent of the Cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  |  | ||||||
|  | // Should error check all MPI calls. | ||||||
|  | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|  |   int flag; | ||||||
|  |   int provided; | ||||||
|  |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|  |   if ( !flag ) { | ||||||
|  |     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||||
|  |     if ( provided != MPI_THREAD_MULTIPLE ) { | ||||||
|  |       QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   ShmInitGeneric(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
|  | { | ||||||
|  |   _ndimension = processors.size(); | ||||||
|  |   std::vector<int> periodic(_ndimension,1); | ||||||
|  |  | ||||||
|  |   _Nprocessors=1; | ||||||
|  |   _processors = processors; | ||||||
|  |   _processor_coor.resize(_ndimension); | ||||||
|  |    | ||||||
|  |   MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); | ||||||
|  |   MPI_Comm_rank(communicator,&_processor); | ||||||
|  |   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); | ||||||
|  |  | ||||||
|  |   for(int i=0;i<_ndimension;i++){ | ||||||
|  |     _Nprocessors*=_processors[i]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   communicator_halo.resize (2*_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension*2;i++){ | ||||||
|  |     MPI_Comm_dup(communicator,&communicator_halo[i]); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   int Size;  | ||||||
|  |   MPI_Comm_size(communicator,&Size); | ||||||
|  |    | ||||||
|  |   assert(Size==_Nprocessors); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalXOR(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalXOR(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   int rank; | ||||||
|  |   int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank); | ||||||
|  |   assert(ierr==0); | ||||||
|  |   return rank; | ||||||
|  | } | ||||||
|  | void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   coor.resize(_ndimension); | ||||||
|  |   int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||||
|  | 					   int dest, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int from, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> reqs(0); | ||||||
|  |   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||||
|  |   SendToRecvFromComplete(reqs); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int sender, | ||||||
|  | 					   int receiver, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Status stat; | ||||||
|  |   assert(sender != receiver); | ||||||
|  |   int tag = sender; | ||||||
|  |   if ( _processor == sender ) { | ||||||
|  |     MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); | ||||||
|  |   } | ||||||
|  |   if ( _processor == receiver ) {  | ||||||
|  |     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						void *xmit, | ||||||
|  | 						int dest, | ||||||
|  | 						void *recv, | ||||||
|  | 						int from, | ||||||
|  | 						int bytes) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||||
|  |     MPI_Request xrq; | ||||||
|  |     MPI_Request rrq; | ||||||
|  |  | ||||||
|  |     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |      | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } else {  | ||||||
|  |     // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, | ||||||
|  | 		      recv,bytes,MPI_CHAR,from, from, | ||||||
|  | 		      communicator,MPI_STATUS_IGNORE); | ||||||
|  |     assert(ierr==0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||||
|  |     int nreq=list.size(); | ||||||
|  |     std::vector<MPI_Status> status(nreq); | ||||||
|  |     int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |     assert(ierr==0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Barrier(void) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Barrier(communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Bcast(data, | ||||||
|  | 		     bytes, | ||||||
|  | 		     MPI_BYTE, | ||||||
|  | 		     root, | ||||||
|  | 		     communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  |   // Should only be used before Grid Init has finished. | ||||||
|  |   // Check for this? | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::RankWorld(void){  | ||||||
|  |   int r;  | ||||||
|  |   MPI_Comm_rank(communicator_world,&r); | ||||||
|  |   return r; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr= MPI_Bcast(data, | ||||||
|  | 		      bytes, | ||||||
|  | 		      MPI_BYTE, | ||||||
|  | 		      root, | ||||||
|  | 		      communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 							 void *xmit, | ||||||
|  | 							 int xmit_to_rank, | ||||||
|  | 							 void *recv, | ||||||
|  | 							 int recv_from_rank, | ||||||
|  | 							 int bytes,int dir) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |    | ||||||
|  |   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl; | ||||||
|  |   // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |   MPI_Request req[2]; | ||||||
|  |   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); | ||||||
|  |   MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]); | ||||||
|  |  | ||||||
|  |   list.push_back(req[0]); | ||||||
|  |   list.push_back(req[1]); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
|  | {  | ||||||
|  |   int nreq=waitall.size(); | ||||||
|  |   MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE); | ||||||
|  | }; | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, | ||||||
|  | 						    int xmit_to_rank, | ||||||
|  | 						    void *recv, | ||||||
|  | 						    int recv_from_rank, | ||||||
|  | 						    int bytes,int dir) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |    | ||||||
|  |   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl; | ||||||
|  |   // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |   MPI_Request req[2]; | ||||||
|  |   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); | ||||||
|  |   MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]); | ||||||
|  |   MPI_Waitall(2, req, MPI_STATUSES_IGNORE); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
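The new Communicator_mpit.cc above asks for MPI_THREAD_MULTIPLE at initialisation and, when the MPI library grants a lower thread level, drops back to a comms-then-compute schedule. Stripped to its essentials the pattern looks like this (the fallback flag name is an illustrative assumption):

    #include <mpi.h>

    // Request full thread support; if the library only grants a lower level,
    // record that overlap of comms and compute should be disabled.
    void init_mpi_for_threads(int *argc, char ***argv, bool &comms_then_compute) {
      int already_up = 0;
      MPI_Initialized(&already_up);   // tolerate callers that initialised MPI first
      if (!already_up) {
        int provided = MPI_THREAD_SINGLE;
        MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
        comms_then_compute = (provided != MPI_THREAD_MULTIPLE);
      }
    }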
| @@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/cshift/Cshift_mpi.h> | #include <Grid/cshift/Cshift_mpi.h> | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
| #ifdef GRID_COMMS_MPI3L | #ifdef GRID_COMMS_MPIT | ||||||
| #include <Grid/cshift/Cshift_mpi.h> | #include <Grid/cshift/Cshift_mpi.h> | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | /* | ||||||
| inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | ||||||
| { | { | ||||||
|   int NN    = BlockSolverGrid->_ndimension; |   int NN    = BlockSolverGrid->_ndimension; | ||||||
| @@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or | |||||||
|   } |   } | ||||||
|   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  |   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  | ||||||
| } | } | ||||||
|  | */ | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||||
| @@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice | |||||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; |   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||||
|  |  | ||||||
|   GridBase *FullGrid  = X._grid; |   GridBase *FullGrid  = X._grid; | ||||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); |   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |  | ||||||
|   Lattice<vobj> Xslice(SliceGrid); |   //  Lattice<vobj> Xslice(SliceGrid); | ||||||
|   Lattice<vobj> Rslice(SliceGrid); |   //  Lattice<vobj> Rslice(SliceGrid); | ||||||
|  |  | ||||||
|   assert( FullGrid->_simd_layout[Orthog]==1); |   assert( FullGrid->_simd_layout[Orthog]==1); | ||||||
|   int nh =  FullGrid->_ndimension; |   int nh =  FullGrid->_ndimension; | ||||||
|   int nl = SliceGrid->_ndimension; |   //  int nl = SliceGrid->_ndimension; | ||||||
|  |   int nl = nh-1; | ||||||
|  |  | ||||||
|   //FIXME package in a convenient iterator |   //FIXME package in a convenient iterator | ||||||
|   //Should loop over a plane orthogonal to direction "Orthog" |   //Should loop over a plane orthogonal to direction "Orthog" | ||||||
| @@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice< | |||||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; |   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||||
|  |  | ||||||
|   GridBase *FullGrid  = X._grid; |   GridBase *FullGrid  = X._grid; | ||||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); |   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |   //  Lattice<vobj> Xslice(SliceGrid); | ||||||
|   Lattice<vobj> Xslice(SliceGrid); |   //  Lattice<vobj> Rslice(SliceGrid); | ||||||
|   Lattice<vobj> Rslice(SliceGrid); |  | ||||||
|  |  | ||||||
|   assert( FullGrid->_simd_layout[Orthog]==1); |   assert( FullGrid->_simd_layout[Orthog]==1); | ||||||
|   int nh =  FullGrid->_ndimension; |   int nh =  FullGrid->_ndimension; | ||||||
|   int nl = SliceGrid->_ndimension; |   //  int nl = SliceGrid->_ndimension; | ||||||
|  |   int nl=1; | ||||||
|  |  | ||||||
|   //FIXME package in a convenient iterator |   //FIXME package in a convenient iterator | ||||||
|   //Should loop over a plane orthogonal to direction "Orthog" |   //Should loop over a plane orthogonal to direction "Orthog" | ||||||
| @@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | |||||||
|   typedef typename vobj::vector_type vector_type; |   typedef typename vobj::vector_type vector_type; | ||||||
|    |    | ||||||
|   GridBase *FullGrid  = lhs._grid; |   GridBase *FullGrid  = lhs._grid; | ||||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); |   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|    |    | ||||||
|   int Nblock = FullGrid->GlobalDimensions()[Orthog]; |   int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||||
|    |    | ||||||
|   Lattice<vobj> Lslice(SliceGrid); |   //  Lattice<vobj> Lslice(SliceGrid); | ||||||
|   Lattice<vobj> Rslice(SliceGrid); |   //  Lattice<vobj> Rslice(SliceGrid); | ||||||
|    |    | ||||||
|   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); |   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |  | ||||||
|   assert( FullGrid->_simd_layout[Orthog]==1); |   assert( FullGrid->_simd_layout[Orthog]==1); | ||||||
|   int nh =  FullGrid->_ndimension; |   int nh =  FullGrid->_ndimension; | ||||||
|   int nl = SliceGrid->_ndimension; |   //  int nl = SliceGrid->_ndimension; | ||||||
|  |   int nl = nh-1; | ||||||
|  |  | ||||||
|   //FIXME package in a convenient iterator |   //FIXME package in a convenient iterator | ||||||
|   //Should loop over a plane orthogonal to direction "Orthog" |   //Should loop over a plane orthogonal to direction "Orthog" | ||||||
| @@ -550,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | |||||||
|       mat += mat_thread; |       mat += mat_thread; | ||||||
|     }   |     }   | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   for(int i=0;i<Nblock;i++){ | ||||||
|  |   for(int j=0;j<Nblock;j++){ | ||||||
|  |     ComplexD sum = mat(i,j); | ||||||
|  |     FullGrid->GlobalSum(sum); | ||||||
|  |     mat(i,j)=sum; | ||||||
|  |   }} | ||||||
|  |  | ||||||
|   return; |   return; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
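With makeSubSliceGrid commented out above, each rank now accumulates only its local contribution to the Nblock x Nblock matrix, so sliceInnerProductMatrix ends with the explicit element-wise global sum added in the last hunk. A sketch of that reduction, with global_sum standing in for Grid's GlobalSum(ComplexD &):

    #include <Eigen/Dense>
    #include <complex>

    // Sum every element of a locally accumulated block matrix across MPI ranks.
    void reduce_block_matrix(Eigen::MatrixXcd &mat,
                             void (*global_sum)(std::complex<double> &)) {
      for (int i = 0; i < mat.rows(); i++) {
        for (int j = 0; j < mat.cols(); j++) {
          std::complex<double> s = mat(i, j);
          global_sum(s);                 // e.g. an MPI_Allreduce under the hood
          mat(i, j) = s;
        }
      }
    }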
| @@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) { | |||||||
| //////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////// | ||||||
| void Grid_quiesce_nodes(void) { | void Grid_quiesce_nodes(void) { | ||||||
|   int me = 0; |   int me = 0; | ||||||
| #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) | #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) | ||||||
|   MPI_Comm_rank(MPI_COMM_WORLD, &me); |   MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ | |||||||
| #ifndef GRID_BINARY_IO_H | #ifndef GRID_BINARY_IO_H | ||||||
| #define GRID_BINARY_IO_H | #define GRID_BINARY_IO_H | ||||||
|  |  | ||||||
| #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)  | #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)  | ||||||
| #define USE_MPI_IO | #define USE_MPI_IO | ||||||
| #else | #else | ||||||
| #undef  USE_MPI_IO | #undef  USE_MPI_IO | ||||||
| @@ -99,13 +99,14 @@ class BinaryIO { | |||||||
|     NerscChecksum(grid,scalardata,nersc_csum); |     NerscChecksum(grid,scalardata,nersc_csum); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum) |   template <class fobj> | ||||||
|  |   static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum) | ||||||
|   { |   { | ||||||
|     const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t); |     const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t); | ||||||
|  |  | ||||||
|  |  | ||||||
|     uint64_t lsites = grid->lSites(); |     uint64_t lsites = grid->lSites(); | ||||||
|     if (fbuf.size()==1) { |     if (fbuf.size() == 1) | ||||||
|  |     { | ||||||
|       lsites = 1; |       lsites = 1; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -114,9 +115,11 @@ class BinaryIO { | |||||||
|       uint32_t nersc_csum_thr = 0; |       uint32_t nersc_csum_thr = 0; | ||||||
|  |  | ||||||
|       #pragma omp for |       #pragma omp for | ||||||
|       for(uint64_t local_site=0;local_site<lsites;local_site++){ |       for (uint64_t local_site = 0; local_site < lsites; local_site++) | ||||||
|  |       { | ||||||
|         uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; |         uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; | ||||||
| 	for(uint64_t j=0;j<size32;j++){ |         for (uint64_t j = 0; j < size32; j++) | ||||||
|  |         { | ||||||
|           nersc_csum_thr = nersc_csum_thr + site_buf[j]; |           nersc_csum_thr = nersc_csum_thr + site_buf[j]; | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
| @@ -127,6 +130,7 @@ class BinaryIO { | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) |   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) | ||||||
|   { |   { | ||||||
|     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); |     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); | ||||||
| @@ -367,12 +371,16 @@ class BinaryIO { | |||||||
|                   << iodata.size() * sizeof(fobj) << " bytes" << std::endl; |                   << iodata.size() * sizeof(fobj) << " bytes" << std::endl; | ||||||
|         std::ifstream fin; |         std::ifstream fin; | ||||||
|         fin.open(file, std::ios::binary | std::ios::in); |         fin.open(file, std::ios::binary | std::ios::in); | ||||||
| 	if ( control & BINARYIO_MASTER_APPEND )  { |         if (control & BINARYIO_MASTER_APPEND) | ||||||
|  |         { | ||||||
|           fin.seekg(-sizeof(fobj), fin.end); |           fin.seekg(-sizeof(fobj), fin.end); | ||||||
| 	} else {  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|           fin.seekg(offset + myrank * lsites * sizeof(fobj)); |           fin.seekg(offset + myrank * lsites * sizeof(fobj)); | ||||||
|         } |         } | ||||||
| 	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0); |         fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj)); | ||||||
|  |         assert(fin.fail() == 0); | ||||||
|         fin.close(); |         fin.close(); | ||||||
|       } |       } | ||||||
|       timer.Stop(); |       timer.Stop(); | ||||||
| @@ -406,9 +414,29 @@ class BinaryIO { | |||||||
|       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { |       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { | ||||||
| #ifdef USE_MPI_IO | #ifdef USE_MPI_IO | ||||||
|         std::cout << GridLogMessage << "MPI write I/O " << file << std::endl; |         std::cout << GridLogMessage << "MPI write I/O " << file << std::endl; | ||||||
| 	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); |         ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); | ||||||
| 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0); |         std::cout << GridLogMessage << "Checking for errors" << std::endl; | ||||||
| 	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0); |         if (ierr != MPI_SUCCESS) | ||||||
|  |         { | ||||||
|  |           char error_string[BUFSIZ]; | ||||||
|  |           int length_of_error_string, error_class; | ||||||
|  |  | ||||||
|  |           MPI_Error_class(ierr, &error_class); | ||||||
|  |           MPI_Error_string(error_class, error_string, &length_of_error_string); | ||||||
|  |           fprintf(stderr, "%3d: %s\n", myrank, error_string); | ||||||
|  |           MPI_Error_string(ierr, error_string, &length_of_error_string); | ||||||
|  |           fprintf(stderr, "%3d: %s\n", myrank, error_string); | ||||||
|  |           MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl; | ||||||
|  |         ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); | ||||||
|  |         assert(ierr == 0); | ||||||
|  |  | ||||||
|  |         std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl; | ||||||
|  |         ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); | ||||||
|  |         assert(ierr == 0); | ||||||
|  |  | ||||||
|         MPI_File_close(&fh); |         MPI_File_close(&fh); | ||||||
|         MPI_Type_free(&fileArray); |         MPI_Type_free(&fileArray); | ||||||
|         MPI_Type_free(&localArray); |         MPI_Type_free(&localArray); | ||||||
| @@ -416,15 +444,43 @@ class BinaryIO { | |||||||
| 	assert(0); | 	assert(0); | ||||||
| #endif | #endif | ||||||
|       } else {  |       } else {  | ||||||
| 	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in); |          | ||||||
|  | 	std::ofstream fout;  | ||||||
|  |   fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); | ||||||
|  |   try { | ||||||
|  |     fout.open(file,std::ios::binary|std::ios::out|std::ios::in); | ||||||
|  |   } catch (const std::fstream::failure& exc) { | ||||||
|  |     std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl; | ||||||
|  |     std::cout << GridLogError << "Exception description: " << exc.what() << std::endl; | ||||||
|  |     std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl; | ||||||
|  |     #ifdef USE_MPI_IO | ||||||
|  |     MPI_Abort(MPI_COMM_WORLD,1); | ||||||
|  |     #else | ||||||
|  |     exit(1); | ||||||
|  |     #endif | ||||||
|  |   } | ||||||
| 	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : " | 	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : " | ||||||
| 		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; | 		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; | ||||||
|  | 	 | ||||||
|   if ( control & BINARYIO_MASTER_APPEND )  { |   if ( control & BINARYIO_MASTER_APPEND )  { | ||||||
| 	  fout.seekp(0,fout.end); | 	  fout.seekp(0,fout.end); | ||||||
| 	} else { | 	} else { | ||||||
| 	  fout.seekp(offset+myrank*lsites*sizeof(fobj)); | 	  fout.seekp(offset+myrank*lsites*sizeof(fobj)); | ||||||
| 	} | 	} | ||||||
| 	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0); |    | ||||||
|  |   try { | ||||||
|  |   	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0); | ||||||
|  |   } | ||||||
|  |   catch (const std::fstream::failure& exc) { | ||||||
|  |     std::cout << GridLogError << "Exception in writing file " << file << std::endl; | ||||||
|  |     std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl; | ||||||
|  |     #ifdef USE_MPI_IO | ||||||
|  |     MPI_Abort(MPI_COMM_WORLD,1); | ||||||
|  |     #else | ||||||
|  |     exit(1); | ||||||
|  |     #endif | ||||||
|  |   } | ||||||
|  |  | ||||||
| 	fout.close(); | 	fout.close(); | ||||||
|   } |   } | ||||||
|   timer.Stop(); |   timer.Stop(); | ||||||
| @@ -442,12 +498,15 @@ class BinaryIO { | |||||||
|     ////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////// | ||||||
|     // Safety check |     // Safety check | ||||||
|     ////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     // if the data size is 1 we do not want to sum over the MPI ranks | ||||||
|  |     if (iodata.size() != 1){ | ||||||
|       grid->Barrier(); |       grid->Barrier(); | ||||||
|       grid->GlobalSum(nersc_csum); |       grid->GlobalSum(nersc_csum); | ||||||
|       grid->GlobalXOR(scidac_csuma); |       grid->GlobalXOR(scidac_csuma); | ||||||
|       grid->GlobalXOR(scidac_csumb); |       grid->GlobalXOR(scidac_csumb); | ||||||
|       grid->Barrier(); |       grid->Barrier(); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////////////// | ||||||
|   // Read a Lattice of object |   // Read a Lattice of object | ||||||
| @@ -546,9 +605,9 @@ class BinaryIO { | |||||||
|     int gsites = grid->gSites(); |     int gsites = grid->gSites(); | ||||||
|     int lsites = grid->lSites(); |     int lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     uint32_t nersc_csum_tmp; |     uint32_t nersc_csum_tmp   = 0; | ||||||
|     uint32_t scidac_csuma_tmp; |     uint32_t scidac_csuma_tmp = 0; | ||||||
|     uint32_t scidac_csumb_tmp; |     uint32_t scidac_csumb_tmp = 0; | ||||||
|  |  | ||||||
|     GridStopWatch timer; |     GridStopWatch timer; | ||||||
|  |  | ||||||
|   | |||||||
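The MPI-IO write path above now decodes a failed MPI_File_open into readable strings before aborting instead of asserting. A self-contained sketch of that reporting step (the rank parameter is illustrative):

    #include <cstdio>
    #include <mpi.h>

    // Print both the error class and the specific error string for a failed call,
    // then abort the job.
    void report_and_abort(int ierr, int rank) {
      if (ierr == MPI_SUCCESS) return;
      char msg[MPI_MAX_ERROR_STRING];
      int len = 0, error_class = 0;
      MPI_Error_class(ierr, &error_class);
      MPI_Error_string(error_class, msg, &len);   // generic class description
      std::fprintf(stderr, "%3d: %s\n", rank, msg);
      MPI_Error_string(ierr, msg, &len);          // specific error description
      std::fprintf(stderr, "%3d: %s\n", rank, msg);
      MPI_Abort(MPI_COMM_WORLD, 1);
    }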
| @@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|   for(int i=0; i < Ls; i++){ |   for(int i=0; i < Ls; i++){ | ||||||
|     as[i] = 1.0; |     as[i] = 1.0; | ||||||
|     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code |     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code | ||||||
|     //    assert(fabs(omega[i])>0.0); |     assert(omega[i]!=Coeff_t(0.0)); | ||||||
|     bs[i] = 0.5*(bpc/omega[i] + bmc); |     bs[i] = 0.5*(bpc/omega[i] + bmc); | ||||||
|     cs[i] = 0.5*(bpc/omega[i] - bmc); |     cs[i] = 0.5*(bpc/omega[i] - bmc); | ||||||
|   } |   } | ||||||
| @@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|    |    | ||||||
|   for(int i=0;i<Ls;i++){ |   for(int i=0;i<Ls;i++){ | ||||||
|     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);      |     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);      | ||||||
|     //    assert(fabs(bee[i])>0.0); |     assert(bee[i]!=Coeff_t(0.0)); | ||||||
|     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); |     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); | ||||||
|     beo[i]=as[i]*bs[i]; |     beo[i]=as[i]*bs[i]; | ||||||
|     ceo[i]=-as[i]*cs[i]; |     ceo[i]=-as[i]*cs[i]; | ||||||
| @@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|      |      | ||||||
|     if ( i < Ls-1 ) { |     if ( i < Ls-1 ) { | ||||||
|  |  | ||||||
|  |       assert(bee[i]!=Coeff_t(0.0)); | ||||||
|  |       assert(bee[0]!=Coeff_t(0.0)); | ||||||
|  |        | ||||||
|       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column |       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column | ||||||
|        |        | ||||||
|       leem[i]=mass*cee[Ls-1]/bee[0]; |       leem[i]=mass*cee[Ls-1]/bee[0]; | ||||||
|       for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1]; |       for(int j=0;j<i;j++) { | ||||||
|  | 	assert(bee[j+1]!=Coeff_t(0.0)); | ||||||
|  | 	leem[i]*= aee[j]/bee[j+1]; | ||||||
|  |       } | ||||||
|        |        | ||||||
|       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row |       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row | ||||||
|        |        | ||||||
| @@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|   {  |   {  | ||||||
|     Coeff_t delta_d=mass*cee[Ls-1]; |     Coeff_t delta_d=mass*cee[Ls-1]; | ||||||
|     for(int j=0;j<Ls-1;j++) { |     for(int j=0;j<Ls-1;j++) { | ||||||
|       //      assert(fabs(bee[j])>0.0); |       assert(bee[j] != Coeff_t(0.0)); | ||||||
|       delta_d *= cee[j]/bee[j]; |       delta_d *= cee[j]/bee[j]; | ||||||
|     } |     } | ||||||
|     dee[Ls-1] += delta_d; |     dee[Ls-1] += delta_d; | ||||||
|   | |||||||
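The Cayley coefficient setup above promotes the previously commented-out fabs() checks to hard asserts, so a zero omega or bee coefficient is caught before it appears as a denominator. The guard amounts to this generic pattern (sketch only, not the class's actual helper):

    #include <cassert>

    // Fail loudly at coefficient setup rather than propagating NaNs through the
    // 5d LDU decomposition later.
    template <class Coeff_t>
    Coeff_t checked_ratio(const Coeff_t &num, const Coeff_t &den) {
      assert(den != Coeff_t(0.0));   // e.g. omega[i], bee[i], bee[j+1]
      return num / den;
    }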
| @@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr | |||||||
| { | { | ||||||
|   Compressor compressor; |   Compressor compressor; | ||||||
|   int LLs = in._grid->_rdimensions[0]; |   int LLs = in._grid->_rdimensions[0]; | ||||||
|   st.HaloExchange(in,compressor); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   DhopTotalTime -= usecond(); | ||||||
|  |   DhopCommTime -= usecond(); | ||||||
|  |   st.HaloExchange(in,compressor); | ||||||
|  |   DhopCommTime += usecond(); | ||||||
|  |    | ||||||
|  |   DhopComputeTime -= usecond(); | ||||||
|   // Dhop takes the 4d grid from U, and makes a 5d index for fermion |   // Dhop takes the 4d grid from U, and makes a 5d index for fermion | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { |     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
| @@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr | |||||||
| 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); | 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   DhopComputeTime += usecond(); | ||||||
|  |   DhopTotalTime   += usecond(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=1; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=1; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=2; | ||||||
|   conformable(in._grid,FermionGrid()); // verifies full grid |   conformable(in._grid,FermionGrid()); // verifies full grid | ||||||
|   conformable(in._grid,out._grid); |   conformable(in._grid,out._grid); | ||||||
|  |  | ||||||
| @@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField | |||||||
|   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); |   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::Report(void)  | ||||||
|  | { | ||||||
|  |   std::vector<int> latt = GridDefaultLatt();           | ||||||
|  |   RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; | ||||||
|  |   RealD NP = _FourDimGrid->_Nprocessors; | ||||||
|  |   RealD NN = _FourDimGrid->NodeCount(); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : "  | ||||||
|  | 	    << DhopCalls   << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : "  | ||||||
|  | 	    << DhopTotalTime   / DhopCalls << " us" << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : "  | ||||||
|  | 	    << DhopCommTime    / DhopCalls << " us" << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : "  | ||||||
|  | 	    << DhopComputeTime / DhopCalls << " us" << std::endl; | ||||||
|  |  | ||||||
|  |   // Average the compute time | ||||||
|  |   _FourDimGrid->GlobalSum(DhopComputeTime); | ||||||
|  |   DhopComputeTime/=NP; | ||||||
|  |  | ||||||
|  |   RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl; | ||||||
|  |    | ||||||
|  |   RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report(); | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report(); | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report(); | ||||||
|  | } | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)  | ||||||
|  | { | ||||||
|  |   DhopCalls       = 0; | ||||||
|  |   DhopTotalTime    = 0; | ||||||
|  |   DhopCommTime    = 0; | ||||||
|  |   DhopComputeTime = 0; | ||||||
|  |   Stencil.ZeroCounters(); | ||||||
|  |   StencilEven.ZeroCounters(); | ||||||
|  |   StencilOdd.ZeroCounters(); | ||||||
|  | } | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////// | ||||||
| // Implement the general interface. Here we use SAME mass on all slices | // Implement the general interface. Here we use SAME mass on all slices | ||||||
|   | |||||||
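Report() above converts the accumulated microsecond timers into a rate: per-site flops times local volume times call count, divided by compute time, halved because the even/odd calls each touch one checkerboard. In isolation the arithmetic is just:

    // 1154 is the per-site flop count the code assumes for the improved
    // staggered Dhop; time is in microseconds, so the result is MFlop/s.
    double dhop_mflops(double volume, double calls, double compute_time_usec) {
      const double flops_per_site = 1154.0;
      return flops_per_site * volume * calls / compute_time_usec / 2.0;
    }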
| @@ -55,6 +55,16 @@ namespace QCD { | |||||||
|       FermionField _tmp; |       FermionField _tmp; | ||||||
|       FermionField &tmp(void) { return _tmp; } |       FermionField &tmp(void) { return _tmp; } | ||||||
|  |  | ||||||
|  |       //////////////////////////////////////// | ||||||
|  |       // Performance monitoring | ||||||
|  |       //////////////////////////////////////// | ||||||
|  |       void Report(void); | ||||||
|  |       void ZeroCounters(void); | ||||||
|  |       double DhopTotalTime; | ||||||
|  |       double DhopCalls; | ||||||
|  |       double DhopCommTime; | ||||||
|  |       double DhopComputeTime; | ||||||
|  |  | ||||||
|       /////////////////////////////////////////////////////////////// |       /////////////////////////////////////////////////////////////// | ||||||
|       // Implement the abstract base |       // Implement the abstract base | ||||||
|       /////////////////////////////////////////////////////////////// |       /////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom | |||||||
| template<class vobj,class cobj> | template<class vobj,class cobj> | ||||||
| class WilsonStencil : public CartesianStencil<vobj,cobj> { | class WilsonStencil : public CartesianStencil<vobj,cobj> { | ||||||
| public: | public: | ||||||
|  |   double timer0; | ||||||
|  |   double timer1; | ||||||
|  |   double timer2; | ||||||
|  |   double timer3; | ||||||
|  |   double timer4; | ||||||
|  |   double timer5; | ||||||
|  |   double timer6; | ||||||
|  |   uint64_t callsi; | ||||||
|  |   void ZeroCountersi(void) | ||||||
|  |   { | ||||||
|  |     timer0=0; | ||||||
|  |     timer1=0; | ||||||
|  |     timer2=0; | ||||||
|  |     timer3=0; | ||||||
|  |     timer4=0; | ||||||
|  |     timer5=0; | ||||||
|  |     timer6=0; | ||||||
|  |     callsi=0; | ||||||
|  |   } | ||||||
|  |   void Reporti(int calls) | ||||||
|  |   { | ||||||
|  |     if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl; | ||||||
|  |     if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl; | ||||||
|  |     if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl; | ||||||
|  |     if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl; | ||||||
|  |     if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl; | ||||||
|  |   } | ||||||
|   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; |   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; | ||||||
|  |  | ||||||
|   std::vector<int> same_node; |   std::vector<int> same_node; | ||||||
| @@ -252,6 +278,7 @@ public: | |||||||
|     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , |     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , | ||||||
|     same_node(npoints) |     same_node(npoints) | ||||||
|   {  |   {  | ||||||
|  |     ZeroCountersi(); | ||||||
|     surface_list.resize(0); |     surface_list.resize(0); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
| @@ -261,7 +288,6 @@ public: | |||||||
|     // Here we know the distance is 1 for WilsonStencil |     // Here we know the distance is 1 for WilsonStencil | ||||||
|     for(int point=0;point<this->_npoints;point++){ |     for(int point=0;point<this->_npoints;point++){ | ||||||
|       same_node[point] = this->SameNode(point); |       same_node[point] = this->SameNode(point); | ||||||
|       //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl; |  | ||||||
|     } |     } | ||||||
|      |      | ||||||
|     for(int site = 0 ;site< vol4;site++){ |     for(int site = 0 ;site< vol4;site++){ | ||||||
| @@ -282,17 +308,28 @@ public: | |||||||
|   { |   { | ||||||
|     std::vector<std::vector<CommsRequest_t> > reqs; |     std::vector<std::vector<CommsRequest_t> > reqs; | ||||||
|     this->HaloExchangeOptGather(source,compress); |     this->HaloExchangeOptGather(source,compress); | ||||||
|     this->CommunicateBegin(reqs); |     double t1=usecond(); | ||||||
|     this->CommunicateComplete(reqs); |     // Asynchronous MPI calls multidirectional, Isend etc... | ||||||
|  |     //    this->CommunicateBegin(reqs); | ||||||
|  |     //    this->CommunicateComplete(reqs); | ||||||
|  |     // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. | ||||||
|  |     this->Communicate(); | ||||||
|  |     double t2=usecond(); timer1 += t2-t1; | ||||||
|     this->CommsMerge(compress); |     this->CommsMerge(compress); | ||||||
|  |     double t3=usecond(); timer2 += t3-t2; | ||||||
|     this->CommsMergeSHM(compress); |     this->CommsMergeSHM(compress); | ||||||
|  |     double t4=usecond(); timer3 += t4-t3; | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   template <class compressor> |   template <class compressor> | ||||||
|   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)  |   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)  | ||||||
|   { |   { | ||||||
|     this->Prepare(); |     this->Prepare(); | ||||||
|  |     double t0=usecond(); | ||||||
|     this->HaloGatherOpt(source,compress); |     this->HaloGatherOpt(source,compress); | ||||||
|  |     double t1=usecond(); | ||||||
|  |     timer0 += t1-t0; | ||||||
|  |     callsi++; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template <class compressor> |   template <class compressor> | ||||||
| @@ -304,7 +341,9 @@ public: | |||||||
|     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor; |     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor; | ||||||
|     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; |     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; | ||||||
|  |  | ||||||
|  |     this->mpi3synctime_g-=usecond(); | ||||||
|     this->_grid->StencilBarrier(); |     this->_grid->StencilBarrier(); | ||||||
|  |     this->mpi3synctime_g+=usecond(); | ||||||
|  |  | ||||||
|     assert(source._grid==this->_grid); |     assert(source._grid==this->_grid); | ||||||
|     this->halogtime-=usecond(); |     this->halogtime-=usecond(); | ||||||
| @@ -323,7 +362,6 @@ public: | |||||||
|     int dag = compress.dag; |     int dag = compress.dag; | ||||||
|     int face_idx=0; |     int face_idx=0; | ||||||
|     if ( dag ) {  |     if ( dag ) {  | ||||||
|       //	std::cout << " Optimised Dagger compress " <<std::endl; |  | ||||||
|       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); |       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); | ||||||
|       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); |       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); | ||||||
|       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); |       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); | ||||||
|   | |||||||
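
The timer0-timer6 / callsi counters added to the Wilson stencil above follow a plain accumulate-and-average pattern: zero on construction, add elapsed microseconds per phase, divide by the call count in Reporti(). A minimal standalone sketch of the same pattern, with hypothetical names (PhaseTimers, usec) and a std::chrono stand-in for usecond(), is:

    // Accumulate-and-average phase timing, as used by ZeroCountersi()/Reporti() above.
    // PhaseTimers and usec() are illustrative stand-ins, not Grid API.
    #include <chrono>
    #include <iostream>

    static double usec() {
      using namespace std::chrono;
      return (double)duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    struct PhaseTimers {
      double gather = 0, comms = 0, merge = 0;   // microseconds, summed over calls
      int    calls  = 0;
      void Report() const {
        if (!calls) return;
        std::cout << " gather " << gather / calls << " us/call\n";
        std::cout << " comms  " << comms  / calls << " us/call\n";
        std::cout << " merge  " << merge  / calls << " us/call\n";
      }
    };

    int main() {
      PhaseTimers t;
      double t0 = usec();  /* gather phase  */  double t1 = usec(); t.gather += t1 - t0;
      /* communicate phase */                   double t2 = usec(); t.comms  += t2 - t1;
      /* merge phase       */                   double t3 = usec(); t.merge  += t3 - t2;
      t.calls++;
      t.Report();
      return 0;
    }
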
| @@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu, | |||||||
|   int vol4; |   int vol4; | ||||||
|   vol4=FourDimGrid.oSites(); |   vol4=FourDimGrid.oSites(); | ||||||
|   Stencil.BuildSurfaceList(LLs,vol4); |   Stencil.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   vol4=FourDimRedBlackGrid.oSites(); |   vol4=FourDimRedBlackGrid.oSites(); | ||||||
|   StencilEven.BuildSurfaceList(LLs,vol4); |   StencilEven.BuildSurfaceList(LLs,vol4); | ||||||
|    StencilOdd.BuildSurfaceList(LLs,vol4); |    StencilOdd.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() |    //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() | ||||||
|                        <<" " << StencilEven.surface_list.size()<<std::endl; |    //                       <<" " << StencilEven.surface_list.size()<<std::endl; | ||||||
|  |  | ||||||
| } | } | ||||||
|       |       | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::Report(void) | void WilsonFermion5D<Impl>::Report(void) | ||||||
| { | { | ||||||
|     std::vector<int> latt = GridDefaultLatt();           |  | ||||||
|     RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; |  | ||||||
|   RealD NP     = _FourDimGrid->_Nprocessors; |   RealD NP     = _FourDimGrid->_Nprocessors; | ||||||
|   RealD NN     = _FourDimGrid->NodeCount(); |   RealD NN     = _FourDimGrid->NodeCount(); | ||||||
|  |   RealD volume = Ls;   | ||||||
|  |   std::vector<int> latt = _FourDimGrid->GlobalDimensions(); | ||||||
|  |   for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; | ||||||
|  |  | ||||||
|   if ( DhopCalls > 0 ) { |   if ( DhopCalls > 0 ) { | ||||||
|     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; |     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
| @@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void) | |||||||
|     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report(); |     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report(); | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report(); |     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report(); | ||||||
|   } |   } | ||||||
|  |   if ( DhopCalls > 0){ | ||||||
|  |     std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls); | ||||||
|  |     std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls); | ||||||
|  |     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls); | ||||||
|  |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) { | |||||||
|   Stencil.ZeroCounters(); |   Stencil.ZeroCounters(); | ||||||
|   StencilEven.ZeroCounters(); |   StencilEven.ZeroCounters(); | ||||||
|   StencilOdd.ZeroCounters(); |   StencilOdd.ZeroCounters(); | ||||||
|  |   Stencil.ZeroCountersi(); | ||||||
|  |   StencilEven.ZeroCountersi(); | ||||||
|  |   StencilOdd.ZeroCountersi(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
| { | { | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); |   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); | ||||||
|   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; |  | ||||||
|  |  | ||||||
|   Compressor compressor(dag); |   Compressor compressor(dag); | ||||||
|  |  | ||||||
| @@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|  |  | ||||||
|   DhopFaceTime-=usecond(); |   DhopFaceTime-=usecond(); | ||||||
|   st.HaloExchangeOptGather(in,compressor); |   st.HaloExchangeOptGather(in,compressor); | ||||||
|   DhopFaceTime+=usecond(); |   st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms | ||||||
|   std::vector<std::vector<CommsRequest_t> > reqs; |  | ||||||
|  |  | ||||||
|   // Rely on async comms; start comms before merge of local data |  | ||||||
|   DhopCommTime-=usecond(); |  | ||||||
|   st.CommunicateBegin(reqs); |  | ||||||
|  |  | ||||||
|   DhopFaceTime-=usecond(); |  | ||||||
|   st.CommsMergeSHM(compressor); |  | ||||||
|   DhopFaceTime+=usecond(); |   DhopFaceTime+=usecond(); | ||||||
|  |  | ||||||
|   // Perhaps use omp task and region |   double ctime=0; | ||||||
| #pragma omp parallel  |   double ptime=0; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Ugly explicit thread mapping introduced for OPA (Omni-Path) reasons. | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #pragma omp parallel reduction(max:ctime) reduction(max:ptime) | ||||||
|   {  |   {  | ||||||
|  |     int tid = omp_get_thread_num(); | ||||||
|     int nthreads = omp_get_num_threads(); |     int nthreads = omp_get_num_threads(); | ||||||
|     int me = omp_get_thread_num(); |     int ncomms = CartesianCommunicator::nCommThreads; | ||||||
|     int myoff, mywork; |     if (ncomms == -1) ncomms = 1; | ||||||
|  |     assert(nthreads > ncomms); | ||||||
|     GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); |     if (tid >= ncomms) { | ||||||
|     int sF = LLs * myoff; |       double start = usecond(); | ||||||
|  |       nthreads -= ncomms; | ||||||
|     if ( me == 0 ) { |       int ttid = tid - ncomms; | ||||||
|       st.CommunicateComplete(reqs); |       int n = U._grid->oSites(); | ||||||
|       DhopCommTime+=usecond(); |       int chunk = n / nthreads; | ||||||
|  |       int rem = n % nthreads; | ||||||
|  |       int myblock, myn; | ||||||
|  |       if (ttid < rem) { | ||||||
|  | 	myblock = ttid * chunk + ttid; | ||||||
|  | 	myn = chunk+1; | ||||||
|       } else { |       } else { | ||||||
|       // Interior links in stencil | 	myblock = ttid*chunk + rem; | ||||||
|       if ( me==1 ) DhopComputeTime-=usecond(); | 	myn = chunk; | ||||||
|       if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); |       } | ||||||
|       else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); |        | ||||||
|       if ( me==1 ) DhopComputeTime+=usecond(); |       // do the compute | ||||||
|  |       if (dag == DaggerYes) { | ||||||
|  | 	for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  | 	  int sU = ss; | ||||||
|  | 	  int sF = LLs * sU; | ||||||
|  | 	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); | ||||||
|  | 	} | ||||||
|  |       } else { | ||||||
|  | 	for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  | 	  int sU = ss; | ||||||
|  | 	  int sF = LLs * sU; | ||||||
|  | 	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|  | 	ptime = usecond() - start; | ||||||
|  |     } | ||||||
|  |     { | ||||||
|  |       double start = usecond(); | ||||||
|  |       st.CommunicateThreaded(); | ||||||
|  |       ctime = usecond() - start; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopCommTime += ctime; | ||||||
|  |   DhopComputeTime+=ptime; | ||||||
|  |  | ||||||
|  |   // First to enter, last to leave timing | ||||||
|  |   st.CollateThreads(); | ||||||
|  |  | ||||||
|   DhopFaceTime-=usecond(); |   DhopFaceTime-=usecond(); | ||||||
|   st.CommsMerge(compressor); |   st.CommsMerge(compressor); | ||||||
|   DhopFaceTime+=usecond(); |   DhopFaceTime+=usecond(); | ||||||
|  |  | ||||||
|   // Load imbalance alert. Should use dynamic schedule OMP for loop |  | ||||||
|   // Perhaps create a list of only those sites with face work, and  |  | ||||||
|   // load balance process the list. |  | ||||||
|   DhopComputeTime2-=usecond(); |   DhopComputeTime2-=usecond(); | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     int sz=st.surface_list.size(); |     int sz=st.surface_list.size(); | ||||||
| @@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
| #else  | #else  | ||||||
|   assert(0); |   assert(0); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | ||||||
| 					 DoubledGaugeField & U, | 					 DoubledGaugeField & U, | ||||||
|   | |||||||
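
DhopInternalOverlappedComms above pins the first nCommThreads OpenMP threads to communication and block-distributes the interior sites over the remaining threads. A stripped-down sketch of that partitioning, with hypothetical stand-ins (do_comms, site_kernel) in place of the Grid kernels, is:

    // Sketch of the compute/comms thread split used above (requires OpenMP).
    #include <omp.h>

    void overlapped(int nsites, int ncomms,
                    void (*do_comms)(int tid), void (*site_kernel)(int ss)) {
    #pragma omp parallel
      {
        int tid      = omp_get_thread_num();
        int nthreads = omp_get_num_threads();
        if (tid < ncomms) {
          do_comms(tid);                          // communication role
        } else {
          int nwork   = nthreads - ncomms;        // compute threads
          int ttid    = tid - ncomms;             // 0 .. nwork-1
          int chunk   = nsites / nwork;
          int rem     = nsites % nwork;
          int myblock = (ttid < rem) ? ttid * (chunk + 1) : ttid * chunk + rem;
          int myn     = (ttid < rem) ? chunk + 1           : chunk;
          for (int ss = myblock; ss < myblock + myn; ++ss)
            site_kernel(ss);                      // interior work for this thread
        }
      }
    }
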
| @@ -165,7 +165,7 @@ class HMCResourceManager { | |||||||
|   // Grids |   // Grids | ||||||
|   ////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|   void AddGrid(std::string s, GridModule& M) { |   void AddGrid(const std::string s, GridModule& M) { | ||||||
|     // Check for name clashes |     // Check for name clashes | ||||||
|     auto search = Grids.find(s); |     auto search = Grids.find(s); | ||||||
|     if (search != Grids.end()) { |     if (search != Grids.end()) { | ||||||
| @@ -174,14 +174,24 @@ class HMCResourceManager { | |||||||
|       exit(1); |       exit(1); | ||||||
|     } |     } | ||||||
|     Grids[s] = std::move(M); |     Grids[s] = std::move(M); | ||||||
|  |     std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << "HMCResourceManager:" << std::endl; | ||||||
|  |     std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl; | ||||||
|  |     Grids[s].show_full_decomposition(); | ||||||
|  |     std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Add a named grid set, 4d shortcut |   // Add a named grid set, 4d shortcut | ||||||
|   void AddFourDimGrid(std::string s) { |   void AddFourDimGrid(const std::string s) { | ||||||
|     GridFourDimModule<vComplex> Mod; |     GridFourDimModule<vComplex> Mod; | ||||||
|     AddGrid(s, Mod); |     AddGrid(s, Mod); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   // Add a named grid set, 4d shortcut + tweak simd lanes | ||||||
|  |   void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) { | ||||||
|  |     GridFourDimModule<vComplex> Mod(simd_decomposition); | ||||||
|  |     AddGrid(s, Mod); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   GridCartesian* GetCartesian(std::string s = "") { |   GridCartesian* GetCartesian(std::string s = "") { | ||||||
|   | |||||||
| @@ -43,10 +43,11 @@ public: | |||||||
|   std::string, lattice, |   std::string, lattice, | ||||||
|   std::string, mpi); |   std::string, mpi); | ||||||
|  |  | ||||||
|   std::vector<int> getLattice(){return strToVec<int>(lattice);} |   std::vector<int> getLattice() const {return strToVec<int>(lattice);} | ||||||
|   std::vector<int> getMpi()    {return strToVec<int>(mpi);} |   std::vector<int> getMpi()     const {return strToVec<int>(mpi);} | ||||||
|  |  | ||||||
|   void check(){ |  | ||||||
|  |   void check() const { | ||||||
|     if (getLattice().size() != getMpi().size() ) { |     if (getLattice().size() != getMpi().size() ) { | ||||||
|       std::cout << GridLogError |       std::cout << GridLogError | ||||||
|                 << "Error in GridModuleParameters: lattice and mpi dimensions " |                 << "Error in GridModuleParameters: lattice and mpi dimensions " | ||||||
| @@ -84,6 +85,8 @@ class GridModule { | |||||||
|  |  | ||||||
|   void set_full(GridCartesian* grid) { grid_.reset(grid); } |   void set_full(GridCartesian* grid) { grid_.reset(grid); } | ||||||
|   void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); } |   void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); } | ||||||
|  |   void show_full_decomposition(){ grid_->show_decomposition(); } | ||||||
|  |   void show_rb_decomposition(){ rbgrid_->show_decomposition(); } | ||||||
|  |  | ||||||
|  protected: |  protected: | ||||||
|   std::unique_ptr<GridCartesian> grid_; |   std::unique_ptr<GridCartesian> grid_; | ||||||
| @@ -96,27 +99,68 @@ class GridModule { | |||||||
| //////////////////////////////////// | //////////////////////////////////// | ||||||
| // Note: the space time grid should be out of the QCD namespace | // Note: the space time grid should be out of the QCD namespace | ||||||
| template <class vector_type> | template <class vector_type> | ||||||
| class GridFourDimModule : public GridModule { | class GridFourDimModule : public GridModule | ||||||
|  | { | ||||||
| public: | public: | ||||||
|   GridFourDimModule() { |   GridFourDimModule() | ||||||
|  |   { | ||||||
|     using namespace QCD; |     using namespace QCD; | ||||||
|     set_full(SpaceTimeGrid::makeFourDimGrid( |     set_full(SpaceTimeGrid::makeFourDimGrid( | ||||||
|         GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()), |         GridDefaultLatt(),  | ||||||
|  |         GridDefaultSimd(4, vector_type::Nsimd()), | ||||||
|         GridDefaultMpi())); |         GridDefaultMpi())); | ||||||
|     set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); |     set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   GridFourDimModule(GridModuleParameters Params) { |   GridFourDimModule(const std::vector<int> tweak_simd) | ||||||
|  |   { | ||||||
|  |     using namespace QCD; | ||||||
|  |     if (tweak_simd.size() != 4) | ||||||
|  |     { | ||||||
|  |       std::cout << GridLogError | ||||||
|  |                 << "Error in GridFourDimModule: SIMD size different from 4"  | ||||||
|  |                 << std::endl; | ||||||
|  |       exit(1); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Check that the product of the requested lanes matches Nsimd() | ||||||
|  |     int simd_product = 1; | ||||||
|  |     for (auto &n : tweak_simd) | ||||||
|  |       simd_product *= n; | ||||||
|  |     std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_product << std::endl; | ||||||
|  |  | ||||||
|  |     if (simd_product == vector_type::Nsimd()) | ||||||
|  |     { | ||||||
|  |       set_full(SpaceTimeGrid::makeFourDimGrid( | ||||||
|  |           GridDefaultLatt(),  | ||||||
|  |           tweak_simd,  | ||||||
|  |           GridDefaultMpi())); | ||||||
|  |       set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |       std::cout << GridLogError  | ||||||
|  |                 << "Error in GridFourDimModule: SIMD lanes must sum to "  | ||||||
|  |                 << vector_type::Nsimd()  | ||||||
|  |                 << std::endl; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   GridFourDimModule(const GridModuleParameters Params) | ||||||
|  |   { | ||||||
|     using namespace QCD; |     using namespace QCD; | ||||||
|     Params.check(); |  | ||||||
|     std::vector<int> lattice_v = Params.getLattice(); |     std::vector<int> lattice_v = Params.getLattice(); | ||||||
|     std::vector<int> mpi_v = Params.getMpi(); |     std::vector<int> mpi_v = Params.getMpi(); | ||||||
|     if (lattice_v.size() == 4) { |     if (lattice_v.size() == 4) | ||||||
|  |     { | ||||||
|       set_full(SpaceTimeGrid::makeFourDimGrid( |       set_full(SpaceTimeGrid::makeFourDimGrid( | ||||||
|           lattice_v, GridDefaultSimd(4, vector_type::Nsimd()), |           lattice_v,  | ||||||
|  |           GridDefaultSimd(4, vector_type::Nsimd()), | ||||||
|           mpi_v)); |           mpi_v)); | ||||||
|       set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); |       set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); | ||||||
|     } else { |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|       std::cout << GridLogError |       std::cout << GridLogError | ||||||
|                 << "Error in GridFourDimModule: lattice dimension different from 4" |                 << "Error in GridFourDimModule: lattice dimension different from 4" | ||||||
|                 << std::endl; |                 << std::endl; | ||||||
|   | |||||||
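
The GridFourDimModule constructor above accepts a per-dimension SIMD layout only when the product of the four entries equals vector_type::Nsimd(). A minimal standalone check of that invariant (illustrative helper name, outside the class) is:

    #include <functional>
    #include <numeric>
    #include <vector>

    // Validate a requested 4d SIMD decomposition against the vector lane count.
    bool valid_simd_layout(const std::vector<int>& lanes, int nsimd) {
      if (lanes.size() != 4) return false;   // one entry per space-time dimension
      int product = std::accumulate(lanes.begin(), lanes.end(), 1, std::multiplies<int>());
      return product == nsimd;               // lanes must tile the SIMD vector exactly
    }
    // e.g. valid_simd_layout({2,2,1,1}, 4) == true for a 4-lane vComplex.
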
| @@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters> | |||||||
|   typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase; |   typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase; | ||||||
|   using ObsBase::ObsBase; // for constructors |   using ObsBase::ObsBase; // for constructors | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   // acquire resource |   // acquire resource | ||||||
|   virtual void initialize(){ |   virtual void initialize(){ | ||||||
|     this->ObservablePtr.reset(new PlaquetteLogger<Impl>()); |     this->ObservablePtr.reset(new PlaquetteLogger<Impl>()); | ||||||
| @@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters> | |||||||
|   PlaquetteMod(): ObsBase(NoParameters()){} |   PlaquetteMod(): ObsBase(NoParameters()){} | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| template < class Impl > | template < class Impl > | ||||||
| class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{ | class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{ | ||||||
|   typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase; |   typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase; | ||||||
|   using ObsBase::ObsBase; // for constructors |   using ObsBase::ObsBase; // for constructors | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   // acquire resource |   // acquire resource | ||||||
|   virtual void initialize(){ |   virtual void initialize(){ | ||||||
|     this->ObservablePtr.reset(new TopologicalCharge<Impl>()); |     this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_)); | ||||||
|   } |   } | ||||||
|   public: |   public: | ||||||
|   TopologicalChargeMod(): ObsBase(NoParameters()){} |   TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){} | ||||||
|  |   TopologicalChargeMod(): ObsBase(){} | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| }// QCD temporarily here | }// QCD temporarily here | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -33,9 +33,45 @@ directory | |||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|  | struct TopologySmearingParameters : Serializable { | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters, | ||||||
|  |     int, steps, | ||||||
|  |     float, step_size, | ||||||
|  |     int, meas_interval, | ||||||
|  |     float, maxTau); | ||||||
|  |  | ||||||
|  |     TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f): | ||||||
|  |         steps(s), step_size(ss), meas_interval(mi), maxTau(mT){} | ||||||
|  |  | ||||||
|  |     template < class ReaderClass > | ||||||
|  |     TopologySmearingParameters(Reader<ReaderClass>& Reader){ | ||||||
|  |         read(Reader, "Smearing", *this);   | ||||||
|  |     }   | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | struct TopologyObsParameters : Serializable { | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters, | ||||||
|  |       int, interval, | ||||||
|  |       bool, do_smearing, | ||||||
|  |       TopologySmearingParameters, Smearing);   | ||||||
|  |  | ||||||
|  |     TopologyObsParameters(int interval = 1, bool smearing = false): | ||||||
|  |         interval(interval), do_smearing(smearing){} | ||||||
|  |  | ||||||
|  |     template <class ReaderClass > | ||||||
|  |       TopologyObsParameters(Reader<ReaderClass>& Reader){ | ||||||
|  |         read(Reader, "TopologyMeasurement", *this); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| // this is only defined for a gauge theory | // this is only defined for a gauge theory | ||||||
| template <class Impl> | template <class Impl> | ||||||
| class TopologicalCharge : public HmcObservable<typename Impl::Field> { | class TopologicalCharge : public HmcObservable<typename Impl::Field> { | ||||||
|  |     TopologyObsParameters Pars; | ||||||
|  |  | ||||||
|  public: |  public: | ||||||
|     // here forces the Impl to be of gauge fields |     // here forces the Impl to be of gauge fields | ||||||
|     // if not the compiler will complain |     // if not the compiler will complain | ||||||
| @@ -44,21 +80,40 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> { | |||||||
|     // necessary for HmcObservable compatibility |     // necessary for HmcObservable compatibility | ||||||
|     typedef typename Impl::Field Field; |     typedef typename Impl::Field Field; | ||||||
|  |  | ||||||
|  |     TopologicalCharge(int interval = 1, bool do_smearing = false): | ||||||
|  |         Pars(interval, do_smearing){} | ||||||
|  |      | ||||||
|  |     TopologicalCharge(TopologyObsParameters P):Pars(P){ | ||||||
|  |         std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     void TrajectoryComplete(int traj, |     void TrajectoryComplete(int traj, | ||||||
|                             Field &U, |                             Field &U, | ||||||
|                             GridSerialRNG &sRNG, |                             GridSerialRNG &sRNG, | ||||||
|                             GridParallelRNG &pRNG) { |                             GridParallelRNG &pRNG) { | ||||||
|  |  | ||||||
|     Real q = WilsonLoops<Impl>::TopologicalCharge(U); |     if (traj%Pars.interval == 0){ | ||||||
|  |         // Smearing | ||||||
|  |         Field Usmear = U; | ||||||
|         int def_prec = std::cout.precision(); |         int def_prec = std::cout.precision(); | ||||||
|          |          | ||||||
|  |         if (Pars.do_smearing){ | ||||||
|  |             // using wilson flow by default here | ||||||
|  |             WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); | ||||||
|  |             WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); | ||||||
|  |             Real T0   = WF.energyDensityPlaquette(Usmear); | ||||||
|  |             std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1) | ||||||
|  |                       << "T0                : [ " << traj << " ] "<< T0 << std::endl; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear); | ||||||
|         std::cout << GridLogMessage |         std::cout << GridLogMessage | ||||||
|             << std::setprecision(std::numeric_limits<Real>::digits10 + 1) |             << std::setprecision(std::numeric_limits<Real>::digits10 + 1) | ||||||
|             << "Topological Charge: [ " << traj << " ] "<< q << std::endl; |             << "Topological Charge: [ " << traj << " ] "<< q << std::endl; | ||||||
|  |  | ||||||
|         std::cout.precision(def_prec); |         std::cout.precision(def_prec); | ||||||
|         } |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
| }; | }; | ||||||
| } | } | ||||||
|   | |||||||
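
The observable above measures only every Pars.interval trajectories and, when do_smearing is set, Wilson-flows a copy of the gauge field before evaluating the charge. The control flow, reduced to stand-in types (Params, Field, smear, topological_charge are illustrative, not Grid API), is:

    // Control-flow sketch of the smeared topological charge measurement above.
    struct Params { int interval = 1; bool do_smearing = false; };

    template <class Field, class SmearFn, class ChargeFn>
    void trajectory_complete(int traj, const Field& U, const Params& p,
                             SmearFn smear, ChargeFn topological_charge) {
      if (traj % p.interval != 0) return;   // measure only every interval-th trajectory
      Field Usmear = U;                     // never smear the evolved field in place
      if (p.do_smearing) smear(Usmear);     // e.g. adaptive Wilson flow up to maxTau
      double q = topological_charge(Usmear);
      (void)q;                              // would be logged with the trajectory index
    }
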
| @@ -26,8 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| //#include <Grid/Grid.h> | //#include <Grid/Grid.h> | ||||||
|  |  | ||||||
| using namespace Grid; | #ifndef GRID_QCD_GAUGE_FIX_H | ||||||
| using namespace Grid::QCD; | #define GRID_QCD_GAUGE_FIX_H | ||||||
|  | namespace Grid { | ||||||
|  | namespace QCD { | ||||||
|  |  | ||||||
| template <class Gimpl>  | template <class Gimpl>  | ||||||
| class FourierAcceleratedGaugeFixer  : public Gimpl { | class FourierAcceleratedGaugeFixer  : public Gimpl { | ||||||
| @@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl { | |||||||
|   }   |   }   | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | } | ||||||
|  | #endif | ||||||
|   | |||||||
| @@ -82,11 +82,11 @@ namespace Optimization { | |||||||
|       double tmp[2]={a,b}; |       double tmp[2]={a,b}; | ||||||
|       return vld1q_f64(tmp); |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     //Real double // N:tbc |     //Real double | ||||||
|     inline float64x2_t operator()(double a){ |     inline float64x2_t operator()(double a){ | ||||||
|       return vdupq_n_f64(a); |       return vdupq_n_f64(a); | ||||||
|     } |     } | ||||||
|     //Integer // N:tbc |     //Integer | ||||||
|     inline uint32x4_t operator()(Integer a){ |     inline uint32x4_t operator()(Integer a){ | ||||||
|       return vdupq_n_u32(a); |       return vdupq_n_u32(a); | ||||||
|     } |     } | ||||||
| @@ -124,33 +124,32 @@ namespace Optimization { | |||||||
|   // Nils: Vset untested; not used currently in Grid at all; |   // Nils: Vset untested; not used currently in Grid at all; | ||||||
|   // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b |   // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b | ||||||
|   struct Vset{ |   struct Vset{ | ||||||
|     // Complex float // N:ok |     // Complex float | ||||||
|     inline float32x4_t operator()(Grid::ComplexF *a){ |     inline float32x4_t operator()(Grid::ComplexF *a){ | ||||||
|       float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; |       float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; | ||||||
|       return vld1q_f32(tmp); |       return vld1q_f32(tmp); | ||||||
|     } |     } | ||||||
|     // Complex double // N:ok |     // Complex double | ||||||
|     inline float64x2_t operator()(Grid::ComplexD *a){ |     inline float64x2_t operator()(Grid::ComplexD *a){ | ||||||
|       double tmp[2]={a[0].imag(),a[0].real()}; |       double tmp[2]={a[0].imag(),a[0].real()}; | ||||||
|       return vld1q_f64(tmp); |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     // Real float // N:ok |     // Real float | ||||||
|     inline float32x4_t operator()(float *a){ |     inline float32x4_t operator()(float *a){ | ||||||
|       float tmp[4]={a[3],a[2],a[1],a[0]}; |       float tmp[4]={a[3],a[2],a[1],a[0]}; | ||||||
|       return vld1q_f32(tmp); |       return vld1q_f32(tmp); | ||||||
|     } |     } | ||||||
|     // Real double // N:ok |     // Real double | ||||||
|     inline float64x2_t operator()(double *a){ |     inline float64x2_t operator()(double *a){ | ||||||
|       double tmp[2]={a[1],a[0]}; |       double tmp[2]={a[1],a[0]}; | ||||||
|       return vld1q_f64(tmp); |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     // Integer // N:ok |     // Integer | ||||||
|     inline uint32x4_t operator()(Integer *a){ |     inline uint32x4_t operator()(Integer *a){ | ||||||
|       return vld1q_dup_u32(a); |       return vld1q_dup_u32(a); | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   // N:leaving as is |  | ||||||
|   template <typename Out_type, typename In_type> |   template <typename Out_type, typename In_type> | ||||||
|   struct Reduce{ |   struct Reduce{ | ||||||
|     //Need templated class to overload output type |     //Need templated class to overload output type | ||||||
| @@ -421,11 +420,6 @@ namespace Optimization { | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| // working, but no restriction on n |  | ||||||
| //    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; |  | ||||||
| //    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; |  | ||||||
|  |  | ||||||
| // restriction on n |  | ||||||
|     template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; |     template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; | ||||||
|     template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; |     template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; | ||||||
|  |  | ||||||
| @@ -547,7 +541,7 @@ namespace Optimization { | |||||||
|  |  | ||||||
|  |  | ||||||
|   //Complex double Reduce |   //Complex double Reduce | ||||||
|   template<> // N:by Boyle |   template<> | ||||||
|   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ |   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ | ||||||
|     u128d conv; conv.v = in; |     u128d conv; conv.v = in; | ||||||
|     return Grid::ComplexD(conv.f[0],conv.f[1]); |     return Grid::ComplexD(conv.f[0],conv.f[1]); | ||||||
| @@ -562,9 +556,7 @@ namespace Optimization { | |||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){ |   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){ | ||||||
|     // FIXME unimplemented |     return vaddvq_u32(in); | ||||||
|     printf("Reduce : Missing integer implementation -> FIX\n"); |  | ||||||
|     assert(0); |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -604,3 +596,4 @@ namespace Optimization { | |||||||
|   typedef Optimization::TimesI      TimesISIMD; |   typedef Optimization::TimesI      TimesISIMD; | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
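
The integer Reduce above now uses the AArch64 horizontal add vaddvq_u32 instead of asserting. A minimal standalone check of that intrinsic (compiles on AArch64 only) is:

    #include <arm_neon.h>
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t lanes[4] = {1, 2, 3, 4};
      uint32x4_t v = vld1q_u32(lanes);      // load four 32-bit lanes
      assert(vaddvq_u32(v) == 10u);         // horizontal add across the vector
      return 0;
    }
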
| @@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   // Timing info; ugly; possibly temporary |   // Timing info; ugly; possibly temporary | ||||||
|   ///////////////////////////////////////// |   ///////////////////////////////////////// | ||||||
|   double commtime; |   double commtime; | ||||||
|  |   double mpi3synctime; | ||||||
|  |   double mpi3synctime_g; | ||||||
|  |   double shmmergetime; | ||||||
|   double gathertime; |   double gathertime; | ||||||
|   double gathermtime; |   double gathermtime; | ||||||
|   double halogtime; |   double halogtime; | ||||||
| @@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   double splicetime; |   double splicetime; | ||||||
|   double nosplicetime; |   double nosplicetime; | ||||||
|   double calls; |   double calls; | ||||||
|  |   std::vector<double> comm_bytes_thr; | ||||||
|  |   std::vector<double> comm_time_thr; | ||||||
|  |   std::vector<double> comm_enter_thr; | ||||||
|  |   std::vector<double> comm_leave_thr; | ||||||
|  |  | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Stencil query |   // Stencil query | ||||||
| @@ -248,6 +255,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   // Comms packet queue for asynch thread |   // Comms packet queue for asynch thread | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|  |   void CommunicateThreaded() | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     // must be called in parallel region | ||||||
|  |     int mythread = omp_get_thread_num(); | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  | #else | ||||||
|  |     int mythread = 0; | ||||||
|  |     int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |     if (nthreads == -1) nthreads = 1; | ||||||
|  |     if (mythread < nthreads) { | ||||||
|  |       comm_enter_thr[mythread] = usecond(); | ||||||
|  |       for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 						      Packets[i].to_rank, | ||||||
|  | 						      Packets[i].recv_buf, | ||||||
|  | 						      Packets[i].from_rank, | ||||||
|  | 						      Packets[i].bytes,i); | ||||||
|  | 	comm_bytes_thr[mythread] += bytes; | ||||||
|  |       } | ||||||
|  |       comm_leave_thr[mythread]= usecond(); | ||||||
|  |       comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   void CollateThreads(void) | ||||||
|  |   { | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  |     double first=0.0; | ||||||
|  |     double last =0.0; | ||||||
|  |  | ||||||
|  |     for(int t=0;t<nthreads;t++) { | ||||||
|  |  | ||||||
|  |       double t0 = comm_enter_thr[t]; | ||||||
|  |       double t1 = comm_leave_thr[t]; | ||||||
|  |       comms_bytes+=comm_bytes_thr[t]; | ||||||
|  |  | ||||||
|  |       comm_enter_thr[t] = 0.0; | ||||||
|  |       comm_leave_thr[t] = 0.0; | ||||||
|  |       comm_time_thr[t]   = 0.0; | ||||||
|  |       comm_bytes_thr[t]=0; | ||||||
|  |  | ||||||
|  |       if ( first == 0.0 ) first = t0;                   // first is t0 | ||||||
|  |       if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen | ||||||
|  |  | ||||||
|  |       if ( t1 > last ) last = t1;                       // max time seen | ||||||
|  |        | ||||||
|  |     } | ||||||
|  |     commtime+= last-first; | ||||||
|  |   } | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     reqs.resize(Packets.size()); |     reqs.resize(Packets.size()); | ||||||
| @@ -258,25 +316,59 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 						     Packets[i].to_rank, | 						     Packets[i].to_rank, | ||||||
| 						     Packets[i].recv_buf, | 						     Packets[i].recv_buf, | ||||||
| 						     Packets[i].from_rank, | 						     Packets[i].from_rank, | ||||||
| 					  Packets[i].bytes); | 						     Packets[i].bytes,i); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       _grid->StencilSendToRecvFromComplete(reqs[i]); |       _grid->StencilSendToRecvFromComplete(reqs[i],i); | ||||||
|     } |     } | ||||||
|     commtime+=usecond(); |     commtime+=usecond(); | ||||||
|   } |   } | ||||||
|  |   void Communicate(void) | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp parallel  | ||||||
|  |     { | ||||||
|  |       // must be called in parallel region | ||||||
|  |       int mythread  = omp_get_thread_num(); | ||||||
|  |       int maxthreads= omp_get_max_threads(); | ||||||
|  |       int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  |       assert(nthreads <= maxthreads); | ||||||
|  |  | ||||||
|  |       if (nthreads == -1) nthreads = 1; | ||||||
|  | #else | ||||||
|  |       int mythread = 0; | ||||||
|  |       int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |       if (mythread < nthreads) { | ||||||
|  | 	for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	  double start = usecond(); | ||||||
|  | 	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 								   Packets[i].to_rank, | ||||||
|  | 								   Packets[i].recv_buf, | ||||||
|  | 								   Packets[i].from_rank, | ||||||
|  | 								   Packets[i].bytes,i); | ||||||
|  | 	  comm_time_thr[mythread] += usecond() - start; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|    |    | ||||||
|   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)  |   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)  | ||||||
|   { |   { | ||||||
|     std::vector<std::vector<CommsRequest_t> > reqs; |     std::vector<std::vector<CommsRequest_t> > reqs; | ||||||
|     Prepare(); |     Prepare(); | ||||||
|     HaloGather(source,compress); |     HaloGather(source,compress); | ||||||
|     CommunicateBegin(reqs); |     // Concurrent | ||||||
|     CommunicateComplete(reqs); |     //CommunicateBegin(reqs); | ||||||
|  |     //CommunicateComplete(reqs); | ||||||
|  |     // Sequential, possibly threaded | ||||||
|  |     Communicate(); | ||||||
|     CommsMergeSHM(compress);  |     CommsMergeSHM(compress);  | ||||||
|     CommsMerge(compress);  |     CommsMerge(compress);  | ||||||
|   } |   } | ||||||
| @@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|  |     mpi3synctime_g-=usecond(); | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |     mpi3synctime_g+=usecond(); | ||||||
|  |  | ||||||
|     // conformable(source._grid,_grid); |     // conformable(source._grid,_grid); | ||||||
|     assert(source._grid==_grid); |     assert(source._grid==_grid); | ||||||
| @@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|     CommsMerge(decompress,Mergers,Decompressions);  |     CommsMerge(decompress,Mergers,Decompressions);  | ||||||
|   } |   } | ||||||
|   template<class decompressor>  void CommsMergeSHM(decompressor decompress) { |   template<class decompressor>  void CommsMergeSHM(decompressor decompress) { | ||||||
|  |     mpi3synctime-=usecond();     | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |     mpi3synctime+=usecond();     | ||||||
|  |     shmmergetime-=usecond();     | ||||||
|     CommsMerge(decompress,MergersSHM,DecompressionsSHM); |     CommsMerge(decompress,MergersSHM,DecompressionsSHM); | ||||||
|  |     shmmergetime+=usecond();     | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class decompressor> |   template<class decompressor> | ||||||
| @@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 		  int checkerboard, | 		  int checkerboard, | ||||||
| 		  const std::vector<int> &directions, | 		  const std::vector<int> &directions, | ||||||
| 		  const std::vector<int> &distances)  | 		  const std::vector<int> &distances)  | ||||||
|    :   _permute_type(npoints), _comm_buf_size(npoints) |    : _permute_type(npoints),  | ||||||
|  |     _comm_buf_size(npoints), | ||||||
|  |     comm_bytes_thr(npoints),  | ||||||
|  |     comm_enter_thr(npoints), | ||||||
|  |     comm_leave_thr(npoints),  | ||||||
|  |     comm_time_thr(npoints) | ||||||
|   { |   { | ||||||
|     face_table_computed=0; |     face_table_computed=0; | ||||||
|     _npoints = npoints; |     _npoints = npoints; | ||||||
| @@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   void ZeroCounters(void) { |   void ZeroCounters(void) { | ||||||
|     gathertime = 0.; |     gathertime = 0.; | ||||||
|     commtime = 0.; |     commtime = 0.; | ||||||
|  |     mpi3synctime=0.; | ||||||
|  |     mpi3synctime_g=0.; | ||||||
|  |     shmmergetime=0.; | ||||||
|  |     for(int i=0;i<_npoints;i++){ | ||||||
|  |       comm_time_thr[i]=0; | ||||||
|  |       comm_bytes_thr[i]=0; | ||||||
|  |       comm_enter_thr[i]=0; | ||||||
|  |       comm_leave_thr[i]=0; | ||||||
|  |     } | ||||||
|     halogtime = 0.; |     halogtime = 0.; | ||||||
|     mergetime = 0.; |     mergetime = 0.; | ||||||
|     decompresstime = 0.; |     decompresstime = 0.; | ||||||
| @@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; | #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; | ||||||
|     RealD NP = _grid->_Nprocessors; |     RealD NP = _grid->_Nprocessors; | ||||||
|     RealD NN = _grid->NodeCount(); |     RealD NN = _grid->NodeCount(); | ||||||
|  |     double t = 0; | ||||||
|  |     // if comm_time_thr is set they were all done in parallel so take the max | ||||||
|  |     // but add up the bytes | ||||||
|  |     int threaded = 0 ; | ||||||
|  |     for (int i = 0; i < _npoints; ++i) { | ||||||
|  |       if ( comm_time_thr[i]>0.0 ) { | ||||||
|  | 	threaded = 1; | ||||||
|  | 	comms_bytes += comm_bytes_thr[i]; | ||||||
|  | 	if (t < comm_time_thr[i]) t = comm_time_thr[i]; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (threaded) commtime += t; | ||||||
|      |      | ||||||
|     _grid->GlobalSum(commtime);    commtime/=NP; |     _grid->GlobalSum(commtime);    commtime/=NP; | ||||||
|     if ( calls > 0. ) { |     if ( calls > 0. ) { | ||||||
| @@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; | 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; | ||||||
| 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; | 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; | ||||||
|       } |       } | ||||||
|  |       PRINTIT(mpi3synctime); | ||||||
|  |       PRINTIT(mpi3synctime_g); | ||||||
|  |       PRINTIT(shmmergetime); | ||||||
|       PRINTIT(splicetime); |       PRINTIT(splicetime); | ||||||
|       PRINTIT(nosplicetime); |       PRINTIT(nosplicetime); | ||||||
|     } |     } | ||||||
|   | |||||||
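
CommunicateThreaded above assigns packet i to communication thread i modulo nCommThreads, and CollateThreads charges commtime with the span from the earliest thread entry stamp to the latest exit stamp. The same first-in/last-out accounting in plain form (collate_span is an illustrative name, timestamps in microseconds):

    #include <vector>

    // Communication window = earliest thread entry .. latest thread exit.
    double collate_span(const std::vector<double>& enter,
                        const std::vector<double>& leave) {
      double first = 0.0, last = 0.0;
      for (std::size_t t = 0; t < enter.size(); ++t) {
        double t0 = enter[t], t1 = leave[t];
        if (first == 0.0) first = t0;              // seed with the first stamp seen
        if (t0 > 0.0 && t0 < first) first = t0;    // earliest non-zero entry
        if (t1 > last) last = t1;                  // latest exit
      }
      return last - first;                         // contribution to commtime
    }
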
| @@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N> | |||||||
| strong_inline void mult(iVector<rtype,N> * __restrict__ ret, | strong_inline void mult(iVector<rtype,N> * __restrict__ ret, | ||||||
|                  const iVector<vtype,N> * __restrict__ rhs, |                  const iVector<vtype,N> * __restrict__ rhs, | ||||||
|                  const iScalar<mtype> * __restrict__ lhs){ |                  const iScalar<mtype> * __restrict__ lhs){ | ||||||
|     mult(ret,lhs,rhs); |     for(int c1=0;c1<N;c1++){ | ||||||
|  |         mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal); | ||||||
|  |     }                  | ||||||
| } | } | ||||||
|      |      | ||||||
|  |  | ||||||
|   | |||||||
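
The corrected overload above multiplies each vector component by the scalar explicitly instead of re-dispatching with swapped argument types. The intended semantics, sketched with plain std::array stand-ins rather than Grid tensor types, are:

    #include <array>

    // Component-wise scalar * vector, mirroring the iScalar/iVector fix above.
    template <class T, int N>
    std::array<T, N> scalar_times_vector(const std::array<T, N>& rhs, const T& s) {
      std::array<T, N> ret{};
      for (int c1 = 0; c1 < N; ++c1) ret[c1] = rhs[c1] * s;
      return ret;
    }
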
| @@ -219,9 +219,15 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     int MB; |     int MB; | ||||||
|     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); |     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); | ||||||
|     GridCmdOptionInt(arg,MB); |     GridCmdOptionInt(arg,MB); | ||||||
|     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; |     uint64_t MB64 = MB; | ||||||
|  |     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){ | ||||||
|  |     CartesianCommunicator::Hugepages = 1; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ | ||||||
|     Grid_debug_handler_init(); |     Grid_debug_handler_init(); | ||||||
|   } |   } | ||||||
| @@ -304,6 +310,7 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl; |     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl; |     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl; |     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl; |     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
| @@ -356,10 +363,15 @@ void Grid_init(int *argc,char ***argv) | |||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){ | ||||||
|     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ | ||||||
|     LebesgueOrder::UseLebesgueOrder=1; |     LebesgueOrder::UseLebesgueOrder=1; | ||||||
|   } |   } | ||||||
|  |   CartesianCommunicator::nCommThreads = -1; | ||||||
|  |   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ | ||||||
|  |     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); | ||||||
|  |     GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); | ||||||
|  |   } | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ | ||||||
|     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); |     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); | ||||||
|     GridCmdOptionIntVector(arg,LebesgueOrder::Block); |     GridCmdOptionIntVector(arg,LebesgueOrder::Block); | ||||||
| @@ -374,10 +386,13 @@ void Grid_init(int *argc,char ***argv) | |||||||
| 		  Grid_default_latt, | 		  Grid_default_latt, | ||||||
| 		  Grid_default_mpi); | 		  Grid_default_mpi); | ||||||
|  |  | ||||||
|   std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl; |   std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl; | ||||||
|  |   if ( CartesianCommunicator::Hugepages) { | ||||||
|  |     std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){ | ||||||
|     std::cout<<GridLogMessage<<"Grid Decomposition\n"; |     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"; | ||||||
|     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl; |     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl; |     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl; |     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl; | ||||||
| @@ -393,7 +408,7 @@ void Grid_init(int *argc,char ***argv) | |||||||
|  |  | ||||||
| void Grid_finalize(void) | void Grid_finalize(void) | ||||||
| { | { | ||||||
| #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) | ||||||
|   MPI_Finalize(); |   MPI_Finalize(); | ||||||
|   Grid_unquiesce_nodes(); |   Grid_unquiesce_nodes(); | ||||||
| #endif | #endif | ||||||
|   | |||||||
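
The new options above are parsed in Grid_init. An illustrative invocation and where the parsed values land (binary name and numbers are examples only; the static members are those shown in the diff):

    // Example (hypothetical command line):
    //   mpirun -n 4 ./Benchmark_dwf --shm 2048 --shm-hugepages --comms-threads 2
    #include <Grid/Grid.h>

    int main(int argc, char** argv) {
      Grid::Grid_init(&argc, &argv);   // parses --shm, --shm-hugepages, --comms-threads
      // After init:
      //   CartesianCommunicator::MAX_MPI_SHM_BYTES  = 2048*1024*1024 bytes
      //   CartesianCommunicator::Hugepages          = 1 (MAP_HUGETLB requested)
      //   CartesianCommunicator::nCommThreads       = 2 (-1 would mean "unset")
      Grid::Grid_finalize();
      return 0;
    }
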
| @@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/Grid.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Grid::QCD; | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   std::vector<int> seeds({1,2,3,4}); |   std::vector<int> seeds({1,2,3,4}); | ||||||
| @@ -82,6 +85,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   Uorg = Uorg - Umu; |   Uorg = Uorg - Umu; | ||||||
|   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; |   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; | ||||||
|  |   std::cout << " Norm "<< norm2(Umu) << std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout<< "*****************************************************************" <<std::endl; |   std::cout<< "*****************************************************************" <<std::endl; | ||||||
|   | |||||||
| @@ -40,12 +40,6 @@ namespace Grid{ | |||||||
| 				    double, StoppingCondition, | 				    double, StoppingCondition, | ||||||
| 				    int, MaxCGIterations, | 				    int, MaxCGIterations, | ||||||
| 				    bool, ApplySmearing); | 				    bool, ApplySmearing); | ||||||
|  |  | ||||||
|     //template <class ReaderClass > |  | ||||||
|     //FermionParameters(Reader<ReaderClass>& Reader){ |  | ||||||
|     //  read(Reader, "Mobius", *this); |  | ||||||
|     //} |  | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|    |    | ||||||
| @@ -113,9 +107,17 @@ int main(int argc, char **argv) { | |||||||
|   bool ApplySmearing = MyParams.Mobius.ApplySmearing; |   bool ApplySmearing = MyParams.Mobius.ApplySmearing; | ||||||
|    |    | ||||||
|    |    | ||||||
|  |   // Use this if you want to tweak the default decomposition | ||||||
|  |   // commented out as it is very architecture specific | ||||||
|  |    | ||||||
|  |   //std::vector<int> simd_lanes({2,2,1,1}); | ||||||
|  |  | ||||||
|  |   // Grid from the command line arguments --grid and --mpi | ||||||
|  |   // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes | ||||||
|  |    | ||||||
|  |   //TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes | ||||||
|  |   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition | ||||||
|    |    | ||||||
|   // Grid from the command line |  | ||||||
|   TheHMC.Resources.AddFourDimGrid("gauge"); |  | ||||||
|   // Possible to create the module by hand  |   // Possible to create the module by hand  | ||||||
|   // hardcoding parameters or using a Reader |   // hardcoding parameters or using a Reader | ||||||
|  |  | ||||||
|   | |||||||
| @@ -66,7 +66,14 @@ int main(int argc, char **argv) { | |||||||
|   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; |   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; | ||||||
|   typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs; |   typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs; | ||||||
|   TheHMC.Resources.AddObservable<PlaqObs>(); |   TheHMC.Resources.AddObservable<PlaqObs>(); | ||||||
|   TheHMC.Resources.AddObservable<QObs>(); |   TopologyObsParameters TopParams; | ||||||
|  |   TopParams.interval = 5; | ||||||
|  |   TopParams.do_smearing = true; | ||||||
|  |   TopParams.Smearing.steps = 200; | ||||||
|  |   TopParams.Smearing.step_size = 0.01; | ||||||
|  |   TopParams.Smearing.meas_interval = 50; | ||||||
|  |   TopParams.Smearing.maxTau = 2.0;  | ||||||
|  |   TheHMC.Resources.AddObservable<QObs>(TopParams); | ||||||
|   ////////////////////////////////////////////// |   ////////////////////////////////////////////// | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -99,21 +99,27 @@ int main (int argc, char ** argv) | |||||||
|   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl; |   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl; | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|   result=zero; |   result=zero; | ||||||
|  |   Ds.ZeroCounters(); | ||||||
|   CG(HermOp,src,result); |   CG(HermOp,src,result); | ||||||
|  |   Ds.Report(); | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|   std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl; |   std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl; | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|   result=zero; |   result=zero; | ||||||
|  |   Ds.ZeroCounters(); | ||||||
|   mCG(HermOp,src,result); |   mCG(HermOp,src,result); | ||||||
|  |   Ds.Report(); | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl; |   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl; | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|   result=zero; |   result=zero; | ||||||
|  |   Ds.ZeroCounters(); | ||||||
|   BCGrQ(HermOp,src,result); |   BCGrQ(HermOp,src,result); | ||||||
|  |   Ds.Report(); | ||||||
|   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; |   std::cout << GridLogMessage << "************************************************************************ "<<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||