mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-26 01:29:34 +00:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
		
							
								
								
									
										68
									
								
								.travis.yml
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								.travis.yml
									
									
									
									
									
								
							| @@ -9,68 +9,6 @@ matrix: | |||||||
|     - os:        osx |     - os:        osx | ||||||
|       osx_image: xcode8.3 |       osx_image: xcode8.3 | ||||||
|       compiler: clang |       compiler: clang | ||||||
|     - compiler: gcc |  | ||||||
|       dist: trusty |  | ||||||
|       sudo: required |  | ||||||
|       addons: |  | ||||||
|         apt: |  | ||||||
|           sources: |  | ||||||
|             - ubuntu-toolchain-r-test |  | ||||||
|           packages: |  | ||||||
|             - g++-4.9 |  | ||||||
|             - libmpfr-dev |  | ||||||
|             - libgmp-dev |  | ||||||
|             - libmpc-dev |  | ||||||
|             - libopenmpi-dev |  | ||||||
|             - openmpi-bin |  | ||||||
|             - binutils-dev |  | ||||||
|       env: VERSION=-4.9 |  | ||||||
|     - compiler: gcc |  | ||||||
|       dist: trusty |  | ||||||
|       sudo: required |  | ||||||
|       addons: |  | ||||||
|         apt: |  | ||||||
|           sources: |  | ||||||
|             - ubuntu-toolchain-r-test |  | ||||||
|           packages: |  | ||||||
|             - g++-5 |  | ||||||
|             - libmpfr-dev |  | ||||||
|             - libgmp-dev |  | ||||||
|             - libmpc-dev |  | ||||||
|             - libopenmpi-dev |  | ||||||
|             - openmpi-bin |  | ||||||
|             - binutils-dev |  | ||||||
|       env: VERSION=-5 |  | ||||||
|     - compiler: clang |  | ||||||
|       dist: trusty |  | ||||||
|       addons: |  | ||||||
|         apt: |  | ||||||
|           sources: |  | ||||||
|             - ubuntu-toolchain-r-test |  | ||||||
|           packages: |  | ||||||
|             - g++-4.8 |  | ||||||
|             - libmpfr-dev |  | ||||||
|             - libgmp-dev |  | ||||||
|             - libmpc-dev |  | ||||||
|             - libopenmpi-dev |  | ||||||
|             - openmpi-bin |  | ||||||
|             - binutils-dev |  | ||||||
|       env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz |  | ||||||
|     - compiler: clang |  | ||||||
|       dist: trusty |  | ||||||
|       addons: |  | ||||||
|         apt: |  | ||||||
|           sources: |  | ||||||
|             - ubuntu-toolchain-r-test |  | ||||||
|           packages: |  | ||||||
|             - g++-4.8 |  | ||||||
|             - libmpfr-dev |  | ||||||
|             - libgmp-dev |  | ||||||
|             - libmpc-dev |  | ||||||
|             - libopenmpi-dev |  | ||||||
|             - openmpi-bin |  | ||||||
|             - binutils-dev |  | ||||||
|       env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz |  | ||||||
|        |        | ||||||
| before_install: | before_install: | ||||||
|     - export GRIDDIR=`pwd` |     - export GRIDDIR=`pwd` | ||||||
| @@ -106,9 +44,3 @@ script: | |||||||
|     - make -j4 |     - make -j4 | ||||||
|     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals |     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals | ||||||
|     - make check |     - make check | ||||||
|     - echo make clean |  | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi |  | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi |  | ||||||
|     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										16
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								README.md
									
									
									
									
									
								
							| @@ -1,18 +1,4 @@ | |||||||
| # Grid | # Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [](https://travis-ci.org/paboyle/Grid) | ||||||
| <table> |  | ||||||
| <tr> |  | ||||||
|     <td>Last stable release</td> |  | ||||||
|     <td><a href="https://travis-ci.org/paboyle/Grid"> |  | ||||||
|     <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a> |  | ||||||
|     </td> |  | ||||||
| </tr> |  | ||||||
| <tr> |  | ||||||
|     <td>Development branch</td> |  | ||||||
|     <td><a href="https://travis-ci.org/paboyle/Grid"> |  | ||||||
|     <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a> |  | ||||||
|     </td> |  | ||||||
| </tr> |  | ||||||
| </table> |  | ||||||
|  |  | ||||||
| **Data parallel C++ mathematical object library.** | **Data parallel C++ mathematical object library.** | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										4
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								TODO
									
									
									
									
									
								
							| @@ -6,6 +6,7 @@ Large item work list: | |||||||
| 1)- BG/Q port and check | 1)- BG/Q port and check | ||||||
| 2)- Christoph's local basis expansion Lanczos | 2)- Christoph's local basis expansion Lanczos | ||||||
| 3)- Precision conversion and sort out localConvert      <-- partial | 3)- Precision conversion and sort out localConvert      <-- partial | ||||||
|  |  | ||||||
|   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet |   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet | ||||||
| 4)- Physical propagator interface | 4)- Physical propagator interface | ||||||
| 5)- Conserved currents | 5)- Conserved currents | ||||||
| @@ -13,7 +14,8 @@ Large item work list: | |||||||
| 7)- HDCR resume | 7)- HDCR resume | ||||||
|  |  | ||||||
| Recent DONE  | Recent DONE  | ||||||
| -- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O |  | ||||||
|  | -- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE | ||||||
| -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE | -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE | ||||||
| -- GaugeFix into central location                      <-- DONE | -- GaugeFix into central location                      <-- DONE | ||||||
| -- Scidac and Ildg metadata handling                   <-- DONE | -- Scidac and Ildg metadata handling                   <-- DONE | ||||||
|   | |||||||
| @@ -32,6 +32,19 @@ using namespace std; | |||||||
| using namespace Grid; | using namespace Grid; | ||||||
| using namespace Grid::QCD; | using namespace Grid::QCD; | ||||||
|  |  | ||||||
|  | typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR; | ||||||
|  | typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF; | ||||||
|  | typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | std::vector<int> L_list; | ||||||
|  | std::vector<int> Ls_list; | ||||||
|  | std::vector<double> mflop_list; | ||||||
|  |  | ||||||
|  | double mflop_ref; | ||||||
|  | double mflop_ref_err; | ||||||
|  |  | ||||||
|  | int NN_global; | ||||||
|  |  | ||||||
| struct time_statistics{ | struct time_statistics{ | ||||||
|   double mean; |   double mean; | ||||||
| @@ -95,13 +108,15 @@ public: | |||||||
|  |  | ||||||
|   static void Comms(void) |   static void Comms(void) | ||||||
|   { |   { | ||||||
|     int Nloop=100; |     int Nloop=200; | ||||||
|     int nmu=0; |     int nmu=0; | ||||||
|     int maxlat=32; |     int maxlat=32; | ||||||
|  |  | ||||||
|     std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); |     std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); | ||||||
|     std::vector<int> mpi_layout  = GridDefaultMpi(); |     std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |  | ||||||
|  |     for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; | ||||||
|  |  | ||||||
|     std::vector<double> t_time(Nloop); |     std::vector<double> t_time(Nloop); | ||||||
|     time_statistics timestat; |     time_statistics timestat; | ||||||
|  |  | ||||||
| @@ -133,13 +148,14 @@ public: | |||||||
| 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	int ncomm; |  | ||||||
| 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  | 	int ncomm; | ||||||
| 	double dbytes; | 	double dbytes; | ||||||
|  | 	std::vector<double> times(Nloop); | ||||||
| 	for(int i=0;i<Nloop;i++){ | 	for(int i=0;i<Nloop;i++){ | ||||||
|  |  | ||||||
| 	  double start=usecond(); | 	  double start=usecond(); | ||||||
|  |  | ||||||
| 	  std::vector<CartesianCommunicator::CommsRequest_t> requests; |  | ||||||
| 	  dbytes=0; | 	  dbytes=0; | ||||||
| 	  ncomm=0; | 	  ncomm=0; | ||||||
|  |  | ||||||
| @@ -150,7 +166,6 @@ public: | |||||||
|  |  | ||||||
| 	    if (mpi_layout[mu]>1 ) { | 	    if (mpi_layout[mu]>1 ) { | ||||||
| 	         | 	         | ||||||
| 	      ncomm++; |  | ||||||
| 	      int xmit_to_rank; | 	      int xmit_to_rank; | ||||||
| 	      int recv_from_rank; | 	      int recv_from_rank; | ||||||
| 	      if ( dir == mu ) {  | 	      if ( dir == mu ) {  | ||||||
| @@ -160,18 +175,18 @@ public: | |||||||
| 		int comm_proc = mpi_layout[mu]-1; | 		int comm_proc = mpi_layout[mu]-1; | ||||||
| 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
| 	      } | 	      } | ||||||
| #if 0 | 	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
| 	      tbytes= Grid.StencilSendToRecvFromBegin(requests, | 						 (void *)&rbuf[dir][0], recv_from_rank, | ||||||
| 						      (void *)&xbuf[dir][0], |  | ||||||
| 						      xmit_to_rank, |  | ||||||
| 						      (void *)&rbuf[dir][0], |  | ||||||
| 						      recv_from_rank, |  | ||||||
| 						 bytes,dir); | 						 bytes,dir); | ||||||
| 	      Grid.StencilSendToRecvFromComplete(requests,dir); |  | ||||||
| #endif |  | ||||||
| 	      requests.resize(0); |  | ||||||
| 	   | 	   | ||||||
|  | #ifdef GRID_OMP | ||||||
| #pragma omp atomic | #pragma omp atomic | ||||||
|  | #endif | ||||||
|  | 	      ncomm++; | ||||||
|  |  | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp atomic | ||||||
|  | #endif | ||||||
| 	      dbytes+=tbytes; | 	      dbytes+=tbytes; | ||||||
| 	    } | 	    } | ||||||
| 	  } | 	  } | ||||||
| @@ -181,13 +196,15 @@ public: | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	timestat.statistics(t_time); | 	timestat.statistics(t_time); | ||||||
|  | 	//	for(int i=0;i<t_time.size();i++){ | ||||||
|  | 	//	  std::cout << i<<" "<<t_time[i]<<std::endl; | ||||||
|  | 	//	} | ||||||
|  |  | ||||||
| 	dbytes=dbytes*ppn; | 	dbytes=dbytes*ppn; | ||||||
| 	double xbytes    = dbytes*0.5; | 	double xbytes    = dbytes*0.5; | ||||||
| 	double rbytes    = dbytes*0.5; | 	double rbytes    = dbytes*0.5; | ||||||
| 	double bidibytes = dbytes; | 	double bidibytes = dbytes; | ||||||
|  |  | ||||||
|  |  | ||||||
| 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" | 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" | ||||||
| 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) | 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) | ||||||
| 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " | 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " | ||||||
| @@ -196,6 +213,7 @@ public: | |||||||
| 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; | 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; | ||||||
|  |  | ||||||
|   |   | ||||||
|  | 	 | ||||||
| 	    } | 	    } | ||||||
|     }     |     }     | ||||||
|  |  | ||||||
| @@ -218,7 +236,7 @@ public: | |||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|    |    | ||||||
|   uint64_t lmax=48; |   uint64_t lmax=48; | ||||||
| #define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat) | #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat) | ||||||
|  |  | ||||||
|     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); |     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|     for(int lat=8;lat<=lmax;lat+=4){ |     for(int lat=8;lat<=lmax;lat+=4){ | ||||||
| @@ -253,8 +271,7 @@ public: | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   static double DWF5(int Ls,int L) | ||||||
|   static void DWF(int Ls,int L) |  | ||||||
|   { |   { | ||||||
|     RealD mass=0.1; |     RealD mass=0.1; | ||||||
|     RealD M5  =1.8; |     RealD M5  =1.8; | ||||||
| @@ -262,6 +279,7 @@ public: | |||||||
|     double mflops; |     double mflops; | ||||||
|     double mflops_best = 0; |     double mflops_best = 0; | ||||||
|     double mflops_worst= 0; |     double mflops_worst= 0; | ||||||
|  |     std::vector<double> mflops_all; | ||||||
|  |  | ||||||
|     /////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////// | ||||||
|     // Set/Get the layout & grid size |     // Set/Get the layout & grid size | ||||||
| @@ -274,6 +292,189 @@ public: | |||||||
| 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|     uint64_t NP = TmpGrid->RankCount(); |     uint64_t NP = TmpGrid->RankCount(); | ||||||
|     uint64_t NN = TmpGrid->NodeCount(); |     uint64_t NN = TmpGrid->NodeCount(); | ||||||
|  |     NN_global=NN; | ||||||
|  |     uint64_t SHM=NP/NN; | ||||||
|  |  | ||||||
|  |     std::vector<int> internal; | ||||||
|  |     if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1}); | ||||||
|  |     else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1}); | ||||||
|  |     else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1}); | ||||||
|  |     else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1}); | ||||||
|  |     else assert(0); | ||||||
|  |  | ||||||
|  |     std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); | ||||||
|  |     std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); | ||||||
|  |  | ||||||
|  |     ///////// Welcome message //////////// | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     ///////// Lattice Init //////////// | ||||||
|  |     GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |     GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |     GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi()); | ||||||
|  |     GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); | ||||||
|  |     GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); | ||||||
|  |     GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |     ///////// RNG Init //////////// | ||||||
|  |     std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |     std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |     GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |     GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||||
|  |  | ||||||
|  |     ///////// Source preparation //////////// | ||||||
|  |     LatticeFermion src   (sFGrid); random(RNG5,src); | ||||||
|  |     LatticeFermion tmp   (sFGrid); | ||||||
|  |  | ||||||
|  |     RealD N2 = 1.0/::sqrt(norm2(src)); | ||||||
|  |     src = src*N2; | ||||||
|  |      | ||||||
|  |     LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);  | ||||||
|  |  | ||||||
|  |     WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); | ||||||
|  |     LatticeFermion src_e (sFrbGrid); | ||||||
|  |     LatticeFermion src_o (sFrbGrid); | ||||||
|  |     LatticeFermion r_e   (sFrbGrid); | ||||||
|  |     LatticeFermion r_o   (sFrbGrid); | ||||||
|  |     LatticeFermion r_eo  (sFGrid); | ||||||
|  |     LatticeFermion err   (sFGrid); | ||||||
|  |     { | ||||||
|  |  | ||||||
|  |       pickCheckerboard(Even,src_e,src); | ||||||
|  |       pickCheckerboard(Odd,src_o,src); | ||||||
|  |  | ||||||
|  | #if defined(AVX512)  | ||||||
|  |       const int num_cases = 6; | ||||||
|  |       std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); | ||||||
|  | #else | ||||||
|  |       const int num_cases = 4; | ||||||
|  |       std::string fmt("U/S ; U/O ; G/S ; G/O "); | ||||||
|  | #endif | ||||||
|  |       controls Cases [] = { | ||||||
|  | #ifdef AVX512 | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | #endif | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  } | ||||||
|  |       };  | ||||||
|  |  | ||||||
|  |       for(int c=0;c<num_cases;c++) { | ||||||
|  |  | ||||||
|  | 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; | ||||||
|  | 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt; | ||||||
|  | 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||||
|  | 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||||
|  | 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
|  | 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  | 	int nwarm = 100; | ||||||
|  | 	double t0=usecond(); | ||||||
|  | 	sFGrid->Barrier(); | ||||||
|  | 	for(int i=0;i<nwarm;i++){ | ||||||
|  | 	  sDw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	} | ||||||
|  | 	sFGrid->Barrier(); | ||||||
|  | 	double t1=usecond(); | ||||||
|  | 	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); | ||||||
|  | 	//	if (ncall < 500) ncall = 500; | ||||||
|  | 	uint64_t ncall = 500; | ||||||
|  |  | ||||||
|  | 	sFGrid->Broadcast(0,&ncall,sizeof(ncall)); | ||||||
|  |  | ||||||
|  | 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; | ||||||
|  | 	sDw.ZeroCounters(); | ||||||
|  |  | ||||||
|  | 	time_statistics timestat; | ||||||
|  | 	std::vector<double> t_time(ncall); | ||||||
|  | 	for(uint64_t i=0;i<ncall;i++){ | ||||||
|  | 	  t0=usecond(); | ||||||
|  | 	  sDw.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  | 	  t1=usecond(); | ||||||
|  | 	  t_time[i] = t1-t0; | ||||||
|  | 	} | ||||||
|  | 	sFGrid->Barrier(); | ||||||
|  | 	 | ||||||
|  | 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|  | 	double flops=(1344.0*volume)/2; | ||||||
|  | 	double mf_hi, mf_lo, mf_err; | ||||||
|  |  | ||||||
|  | 	timestat.statistics(t_time); | ||||||
|  | 	mf_hi = flops/timestat.min; | ||||||
|  | 	mf_lo = flops/timestat.max; | ||||||
|  | 	mf_err= flops/timestat.min * timestat.err/timestat.mean; | ||||||
|  |  | ||||||
|  | 	mflops = flops/timestat.mean; | ||||||
|  | 	mflops_all.push_back(mflops); | ||||||
|  | 	if ( mflops_best == 0   ) mflops_best = mflops; | ||||||
|  | 	if ( mflops_worst== 0   ) mflops_worst= mflops; | ||||||
|  | 	if ( mflops>mflops_best ) mflops_best = mflops; | ||||||
|  | 	if ( mflops<mflops_worst) mflops_worst= mflops; | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl; | ||||||
|  |  | ||||||
|  | 	sDw.Report(); | ||||||
|  |  | ||||||
|  |       } | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage <<fmt << std::endl; | ||||||
|  |       std::cout<<GridLogMessage ; | ||||||
|  |  | ||||||
|  |       for(int i=0;i<mflops_all.size();i++){ | ||||||
|  | 	std::cout<<mflops_all[i]/NN<<" ; " ; | ||||||
|  |       } | ||||||
|  |       std::cout<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     return mflops_best; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   static double DWF(int Ls,int L) | ||||||
|  |   { | ||||||
|  |     RealD mass=0.1; | ||||||
|  |     RealD M5  =1.8; | ||||||
|  |  | ||||||
|  |     double mflops; | ||||||
|  |     double mflops_best = 0; | ||||||
|  |     double mflops_worst= 0; | ||||||
|  |     std::vector<double> mflops_all; | ||||||
|  |  | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     // Set/Get the layout & grid size | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     int threads = GridThread::GetThreads(); | ||||||
|  |     std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4); | ||||||
|  |     std::vector<int> local({L,L,L,L}); | ||||||
|  |  | ||||||
|  |     GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),  | ||||||
|  | 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|  |     uint64_t NP = TmpGrid->RankCount(); | ||||||
|  |     uint64_t NN = TmpGrid->NodeCount(); | ||||||
|  |     NN_global=NN; | ||||||
|     uint64_t SHM=NP/NN; |     uint64_t SHM=NP/NN; | ||||||
|  |  | ||||||
|     std::vector<int> internal; |     std::vector<int> internal; | ||||||
| @@ -364,13 +565,15 @@ public: | |||||||
|  |  | ||||||
| #if defined(AVX512)  | #if defined(AVX512)  | ||||||
|       const int num_cases = 6; |       const int num_cases = 6; | ||||||
|  |       std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); | ||||||
| #else | #else | ||||||
|       const int num_cases = 4; |       const int num_cases = 4; | ||||||
|  |       std::string fmt("U/S ; U/O ; G/S ; G/O "); | ||||||
| #endif | #endif | ||||||
|       controls Cases [] = { |       controls Cases [] = { | ||||||
| #if defined(AVX512)  | #ifdef AVX512 | ||||||
| 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, |  | ||||||
| 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
|  | 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
| #endif | #endif | ||||||
| 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
| 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }, | ||||||
| @@ -380,6 +583,10 @@ public: | |||||||
|  |  | ||||||
|       for(int c=0;c<num_cases;c++) { |       for(int c=0;c<num_cases;c++) { | ||||||
|  |  | ||||||
|  | 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; | ||||||
|  | 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt; | ||||||
|  | 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); | ||||||
|  |  | ||||||
| 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
| 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
| @@ -390,11 +597,7 @@ public: | |||||||
| 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||||
| 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  | 	int nwarm = 200; | ||||||
| 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; |  | ||||||
| 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt; |  | ||||||
| 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); |  | ||||||
| 	int nwarm = 10; |  | ||||||
| 	double t0=usecond(); | 	double t0=usecond(); | ||||||
| 	FGrid->Barrier(); | 	FGrid->Barrier(); | ||||||
| 	for(int i=0;i<nwarm;i++){ | 	for(int i=0;i<nwarm;i++){ | ||||||
| @@ -402,7 +605,10 @@ public: | |||||||
| 	} | 	} | ||||||
| 	FGrid->Barrier(); | 	FGrid->Barrier(); | ||||||
| 	double t1=usecond(); | 	double t1=usecond(); | ||||||
| 	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); | 	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); | ||||||
|  | 	//	if (ncall < 500) ncall = 500; | ||||||
|  | 	uint64_t ncall = 1000; | ||||||
|  |  | ||||||
| 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); | 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); | ||||||
|  |  | ||||||
| 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; | 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; | ||||||
| @@ -428,7 +634,7 @@ public: | |||||||
| 	mf_err= flops/timestat.min * timestat.err/timestat.mean; | 	mf_err= flops/timestat.min * timestat.err/timestat.mean; | ||||||
|  |  | ||||||
| 	mflops = flops/timestat.mean; | 	mflops = flops/timestat.mean; | ||||||
|  | 	mflops_all.push_back(mflops); | ||||||
| 	if ( mflops_best == 0   ) mflops_best = mflops; | 	if ( mflops_best == 0   ) mflops_best = mflops; | ||||||
| 	if ( mflops_worst== 0   ) mflops_worst= mflops; | 	if ( mflops_worst== 0   ) mflops_worst= mflops; | ||||||
| 	if ( mflops>mflops_best ) mflops_best = mflops; | 	if ( mflops>mflops_best ) mflops_best = mflops; | ||||||
| @@ -450,12 +656,20 @@ public: | |||||||
|  |  | ||||||
|       } |       } | ||||||
|       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl; |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; | ||||||
|       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl; |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; | ||||||
|       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl; |       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage <<fmt << std::endl; | ||||||
|  |       std::cout<<GridLogMessage ; | ||||||
|  |  | ||||||
|  |       for(int i=0;i<mflops_all.size();i++){ | ||||||
|  | 	std::cout<<mflops_all[i]/NN<<" ; " ; | ||||||
|  |       } | ||||||
|  |       std::cout<<std::endl; | ||||||
|       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |     return mflops_best; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| }; | }; | ||||||
| @@ -465,8 +679,11 @@ int main (int argc, char ** argv) | |||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); | ||||||
|  | #ifdef KNL | ||||||
|  |   LebesgueOrder::Block = std::vector<int>({8,2,2,2}); | ||||||
|  | #else | ||||||
|   LebesgueOrder::Block = std::vector<int>({2,2,2,2}); |   LebesgueOrder::Block = std::vector<int>({2,2,2,2}); | ||||||
|  | #endif | ||||||
|   Benchmark::Decomposition(); |   Benchmark::Decomposition(); | ||||||
|  |  | ||||||
|   int do_memory=1; |   int do_memory=1; | ||||||
| @@ -493,26 +710,66 @@ int main (int argc, char ** argv) | |||||||
|     // empty for now |     // empty for now | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   int sel=2; | ||||||
|  |   std::vector<int> L_list({8,12,16,24}); | ||||||
|  |   std::vector<double> wilson; | ||||||
|  |   std::vector<double> dwf4; | ||||||
|  |   std::vector<double> dwf5; | ||||||
|  |  | ||||||
|   if ( do_wilson ) { |   if ( do_wilson ) { | ||||||
|     int Ls=1; |     int Ls=1; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl; |     std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     Benchmark::DWF(Ls,16); |     for(int l=0;l<L_list.size();l++){ | ||||||
|     Benchmark::DWF(Ls,24); |       wilson.push_back(Benchmark::DWF(1,L_list[l])); | ||||||
|     Benchmark::DWF(Ls,32); |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( do_dwf ) { |  | ||||||
|   int Ls=16; |   int Ls=16; | ||||||
|  |   if ( do_dwf ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; |     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     Benchmark::DWF(Ls,8); |     for(int l=0;l<L_list.size();l++){ | ||||||
|     Benchmark::DWF(Ls,12); |       dwf4.push_back(Benchmark::DWF(Ls,L_list[l])); | ||||||
|     Benchmark::DWF(Ls,16); |  | ||||||
|     Benchmark::DWF(Ls,24); |  | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( do_dwf ) { | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       dwf5.push_back(Benchmark::DWF5(Ls,L_list[l])); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl; | ||||||
|  |   for(int l=0;l<L_list.size();l++){ | ||||||
|  |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl; | ||||||
|  |   } | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   int NN=NN_global; | ||||||
|  |   std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl; | ||||||
|  |   for(int l=0;l<L_list.size();l++){ | ||||||
|  |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl; | ||||||
|  |   } | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Comparison point result: "  << dwf4[sel]/NN <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -68,7 +68,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   int Nloop=100; |   int Nloop=100; | ||||||
|   int nmu=0; |   int nmu=0; | ||||||
|   int maxlat=24; |   int maxlat=32; | ||||||
|   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; |   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; |   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; | ||||||
| @@ -80,7 +80,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   header(); |   header(); | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -92,11 +92,16 @@ int main (int argc, char ** argv) | |||||||
|       RealD Nnode = Grid.NodeCount(); |       RealD Nnode = Grid.NodeCount(); | ||||||
|       RealD ppn = Nrank/Nnode; |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	 | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > rbuf(8); | ||||||
|  |  | ||||||
|       int ncomm; |       int ncomm; | ||||||
|       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  |       for(int mu=0;mu<8;mu++){ | ||||||
|  | 	xbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	rbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl; | ||||||
|  |       } | ||||||
|  |  | ||||||
|       for(int i=0;i<Nloop;i++){ |       for(int i=0;i<Nloop;i++){ | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
| @@ -112,7 +117,6 @@ int main (int argc, char ** argv) | |||||||
| 	    int comm_proc=1; | 	    int comm_proc=1; | ||||||
| 	    int xmit_to_rank; | 	    int xmit_to_rank; | ||||||
| 	    int recv_from_rank; | 	    int recv_from_rank; | ||||||
| 	     |  | ||||||
| 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
| 	    Grid.SendToRecvFromBegin(requests, | 	    Grid.SendToRecvFromBegin(requests, | ||||||
| 				   (void *)&xbuf[mu][0], | 				   (void *)&xbuf[mu][0], | ||||||
| @@ -163,7 +167,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat,lat,lat,lat}); |       std::vector<int> latt_size  ({lat,lat,lat,lat}); | ||||||
|  |  | ||||||
| @@ -172,9 +176,14 @@ int main (int argc, char ** argv) | |||||||
|       RealD Nnode = Grid.NodeCount(); |       RealD Nnode = Grid.NodeCount(); | ||||||
|       RealD ppn = Nrank/Nnode; |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > xbuf(8); | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > rbuf(8); | ||||||
|  |  | ||||||
|  |       for(int mu=0;mu<8;mu++){ | ||||||
|  | 	xbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	rbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl; | ||||||
|  |       } | ||||||
|  |  | ||||||
|       int ncomm; |       int ncomm; | ||||||
|       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
| @@ -249,7 +258,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -299,7 +308,7 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu][0], | 					      (void *)&rbuf[mu][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu); | ||||||
| 	 | 	 | ||||||
| 	    comm_proc = mpi_layout[mu]-1; | 	    comm_proc = mpi_layout[mu]-1; | ||||||
| 	   | 	   | ||||||
| @@ -310,11 +319,11 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu+4][0], | 					      (void *)&rbuf[mu+4][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu+4); | ||||||
| 	   | 	   | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
| 	Grid.StencilSendToRecvFromComplete(requests); | 	Grid.StencilSendToRecvFromComplete(requests,0); | ||||||
| 	Grid.Barrier(); | 	Grid.Barrier(); | ||||||
| 	double stop=usecond(); | 	double stop=usecond(); | ||||||
| 	t_time[i] = stop-start; // microseconds | 	t_time[i] = stop-start; // microseconds | ||||||
| @@ -346,7 +355,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -393,8 +402,8 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu][0], | 					      (void *)&rbuf[mu][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu); | ||||||
| 	    Grid.StencilSendToRecvFromComplete(requests); | 	    Grid.StencilSendToRecvFromComplete(requests,mu); | ||||||
| 	    requests.resize(0); | 	    requests.resize(0); | ||||||
|  |  | ||||||
| 	    comm_proc = mpi_layout[mu]-1; | 	    comm_proc = mpi_layout[mu]-1; | ||||||
| @@ -406,8 +415,8 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu+4][0], | 					      (void *)&rbuf[mu+4][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu+4); | ||||||
| 	    Grid.StencilSendToRecvFromComplete(requests); | 	    Grid.StencilSendToRecvFromComplete(requests,mu+4); | ||||||
| 	    requests.resize(0); | 	    requests.resize(0); | ||||||
| 	   | 	   | ||||||
| 	  } | 	  } | ||||||
| @@ -436,5 +445,97 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|   }     |   }     | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   header(); | ||||||
|  |  | ||||||
|  |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|  |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|  |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|  |       				    lat*mpi_layout[1], | ||||||
|  |       				    lat*mpi_layout[2], | ||||||
|  |       				    lat*mpi_layout[3]}); | ||||||
|  |  | ||||||
|  |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |       RealD Nrank = Grid._Nprocessors; | ||||||
|  |       RealD Nnode = Grid.NodeCount(); | ||||||
|  |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|  |       std::vector<HalfSpinColourVectorD *> xbuf(8); | ||||||
|  |       std::vector<HalfSpinColourVectorD *> rbuf(8); | ||||||
|  |       Grid.ShmBufferFreeAll(); | ||||||
|  |       for(int d=0;d<8;d++){ | ||||||
|  | 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       int ncomm; | ||||||
|  |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  |       double dbytes; | ||||||
|  |       for(int i=0;i<Nloop;i++){ | ||||||
|  | 	double start=usecond(); | ||||||
|  |  | ||||||
|  | 	std::vector<CartesianCommunicator::CommsRequest_t> requests; | ||||||
|  | 	dbytes=0; | ||||||
|  | 	ncomm=0; | ||||||
|  |  | ||||||
|  | 	parallel_for(int dir=0;dir<8;dir++){ | ||||||
|  |  | ||||||
|  | 	  double tbytes; | ||||||
|  | 	  int mu =dir % 4; | ||||||
|  |  | ||||||
|  | 	  if (mpi_layout[mu]>1 ) { | ||||||
|  | 	   | ||||||
|  | 	    ncomm++; | ||||||
|  | 	    int xmit_to_rank; | ||||||
|  | 	    int recv_from_rank; | ||||||
|  | 	    if ( dir == mu ) {  | ||||||
|  | 	      int comm_proc=1; | ||||||
|  | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	    } else {  | ||||||
|  | 	      int comm_proc = mpi_layout[mu]-1; | ||||||
|  | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	    } | ||||||
|  |  | ||||||
|  | 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
|  | 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir); | ||||||
|  |  | ||||||
|  | #pragma omp atomic | ||||||
|  | 	    dbytes+=tbytes; | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  | 	Grid.Barrier(); | ||||||
|  | 	double stop=usecond(); | ||||||
|  | 	t_time[i] = stop-start; // microseconds | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       timestat.statistics(t_time); | ||||||
|  |  | ||||||
|  |       dbytes=dbytes*ppn; | ||||||
|  |       double xbytes    = dbytes*0.5; | ||||||
|  |       double rbytes    = dbytes*0.5; | ||||||
|  |       double bidibytes = dbytes; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" | ||||||
|  |                <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) | ||||||
|  |                <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " | ||||||
|  |                <<xbytes/timestat.max <<" "<< xbytes/timestat.min   | ||||||
|  |                << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " | ||||||
|  |                << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; | ||||||
|  |   | ||||||
|  |     } | ||||||
|  |   }     | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -503,9 +503,9 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; |   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; |   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||||
|  |  | ||||||
|   //assert(norm2(src_e)<1.0e-4); |   assert(norm2(src_e)<1.0e-4); | ||||||
|   //assert(norm2(src_o)<1.0e-4); |   assert(norm2(src_o)<1.0e-4); | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|  |   exit(0); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										10
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -342,14 +342,14 @@ case ${ac_COMMS} in | |||||||
|         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) |         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) | ||||||
|         comms_type='none' |         comms_type='none' | ||||||
|      ;; |      ;; | ||||||
|      mpi3l*) |  | ||||||
|        AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] ) |  | ||||||
|        comms_type='mpi3l' |  | ||||||
|      ;; |  | ||||||
|      mpi3*) |      mpi3*) | ||||||
|         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) |         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) | ||||||
|         comms_type='mpi3' |         comms_type='mpi3' | ||||||
|      ;; |      ;; | ||||||
|  |      mpit) | ||||||
|  |         AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] ) | ||||||
|  |         comms_type='mpit' | ||||||
|  |      ;; | ||||||
|      mpi*) |      mpi*) | ||||||
|         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) |         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) | ||||||
|         comms_type='mpi' |         comms_type='mpi' | ||||||
| @@ -377,7 +377,7 @@ esac | |||||||
| AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ]) | AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ]) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ]) | AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ]) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] ) | AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] ) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] ) | AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] ) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | ||||||
|  |  | ||||||
| ############### RNG selection | ############### RNG selection | ||||||
|   | |||||||
| @@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3 | |||||||
|   extra_sources+=communicator/Communicator_base.cc |   extra_sources+=communicator/Communicator_base.cc | ||||||
| endif | endif | ||||||
|  |  | ||||||
| if BUILD_COMMS_MPI3L | if BUILD_COMMS_MPIT | ||||||
|   extra_sources+=communicator/Communicator_mpi3_leader.cc |   extra_sources+=communicator/Communicator_mpit.cc | ||||||
|   extra_sources+=communicator/Communicator_base.cc |   extra_sources+=communicator/Communicator_base.cc | ||||||
| endif | endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,7 +1,5 @@ | |||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
|  | #include <fcntl.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) { | |||||||
|   return NULL; |   return NULL; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | void check_huge_pages(void *Buf,uint64_t BYTES) | ||||||
|  | { | ||||||
|  | #ifdef __linux__ | ||||||
|  |   int fd = open("/proc/self/pagemap", O_RDONLY); | ||||||
|  |   assert(fd >= 0); | ||||||
|  |   const int page_size = 4096; | ||||||
|  |   uint64_t virt_pfn = (uint64_t)Buf / page_size; | ||||||
|  |   off_t offset = sizeof(uint64_t) * virt_pfn; | ||||||
|  |   uint64_t npages = (BYTES + page_size-1) / page_size; | ||||||
|  |   uint64_t pagedata[npages]; | ||||||
|  |   uint64_t ret = lseek(fd, offset, SEEK_SET); | ||||||
|  |   assert(ret == offset); | ||||||
|  |   ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); | ||||||
|  |   assert(ret == sizeof(uint64_t) * npages); | ||||||
|  |   int nhugepages = npages / 512; | ||||||
|  |   int n4ktotal, nnothuge; | ||||||
|  |   n4ktotal = 0; | ||||||
|  |   nnothuge = 0; | ||||||
|  |   for (int i = 0; i < nhugepages; ++i) { | ||||||
|  |     uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; | ||||||
|  |     for (int j = 0; j < 512; ++j) { | ||||||
|  |       uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; | ||||||
|  |       ++n4ktotal; | ||||||
|  |       if (pageaddr != baseaddr + j * page_size) | ||||||
|  | 	++nnothuge; | ||||||
|  |       } | ||||||
|  |   } | ||||||
|  |   int rank = CartesianCommunicator::RankWorld(); | ||||||
|  |   printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -64,6 +64,8 @@ namespace Grid { | |||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   void check_huge_pages(void *Buf,uint64_t BYTES); | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////// | ||||||
| // A lattice of something, but assume the something is SIMDized. | // A lattice of something, but assume the something is SIMDized. | ||||||
| //////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////// | ||||||
| @@ -92,12 +94,20 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|  |  | ||||||
|     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); |     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); | ||||||
|  |     //    if ( ptr != NULL )  | ||||||
|  |     //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl; | ||||||
|  |  | ||||||
|  |     ////////////////// | ||||||
|  |     // Hack 2MB align; could make option probably doesn't need configurability | ||||||
|  |     ////////////////// | ||||||
|  | //define GRID_ALLOC_ALIGN (128) | ||||||
|  | #define GRID_ALLOC_ALIGN (2*1024*1024) | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); |     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); | ||||||
| #else | #else | ||||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); |     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); | ||||||
| #endif | #endif | ||||||
|  |     //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl; | ||||||
|     // First touch optimise in threaded loop |     // First touch optimise in threaded loop | ||||||
|     uint8_t *cp = (uint8_t *)ptr; |     uint8_t *cp = (uint8_t *)ptr; | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
| @@ -111,6 +121,7 @@ public: | |||||||
|  |  | ||||||
|   void deallocate(pointer __p, size_type __n) {  |   void deallocate(pointer __p, size_type __n) {  | ||||||
|     size_type bytes = __n * sizeof(_Tp); |     size_type bytes = __n * sizeof(_Tp); | ||||||
|  |  | ||||||
|     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); |     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); | ||||||
|  |  | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
| @@ -189,9 +200,9 @@ public: | |||||||
|   pointer allocate(size_type __n, const void* _p= 0)  |   pointer allocate(size_type __n, const void* _p= 0)  | ||||||
|   { |   { | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); |     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN); | ||||||
| #else | #else | ||||||
|     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); |     _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp)); | ||||||
| #endif | #endif | ||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     uint8_t *cp = (uint8_t *)ptr; |     uint8_t *cp = (uint8_t *)ptr; | ||||||
|   | |||||||
| @@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
|  | #include <fcntl.h> | ||||||
|  | #include <unistd.h> | ||||||
|  | #include <limits.h> | ||||||
|  | #include <sys/mman.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -34,7 +38,10 @@ namespace Grid { | |||||||
| /////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////// | ||||||
| void *              CartesianCommunicator::ShmCommBuf; | void *              CartesianCommunicator::ShmCommBuf; | ||||||
| uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | ||||||
| CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | CartesianCommunicator::CommunicatorPolicy_t   | ||||||
|  | CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | ||||||
|  | int CartesianCommunicator::nCommThreads = -1; | ||||||
|  | int CartesianCommunicator::Hugepages = 0; | ||||||
|  |  | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| @@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | |||||||
|   GlobalSumVector((double *)c,2*N); |   GlobalSumVector((double *)c,2*N); | ||||||
| } | } | ||||||
|  |  | ||||||
| #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) | #if !defined( GRID_COMMS_MPI3)  | ||||||
|  |  | ||||||
| int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | ||||||
| int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | ||||||
|  | #endif | ||||||
|  | #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
|  | 						     int xmit_to_rank, | ||||||
|  | 						     void *recv, | ||||||
|  | 						     int recv_from_rank, | ||||||
|  | 						     int bytes, int dir) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> list; | ||||||
|  |   // Discard the "dir" | ||||||
|  |   SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|  |   SendToRecvFromComplete(list); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int xmit_to_rank, | 							 int xmit_to_rank, | ||||||
| 							 void *recv, | 							 void *recv, | ||||||
| 							 int recv_from_rank, | 							 int recv_from_rank, | ||||||
| 						       int bytes) | 							 int bytes, int dir) | ||||||
| { | { | ||||||
|  |   // Discard the "dir" | ||||||
|   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); |   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|   return 2.0*bytes; |   return 2.0*bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(waitall); | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #if !defined( GRID_COMMS_MPI3)  | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilBarrier(void){}; | void CartesianCommunicator::StencilBarrier(void){}; | ||||||
|  |  | ||||||
| commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | ||||||
| @@ -121,8 +146,22 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { | |||||||
|   return NULL; |   return NULL; | ||||||
| } | } | ||||||
| void CartesianCommunicator::ShmInitGeneric(void){ | void CartesianCommunicator::ShmInitGeneric(void){ | ||||||
|  | #if 1 | ||||||
|  |  | ||||||
|  |   int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |   if ( Hugepages ) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |   ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);  | ||||||
|  |   if (ShmCommBuf == (void *)MAP_FAILED) { | ||||||
|  |     perror("mmap failed "); | ||||||
|  |     exit(EXIT_FAILURE);   | ||||||
|  |   } | ||||||
|  | #else  | ||||||
|   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); |   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); | ||||||
|   ShmCommBuf=(void *)&ShmBufStorageVector[0]; |   ShmCommBuf=(void *)&ShmBufStorageVector[0]; | ||||||
|  | #endif | ||||||
|  |   bzero(ShmCommBuf,MAX_MPI_SHM_BYTES); | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef GRID_COMMS_MPI3 | #ifdef GRID_COMMS_MPI3 | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_MPI3L | #ifdef GRID_COMMS_MPIT | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
| @@ -50,12 +50,24 @@ namespace Grid { | |||||||
| class CartesianCommunicator { | class CartesianCommunicator { | ||||||
|   public:     |   public:     | ||||||
|  |  | ||||||
|   // 65536 ranks per node adequate for now |  | ||||||
|  |   //////////////////////////////////////////// | ||||||
|  |   // Isend/Irecv/Wait, or Sendrecv blocking | ||||||
|  |   //////////////////////////////////////////// | ||||||
|  |   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; | ||||||
|  |   static CommunicatorPolicy_t CommunicatorPolicy; | ||||||
|  |   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////// | ||||||
|  |   // Up to 65536 ranks per node adequate for now | ||||||
|   // 128MB shared memory for comms enought for 48^4 local vol comms |   // 128MB shared memory for comms enought for 48^4 local vol comms | ||||||
|   // Give external control (command line override?) of this |   // Give external control (command line override?) of this | ||||||
|  |   /////////////////////////////////////////// | ||||||
|   static const int MAXLOG2RANKSPERNODE = 16;             |   static const int MAXLOG2RANKSPERNODE = 16;             | ||||||
|   static uint64_t  MAX_MPI_SHM_BYTES; |   static uint64_t  MAX_MPI_SHM_BYTES; | ||||||
|  |   static int       nCommThreads; | ||||||
|  |   // use explicit huge pages | ||||||
|  |   static int       Hugepages; | ||||||
|  |  | ||||||
|   // Communicator should know nothing of the physics grid, only processor grid. |   // Communicator should know nothing of the physics grid, only processor grid. | ||||||
|   int              _Nprocessors;     // How many in all |   int              _Nprocessors;     // How many in all | ||||||
| @@ -64,14 +76,18 @@ class CartesianCommunicator { | |||||||
|   std::vector<int> _processor_coor;  // linear processor coordinate |   std::vector<int> _processor_coor;  // linear processor coordinate | ||||||
|   unsigned long _ndimension; |   unsigned long _ndimension; | ||||||
|  |  | ||||||
| #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) | ||||||
|   static MPI_Comm communicator_world; |   static MPI_Comm communicator_world; | ||||||
|  |  | ||||||
|   MPI_Comm              communicator; |   MPI_Comm              communicator; | ||||||
|  |   std::vector<MPI_Comm> communicator_halo; | ||||||
|  |  | ||||||
|   typedef MPI_Request CommsRequest_t; |   typedef MPI_Request CommsRequest_t; | ||||||
| #else  | #else  | ||||||
|   typedef int CommsRequest_t; |   typedef int CommsRequest_t; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////// | ||||||
|   // Helper functionality for SHM Windows common to all other impls |   // Helper functionality for SHM Windows common to all other impls | ||||||
|   //////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////// | ||||||
| @@ -117,10 +133,6 @@ class CartesianCommunicator { | |||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|   static void * ShmCommBuf; |   static void * ShmCommBuf; | ||||||
|  |  | ||||||
|   // Isend/Irecv/Wait, or Sendrecv blocking |  | ||||||
|   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; |  | ||||||
|   static CommunicatorPolicy_t CommunicatorPolicy; |  | ||||||
|   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } |  | ||||||
|    |    | ||||||
|   size_t heap_top; |   size_t heap_top; | ||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
| @@ -211,14 +223,21 @@ class CartesianCommunicator { | |||||||
|    |    | ||||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |  | ||||||
|  |   double StencilSendToRecvFrom(void *xmit, | ||||||
|  | 			       int xmit_to_rank, | ||||||
|  | 			       void *recv, | ||||||
|  | 			       int recv_from_rank, | ||||||
|  | 			       int bytes,int dir); | ||||||
|  |  | ||||||
|   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, |   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 				    void *xmit, | 				    void *xmit, | ||||||
| 				    int xmit_to_rank, | 				    int xmit_to_rank, | ||||||
| 				    void *recv, | 				    void *recv, | ||||||
| 				    int recv_from_rank, | 				    int recv_from_rank, | ||||||
| 				  int bytes); | 				    int bytes,int dir); | ||||||
|    |    | ||||||
|   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |    | ||||||
|  |   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); | ||||||
|   void StencilBarrier(void); |   void StencilBarrier(void); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef HAVE_NUMAIF_H | #ifdef HAVE_NUMAIF_H | ||||||
| #include <numaif.h> | #include <numaif.h> | ||||||
| #endif | #endif | ||||||
| #ifndef SHM_HUGETLB |  | ||||||
| #define SHM_HUGETLB 04000 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -214,12 +212,18 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } |       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } | ||||||
|       ftruncate(fd, size); |       ftruncate(fd, size); | ||||||
|        |        | ||||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |       int mmap_flag = MAP_SHARED; | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |       if (Hugepages) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); | ||||||
|  |  | ||||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } |       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } | ||||||
|       assert(((uint64_t)ptr&0x3F)==0); |       assert(((uint64_t)ptr&0x3F)==0); | ||||||
|  |  | ||||||
|       // Try to force numa domain on the shm segment if we have numaif.h | // Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h | ||||||
| #ifdef HAVE_NUMAIF_H | #if 0 | ||||||
|  | //#ifdef HAVE_NUMAIF_H | ||||||
| 	int status; | 	int status; | ||||||
| 	int flags=MPOL_MF_MOVE; | 	int flags=MPOL_MF_MOVE; | ||||||
| #ifdef KNL | #ifdef KNL | ||||||
| @@ -266,7 +270,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|     for(int r=0;r<ShmSize;r++){ |     for(int r=0;r<ShmSize;r++){ | ||||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; |       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; | ||||||
|       key_t key   = 0x4545 + r; |       key_t key   = 0x4545 + r; | ||||||
|       if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { |       int flags = IPC_CREAT | SHM_R | SHM_W; | ||||||
|  | #ifdef SHM_HUGETLB | ||||||
|  |       flags|=SHM_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       if ((shmids[r]= shmget(key,size, flags)) < 0) { | ||||||
| 	int errsv = errno; | 	int errsv = errno; | ||||||
| 	printf("Errno %d\n",errsv); | 	printf("Errno %d\n",errsv); | ||||||
| 	perror("shmget"); | 	perror("shmget"); | ||||||
| @@ -397,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
| {  | {  | ||||||
|   int ierr; |   int ierr; | ||||||
|   communicator=communicator_world; |   communicator=communicator_world; | ||||||
|  |  | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
|  |  | ||||||
|  |   communicator_halo.resize (2*_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension*2;i++){ | ||||||
|  |     MPI_Comm_dup(communicator,&communicator_halo[i]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   // Assert power of two shm_size. |   // Assert power of two shm_size. | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -621,13 +635,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
|  | 						     int dest, | ||||||
|  | 						     void *recv, | ||||||
|  | 						     int from, | ||||||
|  | 						     int bytes,int dir) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> list; | ||||||
|  |   double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); | ||||||
|  |   StencilSendToRecvFromComplete(list,dir); | ||||||
|  |   return offbytes; | ||||||
|  | } | ||||||
|  |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int dest, | 							 int dest, | ||||||
| 							 void *recv, | 							 void *recv, | ||||||
| 							 int from, | 							 int from, | ||||||
| 						       int bytes) | 							 int bytes,int dir) | ||||||
| { | { | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |  | ||||||
|   MPI_Request xrq; |   MPI_Request xrq; | ||||||
|   MPI_Request rrq; |   MPI_Request rrq; | ||||||
|  |  | ||||||
| @@ -646,26 +674,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|   gfrom = MPI_UNDEFINED; |   gfrom = MPI_UNDEFINED; | ||||||
| #endif | #endif | ||||||
|   if ( gfrom ==MPI_UNDEFINED) { |   if ( gfrom ==MPI_UNDEFINED) { | ||||||
|     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(rrq); |     list.push_back(rrq); | ||||||
|     off_node_bytes+=bytes; |     off_node_bytes+=bytes; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( gdest == MPI_UNDEFINED ) { |   if ( gdest == MPI_UNDEFINED ) { | ||||||
|     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(xrq); |     list.push_back(xrq); | ||||||
|     off_node_bytes+=bytes; |     off_node_bytes+=bytes; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  |   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  | ||||||
|     this->StencilSendToRecvFromComplete(list); |     this->StencilSendToRecvFromComplete(list,dir); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return off_node_bytes; |   return off_node_bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(waitall); | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										286
									
								
								lib/communicator/Communicator_mpit.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										286
									
								
								lib/communicator/Communicator_mpit.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,286 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_mpi.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/GridCore.h> | ||||||
|  | #include <Grid/GridQCDcore.h> | ||||||
|  | #include <Grid/qcd/action/ActionCore.h> | ||||||
|  | #include <mpi.h> | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is setup once and indept of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  |  | ||||||
|  | // Should error check all MPI calls. | ||||||
|  | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|  |   int flag; | ||||||
|  |   int provided; | ||||||
|  |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|  |   if ( !flag ) { | ||||||
|  |     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||||
|  |     if ( provided != MPI_THREAD_MULTIPLE ) { | ||||||
|  |       QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   ShmInitGeneric(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
|  | { | ||||||
|  |   _ndimension = processors.size(); | ||||||
|  |   std::vector<int> periodic(_ndimension,1); | ||||||
|  |  | ||||||
|  |   _Nprocessors=1; | ||||||
|  |   _processors = processors; | ||||||
|  |   _processor_coor.resize(_ndimension); | ||||||
|  |    | ||||||
|  |   MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); | ||||||
|  |   MPI_Comm_rank(communicator,&_processor); | ||||||
|  |   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); | ||||||
|  |  | ||||||
|  |   for(int i=0;i<_ndimension;i++){ | ||||||
|  |     _Nprocessors*=_processors[i]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   communicator_halo.resize (2*_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension*2;i++){ | ||||||
|  |     MPI_Comm_dup(communicator,&communicator_halo[i]); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   int Size;  | ||||||
|  |   MPI_Comm_size(communicator,&Size); | ||||||
|  |    | ||||||
|  |   assert(Size==_Nprocessors); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalXOR(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalXOR(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   int rank; | ||||||
|  |   int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank); | ||||||
|  |   assert(ierr==0); | ||||||
|  |   return rank; | ||||||
|  | } | ||||||
|  | void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   coor.resize(_ndimension); | ||||||
|  |   int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||||
|  | 					   int dest, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int from, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> reqs(0); | ||||||
|  |   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||||
|  |   SendToRecvFromComplete(reqs); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int sender, | ||||||
|  | 					   int receiver, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Status stat; | ||||||
|  |   assert(sender != receiver); | ||||||
|  |   int tag = sender; | ||||||
|  |   if ( _processor == sender ) { | ||||||
|  |     MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); | ||||||
|  |   } | ||||||
|  |   if ( _processor == receiver ) {  | ||||||
|  |     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						void *xmit, | ||||||
|  | 						int dest, | ||||||
|  | 						void *recv, | ||||||
|  | 						int from, | ||||||
|  | 						int bytes) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||||
|  |     MPI_Request xrq; | ||||||
|  |     MPI_Request rrq; | ||||||
|  |  | ||||||
|  |     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |      | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } else {  | ||||||
|  |     // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, | ||||||
|  | 		      recv,bytes,MPI_CHAR,from, from, | ||||||
|  | 		      communicator,MPI_STATUS_IGNORE); | ||||||
|  |     assert(ierr==0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||||
|  |     int nreq=list.size(); | ||||||
|  |     std::vector<MPI_Status> status(nreq); | ||||||
|  |     int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |     assert(ierr==0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Barrier(void) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Barrier(communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Bcast(data, | ||||||
|  | 		     bytes, | ||||||
|  | 		     MPI_BYTE, | ||||||
|  | 		     root, | ||||||
|  | 		     communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  |   // Should only be used prior to Grid Init finished. | ||||||
|  |   // Check for this? | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::RankWorld(void){  | ||||||
|  |   int r;  | ||||||
|  |   MPI_Comm_rank(communicator_world,&r); | ||||||
|  |   return r; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr= MPI_Bcast(data, | ||||||
|  | 		      bytes, | ||||||
|  | 		      MPI_BYTE, | ||||||
|  | 		      root, | ||||||
|  | 		      communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 							 void *xmit, | ||||||
|  | 							 int xmit_to_rank, | ||||||
|  | 							 void *recv, | ||||||
|  | 							 int recv_from_rank, | ||||||
|  | 							 int bytes,int dir) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |    | ||||||
|  |   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl; | ||||||
|  |   // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |   MPI_Request req[2]; | ||||||
|  |   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); | ||||||
|  |   MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]); | ||||||
|  |  | ||||||
|  |   list.push_back(req[0]); | ||||||
|  |   list.push_back(req[1]); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
|  | {  | ||||||
|  |   int nreq=waitall.size(); | ||||||
|  |   MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE); | ||||||
|  | }; | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, | ||||||
|  | 						    int xmit_to_rank, | ||||||
|  | 						    void *recv, | ||||||
|  | 						    int recv_from_rank, | ||||||
|  | 						    int bytes,int dir) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |    | ||||||
|  |   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl; | ||||||
|  |   // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |   MPI_Request req[2]; | ||||||
|  |   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); | ||||||
|  |   MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]); | ||||||
|  |   MPI_Waitall(2, req, MPI_STATUSES_IGNORE); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
| @@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/cshift/Cshift_mpi.h> | #include <Grid/cshift/Cshift_mpi.h> | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
| #ifdef GRID_COMMS_MPI3L | #ifdef GRID_COMMS_MPIT | ||||||
| #include <Grid/cshift/Cshift_mpi.h> | #include <Grid/cshift/Cshift_mpi.h> | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) { | |||||||
| //////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////// | ||||||
| void Grid_quiesce_nodes(void) { | void Grid_quiesce_nodes(void) { | ||||||
|   int me = 0; |   int me = 0; | ||||||
| #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) | #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) | ||||||
|   MPI_Comm_rank(MPI_COMM_WORLD, &me); |   MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ | |||||||
| #ifndef GRID_BINARY_IO_H | #ifndef GRID_BINARY_IO_H | ||||||
| #define GRID_BINARY_IO_H | #define GRID_BINARY_IO_H | ||||||
|  |  | ||||||
| #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)  | #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)  | ||||||
| #define USE_MPI_IO | #define USE_MPI_IO | ||||||
| #else | #else | ||||||
| #undef  USE_MPI_IO | #undef  USE_MPI_IO | ||||||
|   | |||||||
| @@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|   for(int i=0; i < Ls; i++){ |   for(int i=0; i < Ls; i++){ | ||||||
|     as[i] = 1.0; |     as[i] = 1.0; | ||||||
|     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code |     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code | ||||||
|     //    assert(fabs(omega[i])>0.0); |     assert(omega[i]!=Coeff_t(0.0)); | ||||||
|     bs[i] = 0.5*(bpc/omega[i] + bmc); |     bs[i] = 0.5*(bpc/omega[i] + bmc); | ||||||
|     cs[i] = 0.5*(bpc/omega[i] - bmc); |     cs[i] = 0.5*(bpc/omega[i] - bmc); | ||||||
|   } |   } | ||||||
| @@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|    |    | ||||||
|   for(int i=0;i<Ls;i++){ |   for(int i=0;i<Ls;i++){ | ||||||
|     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);      |     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);      | ||||||
|     //    assert(fabs(bee[i])>0.0); |     assert(bee[i]!=Coeff_t(0.0)); | ||||||
|     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); |     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); | ||||||
|     beo[i]=as[i]*bs[i]; |     beo[i]=as[i]*bs[i]; | ||||||
|     ceo[i]=-as[i]*cs[i]; |     ceo[i]=-as[i]*cs[i]; | ||||||
| @@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|      |      | ||||||
|     if ( i < Ls-1 ) { |     if ( i < Ls-1 ) { | ||||||
|  |  | ||||||
|  |       assert(bee[i]!=Coeff_t(0.0)); | ||||||
|  |       assert(bee[0]!=Coeff_t(0.0)); | ||||||
|  |        | ||||||
|       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column |       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column | ||||||
|        |        | ||||||
|       leem[i]=mass*cee[Ls-1]/bee[0]; |       leem[i]=mass*cee[Ls-1]/bee[0]; | ||||||
|       for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1]; |       for(int j=0;j<i;j++) { | ||||||
|  | 	assert(bee[j+1]!=Coeff_t(0.0)); | ||||||
|  | 	leem[i]*= aee[j]/bee[j+1]; | ||||||
|  |       } | ||||||
|        |        | ||||||
|       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row |       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row | ||||||
|        |        | ||||||
| @@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|   {  |   {  | ||||||
|     Coeff_t delta_d=mass*cee[Ls-1]; |     Coeff_t delta_d=mass*cee[Ls-1]; | ||||||
|     for(int j=0;j<Ls-1;j++) { |     for(int j=0;j<Ls-1;j++) { | ||||||
|       //      assert(fabs(bee[j])>0.0); |       assert(bee[j] != Coeff_t(0.0)); | ||||||
|       delta_d *= cee[j]/bee[j]; |       delta_d *= cee[j]/bee[j]; | ||||||
|     } |     } | ||||||
|     dee[Ls-1] += delta_d; |     dee[Ls-1] += delta_d; | ||||||
|   | |||||||
| @@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom | |||||||
| template<class vobj,class cobj> | template<class vobj,class cobj> | ||||||
| class WilsonStencil : public CartesianStencil<vobj,cobj> { | class WilsonStencil : public CartesianStencil<vobj,cobj> { | ||||||
| public: | public: | ||||||
|  |   double timer0; | ||||||
|  |   double timer1; | ||||||
|  |   double timer2; | ||||||
|  |   double timer3; | ||||||
|  |   double timer4; | ||||||
|  |   double timer5; | ||||||
|  |   double timer6; | ||||||
|  |   uint64_t callsi; | ||||||
|  |   void ZeroCountersi(void) | ||||||
|  |   { | ||||||
|  |     timer0=0; | ||||||
|  |     timer1=0; | ||||||
|  |     timer2=0; | ||||||
|  |     timer3=0; | ||||||
|  |     timer4=0; | ||||||
|  |     timer5=0; | ||||||
|  |     timer6=0; | ||||||
|  |     callsi=0; | ||||||
|  |   } | ||||||
|  |   void Reporti(int calls) | ||||||
|  |   { | ||||||
|  |     if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl; | ||||||
|  |     if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl; | ||||||
|  |     if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl; | ||||||
|  |     if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl; | ||||||
|  |     if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl; | ||||||
|  |   } | ||||||
|   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; |   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; | ||||||
|  |  | ||||||
|   std::vector<int> same_node; |   std::vector<int> same_node; | ||||||
| @@ -252,6 +278,7 @@ public: | |||||||
|     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , |     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , | ||||||
|     same_node(npoints) |     same_node(npoints) | ||||||
|   {  |   {  | ||||||
|  |     ZeroCountersi(); | ||||||
|     surface_list.resize(0); |     surface_list.resize(0); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
| @@ -261,7 +288,6 @@ public: | |||||||
|     // Here we know the distance is 1 for WilsonStencil |     // Here we know the distance is 1 for WilsonStencil | ||||||
|     for(int point=0;point<this->_npoints;point++){ |     for(int point=0;point<this->_npoints;point++){ | ||||||
|       same_node[point] = this->SameNode(point); |       same_node[point] = this->SameNode(point); | ||||||
|       //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl; |  | ||||||
|     } |     } | ||||||
|      |      | ||||||
|     for(int site = 0 ;site< vol4;site++){ |     for(int site = 0 ;site< vol4;site++){ | ||||||
| @@ -282,17 +308,28 @@ public: | |||||||
|   { |   { | ||||||
|     std::vector<std::vector<CommsRequest_t> > reqs; |     std::vector<std::vector<CommsRequest_t> > reqs; | ||||||
|     this->HaloExchangeOptGather(source,compress); |     this->HaloExchangeOptGather(source,compress); | ||||||
|     this->CommunicateBegin(reqs); |     double t1=usecond(); | ||||||
|     this->CommunicateComplete(reqs); |     // Asynchronous MPI calls multidirectional, Isend etc... | ||||||
|  |     //    this->CommunicateBegin(reqs); | ||||||
|  |     //    this->CommunicateComplete(reqs); | ||||||
|  |     // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. | ||||||
|  |     this->Communicate(); | ||||||
|  |     double t2=usecond(); timer1 += t2-t1; | ||||||
|     this->CommsMerge(compress); |     this->CommsMerge(compress); | ||||||
|  |     double t3=usecond(); timer2 += t3-t2; | ||||||
|     this->CommsMergeSHM(compress); |     this->CommsMergeSHM(compress); | ||||||
|  |     double t4=usecond(); timer3 += t4-t3; | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   template <class compressor> |   template <class compressor> | ||||||
|   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)  |   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)  | ||||||
|   { |   { | ||||||
|     this->Prepare(); |     this->Prepare(); | ||||||
|  |     double t0=usecond(); | ||||||
|     this->HaloGatherOpt(source,compress); |     this->HaloGatherOpt(source,compress); | ||||||
|  |     double t1=usecond(); | ||||||
|  |     timer0 += t1-t0; | ||||||
|  |     callsi++; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template <class compressor> |   template <class compressor> | ||||||
| @@ -304,7 +341,9 @@ public: | |||||||
|     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor; |     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor; | ||||||
|     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; |     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; | ||||||
|  |  | ||||||
|  |     this->mpi3synctime_g-=usecond(); | ||||||
|     this->_grid->StencilBarrier(); |     this->_grid->StencilBarrier(); | ||||||
|  |     this->mpi3synctime_g+=usecond(); | ||||||
|  |  | ||||||
|     assert(source._grid==this->_grid); |     assert(source._grid==this->_grid); | ||||||
|     this->halogtime-=usecond(); |     this->halogtime-=usecond(); | ||||||
| @@ -323,7 +362,6 @@ public: | |||||||
|     int dag = compress.dag; |     int dag = compress.dag; | ||||||
|     int face_idx=0; |     int face_idx=0; | ||||||
|     if ( dag ) {  |     if ( dag ) {  | ||||||
|       //	std::cout << " Optimised Dagger compress " <<std::endl; |  | ||||||
|       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); |       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); | ||||||
|       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); |       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); | ||||||
|       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); |       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); | ||||||
|   | |||||||
| @@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu, | |||||||
|   int vol4; |   int vol4; | ||||||
|   vol4=FourDimGrid.oSites(); |   vol4=FourDimGrid.oSites(); | ||||||
|   Stencil.BuildSurfaceList(LLs,vol4); |   Stencil.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   vol4=FourDimRedBlackGrid.oSites(); |   vol4=FourDimRedBlackGrid.oSites(); | ||||||
|   StencilEven.BuildSurfaceList(LLs,vol4); |   StencilEven.BuildSurfaceList(LLs,vol4); | ||||||
|    StencilOdd.BuildSurfaceList(LLs,vol4); |    StencilOdd.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() |    //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() | ||||||
|                        <<" " << StencilEven.surface_list.size()<<std::endl; |    //                       <<" " << StencilEven.surface_list.size()<<std::endl; | ||||||
|  |  | ||||||
| } | } | ||||||
|       |       | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::Report(void) | void WilsonFermion5D<Impl>::Report(void) | ||||||
| { | { | ||||||
|     std::vector<int> latt = GridDefaultLatt();           |  | ||||||
|     RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; |  | ||||||
|   RealD NP     = _FourDimGrid->_Nprocessors; |   RealD NP     = _FourDimGrid->_Nprocessors; | ||||||
|   RealD NN     = _FourDimGrid->NodeCount(); |   RealD NN     = _FourDimGrid->NodeCount(); | ||||||
|  |   RealD volume = Ls;   | ||||||
|  |   std::vector<int> latt = _FourDimGrid->GlobalDimensions(); | ||||||
|  |   for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; | ||||||
|  |  | ||||||
|   if ( DhopCalls > 0 ) { |   if ( DhopCalls > 0 ) { | ||||||
|     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; |     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
| @@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void) | |||||||
|     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report(); |     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report(); | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report(); |     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report(); | ||||||
|   } |   } | ||||||
|  |   if ( DhopCalls > 0){ | ||||||
|  |     std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls); | ||||||
|  |     std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls); | ||||||
|  |     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls); | ||||||
|  |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) { | |||||||
|   Stencil.ZeroCounters(); |   Stencil.ZeroCounters(); | ||||||
|   StencilEven.ZeroCounters(); |   StencilEven.ZeroCounters(); | ||||||
|   StencilOdd.ZeroCounters(); |   StencilOdd.ZeroCounters(); | ||||||
|  |   Stencil.ZeroCountersi(); | ||||||
|  |   StencilEven.ZeroCountersi(); | ||||||
|  |   StencilOdd.ZeroCountersi(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
| { | { | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); |   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); | ||||||
|   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; |  | ||||||
|  |  | ||||||
|   Compressor compressor(dag); |   Compressor compressor(dag); | ||||||
|  |  | ||||||
| @@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|  |  | ||||||
|   DhopFaceTime-=usecond(); |   DhopFaceTime-=usecond(); | ||||||
|   st.HaloExchangeOptGather(in,compressor); |   st.HaloExchangeOptGather(in,compressor); | ||||||
|   DhopFaceTime+=usecond(); |   st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms | ||||||
|   std::vector<std::vector<CommsRequest_t> > reqs; |  | ||||||
|  |  | ||||||
|   // Rely on async comms; start comms before merge of local data |  | ||||||
|   DhopCommTime-=usecond(); |  | ||||||
|   st.CommunicateBegin(reqs); |  | ||||||
|  |  | ||||||
|   DhopFaceTime-=usecond(); |  | ||||||
|   st.CommsMergeSHM(compressor); |  | ||||||
|   DhopFaceTime+=usecond(); |   DhopFaceTime+=usecond(); | ||||||
|  |  | ||||||
|   // Perhaps use omp task and region |   double ctime=0; | ||||||
| #pragma omp parallel  |   double ptime=0; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Ugly explicit thread mapping introduced for OPA reasons. | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #pragma omp parallel reduction(max:ctime) reduction(max:ptime) | ||||||
|   {  |   {  | ||||||
|  |     int tid = omp_get_thread_num(); | ||||||
|     int nthreads = omp_get_num_threads(); |     int nthreads = omp_get_num_threads(); | ||||||
|     int me = omp_get_thread_num(); |     int ncomms = CartesianCommunicator::nCommThreads; | ||||||
|     int myoff, mywork; |     if (ncomms == -1) ncomms = 1; | ||||||
|  |     assert(nthreads > ncomms); | ||||||
|     GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); |     if (tid >= ncomms) { | ||||||
|     int sF = LLs * myoff; |       double start = usecond(); | ||||||
|  |       nthreads -= ncomms; | ||||||
|     if ( me == 0 ) { |       int ttid = tid - ncomms; | ||||||
|       st.CommunicateComplete(reqs); |       int n = U._grid->oSites(); | ||||||
|       DhopCommTime+=usecond(); |       int chunk = n / nthreads; | ||||||
|  |       int rem = n % nthreads; | ||||||
|  |       int myblock, myn; | ||||||
|  |       if (ttid < rem) { | ||||||
|  | 	myblock = ttid * chunk + ttid; | ||||||
|  | 	myn = chunk+1; | ||||||
|       } else { |       } else { | ||||||
|       // Interior links in stencil | 	myblock = ttid*chunk + rem; | ||||||
|       if ( me==1 ) DhopComputeTime-=usecond(); | 	myn = chunk; | ||||||
|       if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); |       } | ||||||
|       else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); |        | ||||||
|       if ( me==1 ) DhopComputeTime+=usecond(); |       // do the compute | ||||||
|  |       if (dag == DaggerYes) { | ||||||
|  | 	for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  | 	  int sU = ss; | ||||||
|  | 	  int sF = LLs * sU; | ||||||
|  | 	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); | ||||||
|  | 	} | ||||||
|  |       } else { | ||||||
|  | 	for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  | 	  int sU = ss; | ||||||
|  | 	  int sF = LLs * sU; | ||||||
|  | 	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|  | 	ptime = usecond() - start; | ||||||
|  |     } | ||||||
|  |     { | ||||||
|  |       double start = usecond(); | ||||||
|  |       st.CommunicateThreaded(); | ||||||
|  |       ctime = usecond() - start; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopCommTime += ctime; | ||||||
|  |   DhopComputeTime+=ptime; | ||||||
|  |  | ||||||
|  |   // First to enter, last to leave timing | ||||||
|  |   st.CollateThreads(); | ||||||
|  |  | ||||||
|   DhopFaceTime-=usecond(); |   DhopFaceTime-=usecond(); | ||||||
|   st.CommsMerge(compressor); |   st.CommsMerge(compressor); | ||||||
|   DhopFaceTime+=usecond(); |   DhopFaceTime+=usecond(); | ||||||
|  |  | ||||||
|   // Load imbalance alert. Should use dynamic schedule OMP for loop |  | ||||||
|   // Perhaps create a list of only those sites with face work, and  |  | ||||||
|   // load balance process the list. |  | ||||||
|   DhopComputeTime2-=usecond(); |   DhopComputeTime2-=usecond(); | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     int sz=st.surface_list.size(); |     int sz=st.surface_list.size(); | ||||||
| @@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
| #else  | #else  | ||||||
|   assert(0); |   assert(0); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | ||||||
| 					 DoubledGaugeField & U, | 					 DoubledGaugeField & U, | ||||||
|   | |||||||
| @@ -26,6 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| //#include <Grid/Grid.h> | //#include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | #ifndef GRID_QCD_GAUGE_FIX_H | ||||||
|  | #define GRID_QCD_GAUGE_FIX_H | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
| @@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer  : public Gimpl { | |||||||
|  |  | ||||||
| } | } | ||||||
| } | } | ||||||
|  | #endif | ||||||
|   | |||||||
| @@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   // Timing info; ugly; possibly temporary |   // Timing info; ugly; possibly temporary | ||||||
|   ///////////////////////////////////////// |   ///////////////////////////////////////// | ||||||
|   double commtime; |   double commtime; | ||||||
|  |   double mpi3synctime; | ||||||
|  |   double mpi3synctime_g; | ||||||
|  |   double shmmergetime; | ||||||
|   double gathertime; |   double gathertime; | ||||||
|   double gathermtime; |   double gathermtime; | ||||||
|   double halogtime; |   double halogtime; | ||||||
| @@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   double splicetime; |   double splicetime; | ||||||
|   double nosplicetime; |   double nosplicetime; | ||||||
|   double calls; |   double calls; | ||||||
|  |   std::vector<double> comm_bytes_thr; | ||||||
|  |   std::vector<double> comm_time_thr; | ||||||
|  |   std::vector<double> comm_enter_thr; | ||||||
|  |   std::vector<double> comm_leave_thr; | ||||||
|  |  | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Stencil query |   // Stencil query | ||||||
| @@ -248,6 +255,57 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   // Comms packet queue for asynch thread |   // Comms packet queue for asynch thread | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|  |   void CommunicateThreaded() | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     // must be called in parallel region | ||||||
|  |     int mythread = omp_get_thread_num(); | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  | #else | ||||||
|  |     int mythread = 0; | ||||||
|  |     int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |     if (nthreads == -1) nthreads = 1; | ||||||
|  |     if (mythread < nthreads) { | ||||||
|  |       comm_enter_thr[mythread] = usecond(); | ||||||
|  |       for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 						      Packets[i].to_rank, | ||||||
|  | 						      Packets[i].recv_buf, | ||||||
|  | 						      Packets[i].from_rank, | ||||||
|  | 						      Packets[i].bytes,i); | ||||||
|  | 	comm_bytes_thr[mythread] += bytes; | ||||||
|  |       } | ||||||
|  |       comm_leave_thr[mythread]= usecond(); | ||||||
|  |       comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   void CollateThreads(void) | ||||||
|  |   { | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  |     double first=0.0; | ||||||
|  |     double last =0.0; | ||||||
|  |  | ||||||
|  |     for(int t=0;t<nthreads;t++) { | ||||||
|  |  | ||||||
|  |       double t0 = comm_enter_thr[t]; | ||||||
|  |       double t1 = comm_leave_thr[t]; | ||||||
|  |       comms_bytes+=comm_bytes_thr[t]; | ||||||
|  |  | ||||||
|  |       comm_enter_thr[t] = 0.0; | ||||||
|  |       comm_leave_thr[t] = 0.0; | ||||||
|  |       comm_time_thr[t]   = 0.0; | ||||||
|  |       comm_bytes_thr[t]=0; | ||||||
|  |  | ||||||
|  |       if ( first == 0.0 ) first = t0;                   // first is t0 | ||||||
|  |       if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen | ||||||
|  |  | ||||||
|  |       if ( t1 > last ) last = t1;                       // max time seen | ||||||
|  |        | ||||||
|  |     } | ||||||
|  |     commtime+= last-first; | ||||||
|  |   } | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     reqs.resize(Packets.size()); |     reqs.resize(Packets.size()); | ||||||
| @@ -258,25 +316,59 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 						     Packets[i].to_rank, | 						     Packets[i].to_rank, | ||||||
| 						     Packets[i].recv_buf, | 						     Packets[i].recv_buf, | ||||||
| 						     Packets[i].from_rank, | 						     Packets[i].from_rank, | ||||||
| 					  Packets[i].bytes); | 						     Packets[i].bytes,i); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       _grid->StencilSendToRecvFromComplete(reqs[i]); |       _grid->StencilSendToRecvFromComplete(reqs[i],i); | ||||||
|     } |     } | ||||||
|     commtime+=usecond(); |     commtime+=usecond(); | ||||||
|   } |   } | ||||||
|  |   void Communicate(void) | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp parallel  | ||||||
|  |     { | ||||||
|  |       // must be called in parallel region | ||||||
|  |       int mythread  = omp_get_thread_num(); | ||||||
|  |       int maxthreads= omp_get_max_threads(); | ||||||
|  |       int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  |       assert(nthreads <= maxthreads); | ||||||
|  |  | ||||||
|  |       if (nthreads == -1) nthreads = 1; | ||||||
|  | #else | ||||||
|  |       int mythread = 0; | ||||||
|  |       int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |       if (mythread < nthreads) { | ||||||
|  | 	for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	  double start = usecond(); | ||||||
|  | 	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 								   Packets[i].to_rank, | ||||||
|  | 								   Packets[i].recv_buf, | ||||||
|  | 								   Packets[i].from_rank, | ||||||
|  | 								   Packets[i].bytes,i); | ||||||
|  | 	  comm_time_thr[mythread] += usecond() - start; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|    |    | ||||||
|   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)  |   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)  | ||||||
|   { |   { | ||||||
|     std::vector<std::vector<CommsRequest_t> > reqs; |     std::vector<std::vector<CommsRequest_t> > reqs; | ||||||
|     Prepare(); |     Prepare(); | ||||||
|     HaloGather(source,compress); |     HaloGather(source,compress); | ||||||
|     CommunicateBegin(reqs); |     // Concurrent | ||||||
|     CommunicateComplete(reqs); |     //CommunicateBegin(reqs); | ||||||
|  |     //CommunicateComplete(reqs); | ||||||
|  |     // Sequential, possibly threaded | ||||||
|  |     Communicate(); | ||||||
|     CommsMergeSHM(compress);  |     CommsMergeSHM(compress);  | ||||||
|     CommsMerge(compress);  |     CommsMerge(compress);  | ||||||
|   } |   } | ||||||
| @@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|  |     mpi3synctime_g-=usecond(); | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |     mpi3synctime_g+=usecond(); | ||||||
|  |  | ||||||
|     // conformable(source._grid,_grid); |     // conformable(source._grid,_grid); | ||||||
|     assert(source._grid==_grid); |     assert(source._grid==_grid); | ||||||
| @@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|     CommsMerge(decompress,Mergers,Decompressions);  |     CommsMerge(decompress,Mergers,Decompressions);  | ||||||
|   } |   } | ||||||
|   template<class decompressor>  void CommsMergeSHM(decompressor decompress) { |   template<class decompressor>  void CommsMergeSHM(decompressor decompress) { | ||||||
|  |     mpi3synctime-=usecond();     | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |     mpi3synctime+=usecond();     | ||||||
|  |     shmmergetime-=usecond();     | ||||||
|     CommsMerge(decompress,MergersSHM,DecompressionsSHM); |     CommsMerge(decompress,MergersSHM,DecompressionsSHM); | ||||||
|  |     shmmergetime+=usecond();     | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class decompressor> |   template<class decompressor> | ||||||
| @@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 		  int checkerboard, | 		  int checkerboard, | ||||||
| 		  const std::vector<int> &directions, | 		  const std::vector<int> &directions, | ||||||
| 		  const std::vector<int> &distances)  | 		  const std::vector<int> &distances)  | ||||||
|    :   _permute_type(npoints), _comm_buf_size(npoints) |    : _permute_type(npoints),  | ||||||
|  |     _comm_buf_size(npoints), | ||||||
|  |     comm_bytes_thr(npoints),  | ||||||
|  |     comm_enter_thr(npoints), | ||||||
|  |     comm_leave_thr(npoints),  | ||||||
|  |        comm_time_thr(npoints) | ||||||
|   { |   { | ||||||
|     face_table_computed=0; |     face_table_computed=0; | ||||||
|     _npoints = npoints; |     _npoints = npoints; | ||||||
| @@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   void ZeroCounters(void) { |   void ZeroCounters(void) { | ||||||
|     gathertime = 0.; |     gathertime = 0.; | ||||||
|     commtime = 0.; |     commtime = 0.; | ||||||
|  |     mpi3synctime=0.; | ||||||
|  |     mpi3synctime_g=0.; | ||||||
|  |     shmmergetime=0.; | ||||||
|  |     for(int i=0;i<_npoints;i++){ | ||||||
|  |       comm_time_thr[i]=0; | ||||||
|  |       comm_bytes_thr[i]=0; | ||||||
|  |       comm_enter_thr[i]=0; | ||||||
|  |       comm_leave_thr[i]=0; | ||||||
|  |     } | ||||||
|     halogtime = 0.; |     halogtime = 0.; | ||||||
|     mergetime = 0.; |     mergetime = 0.; | ||||||
|     decompresstime = 0.; |     decompresstime = 0.; | ||||||
| @@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; | #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; | ||||||
|     RealD NP = _grid->_Nprocessors; |     RealD NP = _grid->_Nprocessors; | ||||||
|     RealD NN = _grid->NodeCount(); |     RealD NN = _grid->NodeCount(); | ||||||
|  |     double t = 0; | ||||||
|  |     // if comm_time_thr is set they were all done in parallel so take the max | ||||||
|  |     // but add up the bytes | ||||||
|  |     int threaded = 0 ; | ||||||
|  |     for (int i = 0; i < 8; ++i) { | ||||||
|  |       if ( comm_time_thr[i]>0.0 ) { | ||||||
|  | 	threaded = 1; | ||||||
|  | 	comms_bytes += comm_bytes_thr[i]; | ||||||
|  | 	if (t < comm_time_thr[i]) t = comm_time_thr[i]; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (threaded) commtime += t; | ||||||
|      |      | ||||||
|     _grid->GlobalSum(commtime);    commtime/=NP; |     _grid->GlobalSum(commtime);    commtime/=NP; | ||||||
|     if ( calls > 0. ) { |     if ( calls > 0. ) { | ||||||
| @@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; | 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; | ||||||
| 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; | 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; | ||||||
|       } |       } | ||||||
|  |       PRINTIT(mpi3synctime); | ||||||
|  |       PRINTIT(mpi3synctime_g); | ||||||
|  |       PRINTIT(shmmergetime); | ||||||
|       PRINTIT(splicetime); |       PRINTIT(splicetime); | ||||||
|       PRINTIT(nosplicetime); |       PRINTIT(nosplicetime); | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; |     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){ | ||||||
|  |     CartesianCommunicator::Hugepages = 1; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ | ||||||
|     Grid_debug_handler_init(); |     Grid_debug_handler_init(); | ||||||
|   } |   } | ||||||
| @@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl; |     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl; |     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl; |     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl; |     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
| @@ -356,10 +362,15 @@ void Grid_init(int *argc,char ***argv) | |||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){ | ||||||
|     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ | ||||||
|     LebesgueOrder::UseLebesgueOrder=1; |     LebesgueOrder::UseLebesgueOrder=1; | ||||||
|   } |   } | ||||||
|  |   CartesianCommunicator::nCommThreads = -1; | ||||||
|  |   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ | ||||||
|  |     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); | ||||||
|  |     GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); | ||||||
|  |   } | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ | ||||||
|     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); |     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); | ||||||
|     GridCmdOptionIntVector(arg,LebesgueOrder::Block); |     GridCmdOptionIntVector(arg,LebesgueOrder::Block); | ||||||
| @@ -374,7 +385,10 @@ void Grid_init(int *argc,char ***argv) | |||||||
| 		  Grid_default_latt, | 		  Grid_default_latt, | ||||||
| 		  Grid_default_mpi); | 		  Grid_default_mpi); | ||||||
|  |  | ||||||
|   std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl; |   std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl; | ||||||
|  |   if ( CartesianCommunicator::Hugepages) { | ||||||
|  |     std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){ | ||||||
|     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"; |     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"; | ||||||
| @@ -393,7 +407,7 @@ void Grid_init(int *argc,char ***argv) | |||||||
|  |  | ||||||
| void Grid_finalize(void) | void Grid_finalize(void) | ||||||
| { | { | ||||||
| #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) | ||||||
|   MPI_Finalize(); |   MPI_Finalize(); | ||||||
|   Grid_unquiesce_nodes(); |   Grid_unquiesce_nodes(); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/Grid.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Grid::QCD; | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   std::vector<int> seeds({1,2,3,4}); |   std::vector<int> seeds({1,2,3,4}); | ||||||
| @@ -82,6 +85,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   Uorg = Uorg - Umu; |   Uorg = Uorg - Umu; | ||||||
|   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; |   std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; | ||||||
|  |   std::cout << " Norm "<< norm2(Umu) << std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout<< "*****************************************************************" <<std::endl; |   std::cout<< "*****************************************************************" <<std::endl; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user