Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-04 14:04:32 +00:00)

Commit: Merge branch 'develop' into feature/json-fix
.travis.yml (68 lines changed)
@@ -9,68 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
    - compiler: gcc
      dist: trusty
      sudo: required
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.9
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
      dist: trusty
      sudo: required
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-5
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
      dist: trusty
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      dist: trusty
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz

before_install:
    - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
    - echo make clean
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

README.md (31 lines changed)
@@ -1,18 +1,4 @@
# Grid
<table>
<tr>
    <td>Last stable release</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
    </td>
</tr>
<tr>
    <td>Development branch</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
    </td>
</tr>
</table>
# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [](https://travis-ci.org/paboyle/Grid)

**Data parallel C++ mathematical object library.**

@@ -324,6 +310,13 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
```
This is the default.

#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping:

mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18

TBA

### Build setup for AMD EPYC / RYZEN

The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips, each with 8 cores.
@@ -378,6 +371,14 @@ echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
$@
```

Performance:

#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping:

mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4

TBA

### Build setup for BlueGene/Q

To be written...
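The `$@` / `GOMP_CPU_AFFINITY` fragment in the hunk above is the tail of the wrapper script the README builds up for EPYC runs. For orientation only, a hypothetical omp_bind.sh-style wrapper consistent with that fragment (the rank-to-core arithmetic here is an assumption, not taken from this commit) might look like:

    #!/bin/bash
    # Hypothetical sketch: pin each local MPI rank's OpenMP threads to its own
    # 8-core chip before exec'ing the benchmark passed as "$@".
    numanode=$(( OMPI_COMM_WORLD_LOCAL_RANK % 8 ))
    export GOMP_CPU_AFFINITY="$(( numanode * 8 ))-$(( numanode * 8 + 7 ))"
    echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
    $@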
TODO (16 lines changed)
@@ -2,18 +2,20 @@ TODO:
---------------

Large item work list:
1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O

1)- BG/Q port and check
2)- Christoph's local basis expansion Lanczos
3)- BG/Q port and check
4)- Precision conversion and sort out localConvert      <-- partial
3)- Precision conversion and sort out localConvert      <-- partial

  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
5)- Physical propagator interface
6)- Conserved currents
7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
8)- HDCR resume
4)- Physical propagator interface
5)- Conserved currents
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
7)- HDCR resume

Recent DONE

-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
-- GaugeFix into central location                      <-- DONE
-- Scidac and Ildg metadata handling                   <-- DONE
benchmarks/Benchmark_ITT.cc (new file, 797 lines)
@@ -0,0 +1,797 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_memory_bandwidth.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;


std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;

double mflop_ref;
double mflop_ref_err;

int NN_global;

struct time_statistics{
  double mean;
  double err;
  double min;
  double max;

  void statistics(std::vector<double> v){
      double sum = std::accumulate(v.begin(), v.end(), 0.0);
      mean = sum / v.size();

      std::vector<double> diff(v.size());
      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));

      auto result = std::minmax_element(v.begin(), v.end());
      min = *result.first;
      max = *result.second;
  }
};
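// Note on the struct above: "err" is the standard error of the mean over the
// N timing samples,
//   mean = (1/N) * sum_i t_i
//   err  = sqrt( sum_i (t_i - mean)^2 / ( N*(N-1) ) ),
// while min and max record the extremal single-call timings.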

void comms_header(){
  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};
struct controls {
  int Opt;
  int CommsOverlap;
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
  //  int HugePages;
};

class Benchmark {
public:
  static void Decomposition (void ) {

    int threads = GridThread::GetThreads();
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }

  static void Comms(void)
  {
    int Nloop=200;
    int nmu=0;
    int maxlat=32;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

    std::vector<double> t_time(Nloop);
    time_statistics timestat;

    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();

    for(int lat=4;lat<=maxlat;lat+=4){
      for(int Ls=8;Ls<=8;Ls*=2){

        std::vector<int> latt_size  ({lat*mpi_layout[0],
              lat*mpi_layout[1],
              lat*mpi_layout[2],
              lat*mpi_layout[3]});

        GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
        RealD Nrank = Grid._Nprocessors;
        RealD Nnode = Grid.NodeCount();
        RealD ppn = Nrank/Nnode;

        std::vector<HalfSpinColourVectorD *> xbuf(8);
        std::vector<HalfSpinColourVectorD *> rbuf(8);
        Grid.ShmBufferFreeAll();
        for(int d=0;d<8;d++){
          xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
        }

        int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
        int ncomm;
        double dbytes;
        std::vector<double> times(Nloop);
        for(int i=0;i<Nloop;i++){

          double start=usecond();

          dbytes=0;
          ncomm=0;

          parallel_for(int dir=0;dir<8;dir++){

            double tbytes;
            int mu =dir % 4;

            if (mpi_layout[mu]>1 ) {

              int xmit_to_rank;
              int recv_from_rank;
              if ( dir == mu ) {
                int comm_proc=1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              } else {
                int comm_proc = mpi_layout[mu]-1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              }
              tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
                                                 (void *)&rbuf[dir][0], recv_from_rank,
                                                 bytes,dir);

#ifdef GRID_OMP
#pragma omp atomic
#endif
              ncomm++;

#ifdef GRID_OMP
#pragma omp atomic
#endif
              dbytes+=tbytes;
            }
          }
          Grid.Barrier();
          double stop=usecond();
          t_time[i] = stop-start; // microseconds
        }

        timestat.statistics(t_time);
        //      for(int i=0;i<t_time.size();i++){
        //        std::cout << i<<" "<<t_time[i]<<std::endl;
        //      }

        dbytes=dbytes*ppn;
        double xbytes    = dbytes*0.5;
        double rbytes    = dbytes*0.5;
        double bidibytes = dbytes;

        std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
                 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
                 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
                 <<xbytes/timestat.max <<" "<< xbytes/timestat.min
                 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
                 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

      }
    }

    return;
  }
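  // Reporting convention in Comms() above: dbytes accumulates the bytes moved
  // by this rank's eight halo exchanges and is then scaled by ppn (ranks per
  // node), so the table's uni-directional figure is 0.5*dbytes/t_mean and the
  // bidirectional figure dbytes/t_mean, each with an error bar propagated from
  // the standard error of the Nloop timings.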

  static void Memory(void)
  {
    const int Nvec=8;
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
    typedef iVector<vReal,Nvec> Vec;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

    uint64_t NP;
    uint64_t NN;

    uint64_t lmax=48;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      NP= Grid.RankCount();
      NN =Grid.NodeCount();

      Vec rn ; random(sRNG,rn);

      LatticeVec z(&Grid); z=rn;
      LatticeVec x(&Grid); x=rn;
      LatticeVec y(&Grid); y=rn;
      double a=2.0;

      uint64_t Nloop=NLOOP;

      double start=usecond();
      for(int i=0;i<Nloop;i++){
        z=a*x-y;
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3)
               << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
               << "\t\t"<< bytes/time/NN <<std::endl;

    }
  };
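  // Cost model for the z = a*x - y stream kernel above: three vectors of
  // Nvec*sizeof(Real) move per site (read x, read y, write z), giving
  // bytes = 3*vol*Nvec*sizeof(Real), and each element costs one multiply plus
  // one add, giving flops = 2*vol*Nvec; both are divided by the measured time
  // per iteration to obtain GB/s and Gflop/s.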

  static double DWF5(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (sFGrid); random(RNG5,src);
    LatticeFermion tmp   (sFGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    LatticeFermion r_eo  (sFGrid);
    LatticeFermion err   (sFGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      };

      for(int c=0;c<num_cases;c++) {

        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 100;
        uint64_t ncall = 1000;

        double t0=usecond();
        sFGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          sDw.DhopEO(src_o,r_e,DaggerNo);
        }
        sFGrid->Barrier();
        double t1=usecond();

        sDw.ZeroCounters();
        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          sDw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        sFGrid->Barrier();

        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;

        sDw.Report();

      }
      double robust = mflops_worst/mflops_best;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;

      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }
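  // Flop accounting used by DWF5 above (and DWF below): 1344 floating point
  // operations per lattice site for the dslash, applied to one checkerboard,
  // hence flops = 1344 * (Ls * L^4 volume) / 2. "Performance Robustness" is
  // the worst/best mflop ratio over the kernel and comms variants in Cases[].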

  static double DWF(int Ls,int L, double & robust)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;


    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (FGrid); random(RNG5,src);
    LatticeFermion ref   (FGrid);
    LatticeFermion tmp   (FGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
    {
      LatticeGaugeField Umu5d(FGrid);
      std::vector<LatticeColourMatrix> U(4,FGrid);
      for(int ss=0;ss<Umu._grid->oSites();ss++){
        for(int s=0;s<Ls;s++){
          Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
        }
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
        U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){

        tmp = U[mu]*Cshift(src,mu+1,1);
        ref=ref + tmp - Gamma(Gmu[mu])*tmp;

        tmp =adj(U[mu])*src;
        tmp =Cshift(tmp,mu+1,-1);
        ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }

    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      };

      for(int c=0;c<num_cases;c++) {

        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 200;
        double t0=usecond();
        FGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          Dw.DhopEO(src_o,r_e,DaggerNo);
        }
        FGrid->Barrier();
        double t1=usecond();
        //      uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
        //      if (ncall < 500) ncall = 500;
        uint64_t ncall = 1000;

        FGrid->Broadcast(0,&ncall,sizeof(ncall));

        //      std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
        Dw.ZeroCounters();

        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          Dw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        FGrid->Barrier();

        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;

        Dw.Report();

        Dw.DhopEO(src_o,r_e,DaggerNo);
        Dw.DhopOE(src_e,r_o,DaggerNo);
        setCheckerboard(r_eo,r_o);
        setCheckerboard(r_eo,r_e);
        err = r_eo-ref;
        std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
        assert((norm2(err)<1.0e-4));

      }
      robust = mflops_worst/mflops_best;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }

};

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
  Benchmark::Decomposition();

  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_su3 ) {
    // empty for now
  }

  int sel=2;
  std::vector<int> L_list({8,12,16,24});

  //int sel=1;
  //  std::vector<int> L_list({8,12});
  std::vector<double> robust_list;

  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> dwf5;

  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      double robust;
      wilson.push_back(Benchmark::DWF(1,L_list[l],robust));
    }
  }

  int Ls=16;
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      double robust;
      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
      dwf4.push_back(result);
      robust_list.push_back(robust);
    }
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
    }

  }

  if ( do_dwf ) {

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  int NN=NN_global;
  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_comms && (NN>1) ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_dwf ) {
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Comparison point     result: "  << dwf4[sel]/NN << " Mflop/s per node"<<std::endl;
  std::cout<<std::setprecision(3);
  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }


  Grid_finalize();
}
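Benchmark_ITT takes the standard Grid command-line options seen elsewhere in this commit; by analogy with the Benchmark_dwf invocations quoted in the README diff above, a plausible (unverified for this particular benchmark) two-rank run would be:

    mpirun -n 2 ./benchmarks/Benchmark_ITT --mpi 2.1.1.1 --threads 8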
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  int Nloop=100;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  int maxlat=24;
 | 
			
		||||
  int maxlat=32;
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
 | 
			
		||||
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  header();
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
      for(int mu=0;mu<8;mu++){
 | 
			
		||||
	xbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	rbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
	    int comm_proc=1;
 | 
			
		||||
	    int xmit_to_rank;
 | 
			
		||||
	    int recv_from_rank;
 | 
			
		||||
	    
 | 
			
		||||
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	    Grid.SendToRecvFromBegin(requests,
 | 
			
		||||
				   (void *)&xbuf[mu][0],
 | 
			
		||||
@@ -163,7 +167,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 | 
			
		||||
 | 
			
		||||
      for(int mu=0;mu<8;mu++){
 | 
			
		||||
	xbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	rbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
@@ -249,7 +258,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
					      bytes,mu);
 | 
			
		||||
	
 | 
			
		||||
	    comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
	  
 | 
			
		||||
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu+4][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
					      bytes,mu+4);
 | 
			
		||||
	  
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	Grid.StencilSendToRecvFromComplete(requests);
 | 
			
		||||
	Grid.StencilSendToRecvFromComplete(requests,0);
 | 
			
		||||
	Grid.Barrier();
 | 
			
		||||
	double stop=usecond();
 | 
			
		||||
	t_time[i] = stop-start; // microseconds
 | 
			
		||||
@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests);
 | 
			
		||||
					      bytes,mu);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests,mu);
 | 
			
		||||
	    requests.resize(0);
 | 
			
		||||
 | 
			
		||||
	    comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu+4][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests);
 | 
			
		||||
					      bytes,mu+4);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
 | 
			
		||||
	    requests.resize(0);
 | 
			
		||||
	  
 | 
			
		||||
	  }
 | 
			
		||||
@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
 | 
			
		||||
    }
 | 
			
		||||
  }    
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
      				    lat*mpi_layout[2],
 | 
			
		||||
      				    lat*mpi_layout[3]});
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      RealD Nrank = Grid._Nprocessors;
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<HalfSpinColourVectorD *> xbuf(8);
 | 
			
		||||
      std::vector<HalfSpinColourVectorD *> rbuf(8);
 | 
			
		||||
      Grid.ShmBufferFreeAll();
 | 
			
		||||
      for(int d=0;d<8;d++){
 | 
			
		||||
	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
      double dbytes;
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
	double start=usecond();
 | 
			
		||||
 | 
			
		||||
	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 | 
			
		||||
	dbytes=0;
 | 
			
		||||
	ncomm=0;
 | 
			
		||||
 | 
			
		||||
	parallel_for(int dir=0;dir<8;dir++){
 | 
			
		||||
 | 
			
		||||
	  double tbytes;
 | 
			
		||||
	  int mu =dir % 4;
 | 
			
		||||
 | 
			
		||||
	  if (mpi_layout[mu]>1 ) {
 | 
			
		||||
	  
 | 
			
		||||
	    ncomm++;
 | 
			
		||||
	    int xmit_to_rank;
 | 
			
		||||
	    int recv_from_rank;
 | 
			
		||||
	    if ( dir == mu ) { 
 | 
			
		||||
	      int comm_proc=1;
 | 
			
		||||
	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	    } else { 
 | 
			
		||||
	      int comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	    }
 | 
			
		||||
 | 
			
		||||
	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 | 
			
		||||
					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 | 
			
		||||
 | 
			
		||||
#pragma omp atomic
 | 
			
		||||
	    dbytes+=tbytes;
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	Grid.Barrier();
 | 
			
		||||
	double stop=usecond();
 | 
			
		||||
	t_time[i] = stop-start; // microseconds
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      timestat.statistics(t_time);
 | 
			
		||||
 | 
			
		||||
      dbytes=dbytes*ppn;
 | 
			
		||||
      double xbytes    = dbytes*0.5;
 | 
			
		||||
      double rbytes    = dbytes*0.5;
 | 
			
		||||
      double bidibytes = dbytes;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 | 
			
		||||
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 | 
			
		||||
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
 | 
			
		||||
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 | 
			
		||||
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 | 
			
		||||
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 | 
			
		||||
 
 | 
			
		||||
    }
 | 
			
		||||
  }    
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
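
For orientation, the threaded loop above lets each OpenMP thread drive one of the eight directions and accumulates the per-call traffic into dbytes under #pragma omp atomic; the report then splits the per-node total into send and receive halves and normalises by the timing statistics. Below is a minimal self-contained sketch of that bookkeeping, with a hypothetical Stats helper standing in for Grid's timestat and made-up sample numbers:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for Grid's timestat: mean, error on the mean,
// min and max over a set of timing samples.
struct Stats {
  double mean=0, err=0, min=0, max=0;
  void statistics(const std::vector<double> &t) {
    mean = 0; for (double x : t) mean += x; mean /= t.size();
    double var = 0; for (double x : t) var += (x-mean)*(x-mean);
    err = std::sqrt(var)/t.size();   // sigma/sqrt(N), with sigma^2 = var/N
    min = *std::min_element(t.begin(), t.end());
    max = *std::max_element(t.begin(), t.end());
  }
};

int main(void) {
  std::vector<double> t_time = {105.0, 98.0, 101.0, 97.0}; // microseconds per iteration
  double dbytes = 8.0*1024*1024;  // bytes moved per iteration (made-up number)
  double ppn    = 2.0;            // ranks per node

  Stats timestat; timestat.statistics(t_time);
  dbytes *= ppn;                  // scale rank traffic up to per-node traffic
  double xbytes    = dbytes*0.5;  // send half
  double bidibytes = dbytes;      // bidirectional total
  std::printf("xmit %.1f MB/s  bidi %.1f MB/s\n",
              xbytes/timestat.mean, bidibytes/timestat.mean);
  return 0;
}

Since t_time is in microseconds, bytes divided by the mean time comes out in bytes per microsecond, which is approximately MB/s.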

@@ -503,9 +503,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;

  //assert(norm2(src_e)<1.0e-4);
  //assert(norm2(src_o)<1.0e-4);

  assert(norm2(src_e)<1.0e-4);
  assert(norm2(src_o)<1.0e-4);
  Grid_finalize();
  exit(0);
}

configure.ac (54 lines changed)

@@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
################ Get git info
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])

################ Set flags
# do not move!
CXXFLAGS="-O3 $CXXFLAGS"

############### Checks for programs
AC_PROG_CXX
AC_PROG_RANLIB

@@ -27,7 +31,6 @@ AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
      [version of g++ that will compile the code])

CXXFLAGS="-O3 $CXXFLAGS"

############### Checks for typedefs, structures, and compiler characteristics

@@ -55,6 +58,10 @@ AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

############## Standard libraries
AC_CHECK_LIB([m],[cos])
AC_CHECK_LIB([stdc++],[abort])

############### GMP and MPFR
AC_ARG_WITH([gmp],
    [AS_HELP_STRING([--with-gmp=prefix],

@@ -324,8 +331,41 @@ case ${ac_PRECISION} in
     double)
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
     *)
     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
     ;;
esac

######################  Shared memory allocation technique under MPI3
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])

case ${ac_SHM} in

     shmget)
     AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
     ;;

     shmopen)
     AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
     ;;

     hugetlbfs)
     AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
     ;;

     *)
     AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
     ;;
esac

######################  Shared base path for SHMMMAP
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
              [Select SHM mmap base path for hugetlbfs])],
	      [ac_SHMPATH=${enable_shmpath}],
	      [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])

############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
              [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

@@ -335,14 +375,14 @@ case ${ac_COMMS} in
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
        comms_type='none'
     ;;
     mpi3l*)
       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
       comms_type='mpi3l'
     ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
     mpit)
        AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
        comms_type='mpit'
     ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'

@@ -370,7 +410,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])

############### RNG selection

@@ -475,6 +515,8 @@ compiler version            : ${ax_cv_gxx_version}
SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading                   : ${ac_openmp}
Communications type         : ${comms_type}
Shared memory allocator     : ${ac_SHM}
Shared memory mmap path     : ${ac_SHMPATH}
Default precision           : ${ac_PRECISION}
Software FP16 conversion    : ${ac_SFW_FP16}
RNG choice                  : ${ac_RNG}

@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_base.cc
endif

if BUILD_COMMS_MPI3L
  extra_sources+=communicator/Communicator_mpi3_leader.cc
if BUILD_COMMS_MPIT
  extra_sources+=communicator/Communicator_mpit.cc
  extra_sources+=communicator/Communicator_base.cc
endif

@@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceInnerProductMatrix(m_rr,R,R,Orthog);

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Cholesky from Eigen
  // There exists a ldlt that is documented as more stable
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
  // Force manifestly Hermitian to avoid rounding-related failures
  m_rr = 0.5*(m_rr+m_rr.adjoint());

#if 0
  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
  auto  D_ldlt = m_rr.ldlt().vectorD(); 
  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
#endif

  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Q = R C^{-1}
  //

@@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  //
  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // FIXME:: make a sliceMulMatrix to avoid zero vector
  sliceMulMatrix(Q,Cinv,R,Orthog);
}
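
For reference, the ThinQRfact steps above implement a thin QR factorisation through a Cholesky of the Gram matrix; the symmetrisation m_rr = 0.5*(m_rr+m_rr.adjoint()) forces the Gram matrix to be manifestly Hermitian before the llt() call. In equations (notation mine, with G the Gram matrix held in m_rr):

  G \;=\; R^{\dagger}R \;=\; L L^{\dagger}, \qquad C = L^{\dagger}, \qquad Q = R\,C^{-1},
  \qquad Q^{\dagger}Q \;=\; L^{-1}\, G\, L^{-\dagger} \;=\; \mathbb{1},

so that R = Q C, which is exactly the pair (C, Cinv) handed on to the sliceMulMatrix call.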
////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)

  Linop.HermOp(X, AD);
  tmp = B - AD;  
  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;

  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;

@@ -221,12 +232,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;

    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;

    //5. X  = X + D MC
    m_tmp     = m_M * m_C;

@@ -1,7 +1,5 @@

#include <Grid/GridCore.h>
#include <fcntl.h>

namespace Grid {

@@ -11,7 +9,7 @@ int PointerCache::victim;

void *PointerCache::Insert(void *ptr,size_t bytes) {

  if (bytes < 4096 ) return NULL;
  if (bytes < 4096 ) return ptr;

#ifdef GRID_OMP
  assert(omp_in_parallel()==0);

@@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
  return NULL;
}

void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
	++nnothuge;
      }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}

}
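
A hedged usage sketch for the new check_huge_pages helper follows (hypothetical test harness of mine, Linux only; the 512 factor inside the helper is one 2 MB huge page per 512 4 kB pages, and the helper assumes linking against Grid since it calls CartesianCommunicator::RankWorld):

#include <cstdint>
#include <cstdlib>

void check_huge_pages(void *Buf, uint64_t BYTES); // declared in AlignedAllocator.h above

int main(void) {
  const uint64_t bytes = 64ULL*1024*1024;   // 64 MB = 32 candidate 2MB huge pages
  void *buf = nullptr;
  // 2MB alignment matches the GRID_ALLOC_ALIGN introduced in this commit
  if (posix_memalign(&buf, 2*1024*1024, bytes) != 0) return 1;
  // touch one byte per 4k page so the mapping exists before it is inspected
  for (uint64_t n = 0; n < bytes; n += 4096) ((volatile char *)buf)[n] = 0;
  check_huge_pages(buf, bytes);  // prints how many 4k pages are not huge-page backed
  free(buf);
  return 0;
}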

@@ -64,6 +64,8 @@ namespace Grid {

  };

  void check_huge_pages(void *Buf,uint64_t BYTES);

////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////

@@ -92,12 +94,20 @@ public:
    size_type bytes = __n*sizeof(_Tp);

    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
    //    if ( ptr != NULL ) 
    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;

    //////////////////
    // Hack: 2MB align; could be made an option but probably doesn't need configurability
    //////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP

@@ -111,6 +121,7 @@ public:

  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);

    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);

#ifdef HAVE_MM_MALLOC_H

@@ -189,16 +200,18 @@ public:
  pointer allocate(size_type __n, const void* _p= 0) 
  {
#ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
#else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
#endif
    size_type bytes = __n*sizeof(_Tp);
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
#pragma omp parallel for schedule(static)
    for(size_type n=0;n<bytes;n+=4096){
      cp[n]=0;
      for(size_type n=0;n<bytes;n+=4096){
	cp[n]=0;
      }
    }
    return ptr;
  }
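
The restructured first-touch loop above now only runs when the allocation succeeded, and its purpose is NUMA placement: under a first-touch policy, the thread that first writes a page determines which memory node backs it, and schedule(static) makes that placement match later statically scheduled traversals. A standalone sketch of the idiom (my own minimal version, not Grid's allocator):

#include <cstdint>
#include <cstdlib>

// First-touch idiom: zero one byte per 4k page from a static OpenMP loop so
// pages are faulted in by the same threads that will later traverse them.
void *alloc_first_touch(std::size_t bytes) {
  void *ptr = nullptr;
  if (posix_memalign(&ptr, 2*1024*1024, bytes) != 0) return nullptr; // 2MB align
  uint8_t *cp = (uint8_t *)ptr;
#pragma omp parallel for schedule(static)
  for (std::size_t n = 0; n < bytes; n += 4096) {
    cp[n] = 0; // one touch per 4k page
  }
  return ptr;
}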

@@ -185,17 +185,18 @@ public:
    ////////////////////////////////////////////////////////////////

    void show_decomposition(){
      std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
      std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
      std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
      std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
      std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
      std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
      std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
      std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
      std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;        
      std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
      std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;             
      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        
      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
    } 

    ////////////////////////////////////////////////////////////////

@@ -62,77 +62,81 @@ public:
      return shift;
    }
    GridCartesian(const std::vector<int> &dimensions,
		  const std::vector<int> &simd_layout,
		  const std::vector<int> &processor_grid
		  ) : GridBase(processor_grid)
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
    {
        ///////////////////////
        // Grid information
        ///////////////////////
        _ndimension = dimensions.size();
      ///////////////////////
      // Grid information
      ///////////////////////
      _ndimension = dimensions.size();

        _fdimensions.resize(_ndimension);
        _gdimensions.resize(_ndimension);
        _ldimensions.resize(_ndimension);
        _rdimensions.resize(_ndimension);
        _simd_layout.resize(_ndimension);
	_lstart.resize(_ndimension);
	_lend.resize(_ndimension);
      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);
      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);

        _ostride.resize(_ndimension);
        _istride.resize(_ndimension);
      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);

        _fsites = _gsites = _osites = _isites = 1;
      _fsites = _gsites = _osites = _isites = 1;

        for(int d=0;d<_ndimension;d++){
	  _fdimensions[d] = dimensions[d]; // Global dimensions
	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
	  _simd_layout[d] = simd_layout[d];
	  _fsites = _fsites * _fdimensions[d];
	  _gsites = _gsites * _gdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        _fdimensions[d] = dimensions[d];   // Global dimensions
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
        _simd_layout[d] = simd_layout[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];

	  //FIXME check for exact division
        // Use a reduced simd grid
        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);

	  // Use a reduced simd grid
	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
	  _osites  *= _rdimensions[d];
	  _isites  *= _simd_layout[d];
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);

	  // Addressing support
	  if ( d==0 ) {
	    _ostride[d] = 1;
	    _istride[d] = 1;
	  } else {
	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
	  }
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];

        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        
        ///////////////////////
        // subplane information
        ///////////////////////
        _slice_block.resize(_ndimension);
        _slice_stride.resize(_ndimension);
        _slice_nblock.resize(_ndimension);
            
        int block =1;
        int nblock=1;
        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
            
        for(int d=0;d<_ndimension;d++){
            nblock/=_rdimensions[d];
            _slice_block[d] =block;
            _slice_stride[d]=_ostride[d]*_rdimensions[d];
            _slice_nblock[d]=nblock;
            block = block*_rdimensions[d];
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }

      ///////////////////////
      // subplane information
      ///////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);

      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];

      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d] = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
    };
};

}
#endif
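
In formula form, the addressing both constructors set up is lexicographic (notation mine, mirroring _ostride, _istride, _rdimensions and _simd_layout above):

  \mathrm{ostride}_0 = \mathrm{istride}_0 = 1,\qquad
  \mathrm{ostride}_d = \prod_{e<d}\mathrm{rdim}_e,\qquad
  \mathrm{istride}_d = \prod_{e<d}\mathrm{simd}_e,

and an outer site index is recovered as

  o(x) \;=\; \sum_{d} \mathrm{ostride}_d\,\bigl(x_d \bmod \mathrm{rdim}_d\bigr),

which is exactly the sum computed by the oIndex virtual below, up to the halving of the checkerboarded dimension in the red-black case.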

@@ -131,20 +131,20 @@ public:
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
    }
    void Init(const std::vector<int> &dimensions,
	      const std::vector<int> &simd_layout,
	      const std::vector<int> &processor_grid,
	      const std::vector<int> &checker_dim_mask,
	      int checker_dim)
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid,
              const std::vector<int> &checker_dim_mask,
              int checker_dim)
    {
    ///////////////////////
    // Grid information
    ///////////////////////
      ///////////////////////
      // Grid information
      ///////////////////////
      _checker_dim = checker_dim;
      assert(checker_dim_mask[checker_dim]==1);
      assert(checker_dim_mask[checker_dim] == 1);
      _ndimension = dimensions.size();
      assert(checker_dim_mask.size()==_ndimension);
      assert(processor_grid.size()==_ndimension);
      assert(simd_layout.size()==_ndimension);
      assert(checker_dim_mask.size() == _ndimension);
      assert(processor_grid.size() == _ndimension);
      assert(simd_layout.size() == _ndimension);

      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);

@@ -159,47 +159,55 @@ public:

      _fsites = _gsites = _osites = _isites = 1;

      _checker_dim_mask=checker_dim_mask;
      _checker_dim_mask = checker_dim_mask;

      for(int d=0;d<_ndimension;d++){
	_fdimensions[d] = dimensions[d];
	_gdimensions[d] = _fdimensions[d];
	_fsites = _fsites * _fdimensions[d];
	_gsites = _gsites * _gdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        _fdimensions[d] = dimensions[d];
        _gdimensions[d] = _fdimensions[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];

	if (d==_checker_dim) {
	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
	}
	_ldimensions[d] = _gdimensions[d]/_processors[d];
	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
        if (d == _checker_dim)
        {
          assert((_gdimensions[d] & 0x1) == 0);
          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
        }
        _ldimensions[d] = _gdimensions[d] / _processors[d];
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

	// Use a reduced simd grid
	_simd_layout[d] = simd_layout[d];
	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
	assert(_rdimensions[d]>0);
        // Use a reduced simd grid
        _simd_layout[d] = simd_layout[d];
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
        assert(_rdimensions[d] > 0);

	// all elements of a simd vector must have same checkerboard.
	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
	if ( _simd_layout[d]>1 ) {
	  if ( checker_dim_mask[d] ) { 
	    assert( (_rdimensions[d]&0x1) == 0 );
	  }
	}

	_osites *= _rdimensions[d];
	_isites *= _simd_layout[d];

	// Addressing support
	if ( d==0 ) {
	  _ostride[d] = 1;
	  _istride[d] = 1;
	} else {
	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
	}
        // all elements of a simd vector must have same checkerboard.
        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
        if (_simd_layout[d] > 1)
        {
          if (checker_dim_mask[d])
          {
            assert((_rdimensions[d] & 0x1) == 0);
          }
        }

        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];

        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }

      ////////////////////////////////////////////////////////////////////////////////////////////

@@ -209,58 +217,69 @@ public:
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);

      int block =1;
      int nblock=1;
      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];

      for(int d=0;d<_ndimension;d++){
	nblock/=_rdimensions[d];
	_slice_block[d] =block;
	_slice_stride[d]=_ostride[d]*_rdimensions[d];
	_slice_nblock[d]=nblock;
	block = block*_rdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d] = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }

      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      int rvol = 1;
      for(int d=0;d<_ndimension;d++){
	rvol=rvol * _rdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        rvol = rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
      for(int osite=0;osite<_osites;osite++){
	_checker_board[osite] = CheckerBoardFromOindex (osite);
      for (int osite = 0; osite < _osites; osite++)
      {
        _checker_board[osite] = CheckerBoardFromOindex(osite);
      }

    };
protected:

  protected:
    virtual int oIndex(std::vector<int> &coor)
    {
      int idx=0;
      for(int d=0;d<_ndimension;d++) {
	if( d==_checker_dim ) {
	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
	} else {
	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
	}
      int idx = 0;
      for (int d = 0; d < _ndimension; d++)
      {
        if (d == _checker_dim)
        {
          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
        }
        else
        {
          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
        }
      }
      return idx;
    };

    virtual int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) {
	  if( d==_checker_dim ) {
	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
	  } else { 
	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
	  }
	}
        return idx;
      int idx = 0;
      for (int d = 0; d < _ndimension; d++)
      {
        if (d == _checker_dim)
        {
          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
        }
        else
        {
          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
        }
      }
      return idx;
    }
};

}
#endif

@@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>

namespace Grid {

@@ -33,8 +37,11 @@ namespace Grid {
// Info that is set up once and is independent of cartesian layout
///////////////////////////////////////////////////////////////
void *              CartesianCommunicator::ShmCommBuf;
uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; 
CartesianCommunicator::CommunicatorPolicy_t  
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
int CartesianCommunicator::Hugepages = 0;

/////////////////////////////////
// Alloc, free shmem region

@@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
  GlobalSumVector((double *)c,2*N);
}

#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
#if !defined( GRID_COMMS_MPI3) 

int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};

double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						       void *xmit,
						       int xmit_to_rank,
						       void *recv,
						       int recv_from_rank,
						       int bytes)
#endif
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
						     int xmit_to_rank,
						     void *recv,
						     int recv_from_rank,
						     int bytes, int dir)
{
  std::vector<CommsRequest_t> list;
  // Discard the "dir"
  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  SendToRecvFromComplete(list);
  return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							 void *xmit,
							 int xmit_to_rank,
							 void *recv,
							 int recv_from_rank,
							 int bytes, int dir)
{
  // Discard the "dir"
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
  SendToRecvFromComplete(waitall);
}
#endif
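
The fallback above simply wraps the blocking SendToRecvFrom pair and reports 2*bytes as moved, discarding the direction argument. A hedged sketch of driving the per-direction interface from caller code, modelled on the benchmark loops earlier (exchange_one_dir is my name, not Grid's; it assumes Grid has been initialised and the buffers are valid communication buffers):

#include <Grid/GridCore.h>
#include <vector>
using namespace Grid;

// Post and complete one per-direction halo exchange using the interfaces
// declared in this patch, returning the bytes the call reports as moved.
double exchange_one_dir(CartesianCommunicator &Grid,
                        void *xbuf, void *rbuf, int bytes, int dir) {
  std::vector<CartesianCommunicator::CommsRequest_t> requests;
  int xmit_to_rank, recv_from_rank;
  Grid.ShiftedRanks(dir % 4, 1, xmit_to_rank, recv_from_rank); // neighbour ranks, shift +1
  double moved = Grid.StencilSendToRecvFromBegin(requests, xbuf, xmit_to_rank,
                                                 rbuf, recv_from_rank, bytes, dir);
  Grid.StencilSendToRecvFromComplete(requests, dir); // per-direction completion
  return moved; // the generic fallback reports 2*bytes
}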

#if !defined( GRID_COMMS_MPI3) 

void CartesianCommunicator::StencilBarrier(void){};

commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;

@@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
#if 1
  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
#ifdef MAP_HUGETLB
  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);  
  }
#ifdef MADV_HUGEPAGE
  if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
#endif
#else 
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
  ShmCommBuf=(void *)&ShmBufStorageVector[0];
#endif
  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
}

#endif

@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#ifdef GRID_COMMS_MPIT
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM

@@ -50,12 +50,24 @@ namespace Grid {
class CartesianCommunicator {
  public:    

  // 65536 ranks per node adequate for now

  ////////////////////////////////////////////
  // Isend/Irecv/Wait, or Sendrecv blocking
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }

  ///////////////////////////////////////////
  // Up to 65536 ranks per node adequate for now
  // 128MB shared memory for comms enough for 48^4 local vol comms
  // Give external control (command line override?) of this

  static const int      MAXLOG2RANKSPERNODE = 16;            
  static uint64_t MAX_MPI_SHM_BYTES;
  ///////////////////////////////////////////
  static const int MAXLOG2RANKSPERNODE = 16;            
  static uint64_t  MAX_MPI_SHM_BYTES;
  static int       nCommThreads;
  // use explicit huge pages
  static int       Hugepages;

  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
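
The enum relocated here is the public switch between concurrent Isend/Irecv pairs and a blocking Sendrecv path. A minimal sketch of flipping it before a timing run, using only the statics declared in the class above:

#include <Grid/GridCore.h>

// Select the blocking Sendrecv communications path globally.
void use_sequential_comms(void) {
  Grid::CartesianCommunicator::SetCommunicatorPolicy(
      Grid::CartesianCommunicator::CommunicatorPolicySequential);
}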
 | 
			
		||||
@@ -64,14 +76,18 @@ class CartesianCommunicator {
 | 
			
		||||
  std::vector<int> _processor_coor;  // linear processor coordinate
 | 
			
		||||
  unsigned long _ndimension;
 | 
			
		||||
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
 | 
			
		||||
  static MPI_Comm communicator_world;
 | 
			
		||||
         MPI_Comm communicator;
 | 
			
		||||
 | 
			
		||||
  MPI_Comm              communicator;
 | 
			
		||||
  std::vector<MPI_Comm> communicator_halo;
 | 
			
		||||
 | 
			
		||||
  typedef MPI_Request CommsRequest_t;
 | 
			
		||||
#else 
 | 
			
		||||
  typedef int CommsRequest_t;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Helper functionality for SHM Windows common to all other impls
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -117,10 +133,6 @@ class CartesianCommunicator {
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  static void * ShmCommBuf;
 | 
			
		||||
 | 
			
		||||
  // Isend/Irecv/Wait, or Sendrecv blocking
 | 
			
		||||
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
 | 
			
		||||
  static CommunicatorPolicy_t CommunicatorPolicy;
 | 
			
		||||
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
 | 
			
		||||
  
 | 
			
		||||
  size_t heap_top;
 | 
			
		||||
  size_t heap_bytes;
 | 
			
		||||
@@ -211,14 +223,21 @@ class CartesianCommunicator {
 | 
			
		||||
  
 | 
			
		||||
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 | 
			
		||||
 | 
			
		||||
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
				  void *xmit,
 | 
			
		||||
				  int xmit_to_rank,
 | 
			
		||||
				  void *recv,
 | 
			
		||||
				  int recv_from_rank,
 | 
			
		||||
				  int bytes);
 | 
			
		||||
  double StencilSendToRecvFrom(void *xmit,
 | 
			
		||||
			       int xmit_to_rank,
 | 
			
		||||
			       void *recv,
 | 
			
		||||
			       int recv_from_rank,
 | 
			
		||||
			       int bytes,int dir);
 | 
			
		||||
 | 
			
		||||
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 | 
			
		||||
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
				    void *xmit,
 | 
			
		||||
				    int xmit_to_rank,
 | 
			
		||||
				    void *recv,
 | 
			
		||||
				    int recv_from_rank,
 | 
			
		||||
				    int bytes,int dir);
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
 | 
			
		||||
  void StencilBarrier(void);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef HAVE_NUMAIF_H
 | 
			
		||||
#include <numaif.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef SHM_HUGETLB
 | 
			
		||||
#define SHM_HUGETLB 04000
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
@@ -200,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  ShmCommBuf = 0;
  ShmCommBufs.resize(ShmSize);

#if 1
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hugetlbf and others map filesystems as mappable huge pages
  ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
  char shm_name [NAME_MAX];
  for(int r=0;r<ShmSize;r++){
    
    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
    sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
    //sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
    //    printf("Opening file %s \n",shm_name);
    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
    if ( fd == -1) { 
      printf("open %s failed\n",shm_name);
      perror("open hugetlbfs");
      exit(0);
    }
    int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE    
    mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
    if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
    void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
    if ( ptr == (void *)MAP_FAILED ) {    
      printf("mmap %s failed\n",shm_name);
      perror("failed mmap");      assert(0);    
    }
    assert(((uint64_t)ptr&0x3F)==0);
    ShmCommBufs[r] =ptr;
    
  }
#endif
  ////////////////////////////////////////////////////////////////////////////////////////////
  // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
  // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
  // the posix shm virtual file system
  ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMOPEN
  char shm_name [NAME_MAX];
  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
@@ -214,12 +251,21 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      ftruncate(fd, size);
      
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE 
      mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
      if (Hugepages) mmap_flag |= MAP_HUGETLB;
#endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);

      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);

      // Try to force numa domain on the shm segment if we have numaif.h
#ifdef HAVE_NUMAIF_H
// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
#if 0
//#ifdef HAVE_NUMAIF_H
	int status;
	int flags=MPOL_MF_MOVE;
#ifdef KNL
@@ -236,7 +282,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
	  if (ierr && (page==0)) perror("numa relocate command failed");
	}
#endif
      ShmCommBufs[r] =ptr;
	ShmCommBufs[r] =ptr;
      
    }
  }
@@ -258,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      ShmCommBufs[r] =ptr;
    }
  }

#else
#endif
  ////////////////////////////////////////////////////////////////////////////////////////////
  // SHMGET SHMAT and SHM_HUGETLB flag
  ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
  std::vector<int> shmids(ShmSize);

  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
      key_t key   = 0x4545 + r;
      if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
      key_t key   = IPC_PRIVATE;
      int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
      if (Hugepages) flags|=SHM_HUGETLB;
#endif
      if ((shmids[r]= shmget(key,size, flags)) ==-1) {
	int errsv = errno;
	printf("Errno %d\n",errsv);
	printf("key   %d\n",key);
	printf("size  %lld\n",size);
	printf("flags %d\n",flags);
	perror("shmget");
	exit(1);
      } else { 
	printf("shmid: 0x%x\n", shmids[r]);
      }
      printf("shmid: 0x%x\n", shmids[r]);
    }
  }
  MPI_Barrier(ShmComm);
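Note (added for clarity, not part of the commit): the GRID_MPI3_SHMGET path above uses System V shared memory; SHM_HUGETLB is only a request and shmget fails if no huge pages are reserved, which is why the flag is now added conditionally. A stand-alone sketch under those assumptions, with illustrative names only:

  #include <sys/ipc.h>
  #include <sys/shm.h>
  #include <cstdio>

  void *get_segment(size_t size, bool hugepages)
  {
    int flags = IPC_CREAT | SHM_R | SHM_W;
  #ifdef SHM_HUGETLB
    if (hugepages) flags |= SHM_HUGETLB;   // needs huge pages reserved (vm.nr_hugepages > 0)
  #endif
    int id = shmget(IPC_PRIVATE, size, flags);
    if (id == -1) { perror("shmget"); return NULL; }
    void *ptr = shmat(id, NULL, 0);        // attach; peer ranks attach using the same id
    if (ptr == (void *)-1) { perror("shmat"); return NULL; }
    return ptr;
  }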
@@ -397,8 +454,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ 
  int ierr;
  communicator=communicator_world;

  _ndimension = processors.size();

  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }

  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
@@ -621,13 +684,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  }
}

double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						       void *xmit,
						       int dest,
						       void *recv,
						       int from,
						       int bytes)
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
						     int dest,
						     void *recv,
						     int from,
						     int bytes,int dir)
{
  std::vector<CommsRequest_t> list;
  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
}

double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							 void *xmit,
							 int dest,
							 void *recv,
							 int from,
							 int bytes,int dir)
{
  assert(dir < communicator_halo.size());

  MPI_Request xrq;
  MPI_Request rrq;

@@ -646,26 +723,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  gfrom = MPI_UNDEFINED;
#endif
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }

  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
    this->StencilSendToRecvFromComplete(list);
    this->StencilSendToRecvFromComplete(list,dir);
  }

  return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
  SendToRecvFromComplete(waitall);
}

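Note (added for clarity, not part of the commit): the change above threads a direction index through StencilSendToRecvFrom*, so that each halo direction is driven on its own duplicated communicator (communicator_halo[dir]); separate communicators keep message matching in different directions independent, e.g. one direction per thread under MPI_THREAD_MULTIPLE. A minimal stand-alone sketch of the idea, with illustrative names only:

  #include <mpi.h>
  #include <vector>

  // exchange one face on direction 'dir'; neighbours 'dest'/'from' and buffers are assumed
  void halo_exchange_dir(MPI_Comm comm, int ndim, int dir, int me, int dest, int from,
                         void *xmit, void *recv, int bytes)
  {
    static std::vector<MPI_Comm> halo;          // one duplicated communicator per direction
    if (halo.empty()) {
      halo.resize(2*ndim);
      for (int d=0; d<2*ndim; d++) MPI_Comm_dup(comm, &halo[d]);
    }
    MPI_Request req[2];
    MPI_Irecv(recv, bytes, MPI_CHAR, from, from, halo[dir], &req[1]);
    MPI_Isend(xmit, bytes, MPI_CHAR, dest, me,   halo[dir], &req[0]);
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);   // per-direction completion, no cross-matching
  }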
							
								
								
									
286	lib/communicator/Communicator_mpit.cc	Normal file
@@ -0,0 +1,286 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>

namespace Grid {


///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;

// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  int provided;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    if ( provided != MPI_THREAD_MULTIPLE ) {
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
    }
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  
  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
  
  int Size; 
  MPI_Comm_size(communicator,&Size);
  
  assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
}
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
					   int dest,
					   void *recv,
					   int from,
					   int bytes)
{
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
					   void *recv,
					   int sender,
					   int receiver,
					   int bytes)
{
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						void *xmit,
						int dest,
						void *recv,
						int from,
						int bytes)
{
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;

    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
		      recv,bytes,MPI_CHAR,from, from,
		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
    assert(ierr==0);
  }
}

void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,
		     bytes,
		     MPI_BYTE,
		     root,
		     communicator);
  assert(ierr==0);
}
  ///////////////////////////////////////////////////////
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){ 
  int r; 
  MPI_Comm_rank(communicator_world,&r);
  return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,
		      bytes,
		      MPI_BYTE,
		      root,
		      communicator_world);
  assert(ierr==0);
}

double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							 void *xmit,
							 int xmit_to_rank,
							 void *recv,
							 int recv_from_rank,
							 int bytes,int dir)
{
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());
  
  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);

  list.push_back(req[0]);
  list.push_back(req[1]);
  return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ 
  int nreq=waitall.size();
  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
};
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
						    int xmit_to_rank,
						    void *recv,
						    int recv_from_rank,
						    int bytes,int dir)
{
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());
  
  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  return 2.0*bytes;
}



}

@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif 

#ifdef GRID_COMMS_MPI3L
#ifdef GRID_COMMS_MPIT
#include <Grid/cshift/Cshift_mpi.h>
#endif 

 
@@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 | 
			
		||||
{
 | 
			
		||||
  int NN    = BlockSolverGrid->_ndimension;
 | 
			
		||||
@@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
 | 
			
		||||
  }
 | 
			
		||||
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 | 
			
		||||
}
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 | 
			
		||||
@@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
 | 
			
		||||
  int Nblock = X._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
 | 
			
		||||
  GridBase *FullGrid  = X._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
 | 
			
		||||
  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
 | 
			
		||||
  assert( FullGrid->_simd_layout[Orthog]==1);
 | 
			
		||||
  int nh =  FullGrid->_ndimension;
 | 
			
		||||
  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  //  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  int nl = nh-1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
@@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
 | 
			
		||||
  int Nblock = X._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
 | 
			
		||||
  GridBase *FullGrid  = X._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
 | 
			
		||||
  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  //  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
 | 
			
		||||
  assert( FullGrid->_simd_layout[Orthog]==1);
 | 
			
		||||
  int nh =  FullGrid->_ndimension;
 | 
			
		||||
  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  //  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  int nl=1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
@@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  
 | 
			
		||||
  GridBase *FullGrid  = lhs._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  
 | 
			
		||||
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
 | 
			
		||||
  
 | 
			
		||||
  Lattice<vobj> Lslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Lslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  
 | 
			
		||||
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 | 
			
		||||
 | 
			
		||||
  assert( FullGrid->_simd_layout[Orthog]==1);
 | 
			
		||||
  int nh =  FullGrid->_ndimension;
 | 
			
		||||
  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  //  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  int nl = nh-1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
@@ -550,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
      mat += mat_thread;
 | 
			
		||||
    }  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nblock;i++){
 | 
			
		||||
  for(int j=0;j<Nblock;j++){
 | 
			
		||||
    ComplexD sum = mat(i,j);
 | 
			
		||||
    FullGrid->GlobalSum(sum);
 | 
			
		||||
    mat(i,j)=sum;
 | 
			
		||||
  }}
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
  int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM
 
@@ -29,7 +29,7 @@
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H

#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
#define USE_MPI_IO
#else
#undef  USE_MPI_IO
@@ -99,34 +99,38 @@ class BinaryIO {
 | 
			
		||||
    NerscChecksum(grid,scalardata,nersc_csum);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
 | 
			
		||||
  template <class fobj>
 | 
			
		||||
  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
 | 
			
		||||
  {
 | 
			
		||||
    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
 | 
			
		||||
    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    uint64_t lsites              =grid->lSites();
 | 
			
		||||
    if (fbuf.size()==1) {
 | 
			
		||||
      lsites=1;
 | 
			
		||||
    uint64_t lsites = grid->lSites();
 | 
			
		||||
    if (fbuf.size() == 1)
 | 
			
		||||
    {
 | 
			
		||||
      lsites = 1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
      uint32_t nersc_csum_thr=0;
 | 
			
		||||
      uint32_t nersc_csum_thr = 0;
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(uint64_t local_site=0;local_site<lsites;local_site++){
 | 
			
		||||
	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
 | 
			
		||||
	for(uint64_t j=0;j<size32;j++){
 | 
			
		||||
	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
 | 
			
		||||
	}
 | 
			
		||||
      #pragma omp for
 | 
			
		||||
      for (uint64_t local_site = 0; local_site < lsites; local_site++)
 | 
			
		||||
      {
 | 
			
		||||
        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
 | 
			
		||||
        for (uint64_t j = 0; j < size32; j++)
 | 
			
		||||
        {
 | 
			
		||||
          nersc_csum_thr = nersc_csum_thr + site_buf[j];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
#pragma omp critical
 | 
			
		||||
      #pragma omp critical
 | 
			
		||||
      {
 | 
			
		||||
	nersc_csum  += nersc_csum_thr;
 | 
			
		||||
        nersc_csum += nersc_csum_thr;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
 | 
			
		||||
  {
 | 
			
		||||
    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
 | 
			
		||||
@@ -363,17 +367,21 @@ class BinaryIO {
 | 
			
		||||
	assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
      } else {
 | 
			
		||||
	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
 | 
			
		||||
		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 | 
			
		||||
	std::ifstream fin;
 | 
			
		||||
	fin.open(file,std::ios::binary|std::ios::in);
 | 
			
		||||
	if ( control & BINARYIO_MASTER_APPEND )  {
 | 
			
		||||
	  fin.seekg(-sizeof(fobj),fin.end);
 | 
			
		||||
	} else { 
 | 
			
		||||
	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
 | 
			
		||||
	}
 | 
			
		||||
	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
 | 
			
		||||
	fin.close();
 | 
			
		||||
        std::cout << GridLogMessage << "C++ read I/O " << file << " : "
 | 
			
		||||
                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
 | 
			
		||||
        std::ifstream fin;
 | 
			
		||||
        fin.open(file, std::ios::binary | std::ios::in);
 | 
			
		||||
        if (control & BINARYIO_MASTER_APPEND)
 | 
			
		||||
        {
 | 
			
		||||
          fin.seekg(-sizeof(fobj), fin.end);
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          fin.seekg(offset + myrank * lsites * sizeof(fobj));
 | 
			
		||||
        }
 | 
			
		||||
        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
 | 
			
		||||
        assert(fin.fail() == 0);
 | 
			
		||||
        fin.close();
 | 
			
		||||
      }
 | 
			
		||||
      timer.Stop();
 | 
			
		||||
 | 
			
		||||
@@ -405,30 +413,78 @@ class BinaryIO {
 | 
			
		||||
      timer.Start();
 | 
			
		||||
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 | 
			
		||||
#ifdef USE_MPI_IO
 | 
			
		||||
	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
 | 
			
		||||
	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
 | 
			
		||||
	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
 | 
			
		||||
	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
 | 
			
		||||
	MPI_File_close(&fh);
 | 
			
		||||
	MPI_Type_free(&fileArray);
 | 
			
		||||
	MPI_Type_free(&localArray);
 | 
			
		||||
        std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 | 
			
		||||
        std::cout << GridLogMessage << "Checking for errors" << std::endl;
 | 
			
		||||
        if (ierr != MPI_SUCCESS)
 | 
			
		||||
        {
 | 
			
		||||
          char error_string[BUFSIZ];
 | 
			
		||||
          int length_of_error_string, error_class;
 | 
			
		||||
 | 
			
		||||
          MPI_Error_class(ierr, &error_class);
 | 
			
		||||
          MPI_Error_string(error_class, error_string, &length_of_error_string);
 | 
			
		||||
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
 | 
			
		||||
          MPI_Error_string(ierr, error_string, &length_of_error_string);
 | 
			
		||||
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
 | 
			
		||||
          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
 | 
			
		||||
        assert(ierr == 0);
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
 | 
			
		||||
        assert(ierr == 0);
 | 
			
		||||
 | 
			
		||||
        MPI_File_close(&fh);
 | 
			
		||||
        MPI_Type_free(&fileArray);
 | 
			
		||||
        MPI_Type_free(&localArray);
 | 
			
		||||
#else 
 | 
			
		||||
	assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
      } else { 
 | 
			
		||||
	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
			
		||||
	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
 | 
			
		||||
		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 | 
			
		||||
	if ( control & BINARYIO_MASTER_APPEND )  {
 | 
			
		||||
        
 | 
			
		||||
	std::ofstream fout; 
 | 
			
		||||
  fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
 | 
			
		||||
  try {
 | 
			
		||||
    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
			
		||||
  } catch (const std::fstream::failure& exc) {
 | 
			
		||||
    std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
 | 
			
		||||
    std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
 | 
			
		||||
    std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
 | 
			
		||||
    #ifdef USE_MPI_IO
 | 
			
		||||
    MPI_Abort(MPI_COMM_WORLD,1);
 | 
			
		||||
    #else
 | 
			
		||||
    exit(1);
 | 
			
		||||
    #endif
 | 
			
		||||
  }
 | 
			
		||||
	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
 | 
			
		||||
		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 | 
			
		||||
	
 | 
			
		||||
  if ( control & BINARYIO_MASTER_APPEND )  {
 | 
			
		||||
	  fout.seekp(0,fout.end);
 | 
			
		||||
	} else {
 | 
			
		||||
	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
 | 
			
		||||
	}
 | 
			
		||||
	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
 | 
			
		||||
  
 | 
			
		||||
  try {
 | 
			
		||||
  	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
 | 
			
		||||
  }
 | 
			
		||||
  catch (const std::fstream::failure& exc) {
 | 
			
		||||
    std::cout << "Exception in writing file " << file << std::endl;
 | 
			
		||||
    std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
 | 
			
		||||
    #ifdef USE_MPI_IO
 | 
			
		||||
    MPI_Abort(MPI_COMM_WORLD,1);
 | 
			
		||||
    #else
 | 
			
		||||
    exit(1);
 | 
			
		||||
    #endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
	fout.close();
 | 
			
		||||
      }
 | 
			
		||||
      timer.Stop();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  timer.Stop();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage<<"IOobject: ";
 | 
			
		||||
    if ( control & BINARYIO_READ) std::cout << " read  ";
 | 
			
		||||
@@ -442,11 +498,14 @@ class BinaryIO {
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Safety check
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    grid->Barrier();
 | 
			
		||||
    grid->GlobalSum(nersc_csum);
 | 
			
		||||
    grid->GlobalXOR(scidac_csuma);
 | 
			
		||||
    grid->GlobalXOR(scidac_csumb);
 | 
			
		||||
    grid->Barrier();
 | 
			
		||||
    // if the data size is 1 we do not want to sum over the MPI ranks
 | 
			
		||||
    if (iodata.size() != 1){
 | 
			
		||||
      grid->Barrier();
 | 
			
		||||
      grid->GlobalSum(nersc_csum);
 | 
			
		||||
      grid->GlobalXOR(scidac_csuma);
 | 
			
		||||
      grid->GlobalXOR(scidac_csumb);
 | 
			
		||||
      grid->Barrier();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -546,9 +605,9 @@ class BinaryIO {
 | 
			
		||||
    int gsites = grid->gSites();
 | 
			
		||||
    int lsites = grid->lSites();
 | 
			
		||||
 | 
			
		||||
    uint32_t nersc_csum_tmp;
 | 
			
		||||
    uint32_t scidac_csuma_tmp;
 | 
			
		||||
    uint32_t scidac_csumb_tmp;
 | 
			
		||||
    uint32_t nersc_csum_tmp   = 0;
 | 
			
		||||
    uint32_t scidac_csuma_tmp = 0;
 | 
			
		||||
    uint32_t scidac_csumb_tmp = 0;
 | 
			
		||||
 | 
			
		||||
    GridStopWatch timer;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  for(int i=0; i < Ls; i++){
 | 
			
		||||
    as[i] = 1.0;
 | 
			
		||||
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
 | 
			
		||||
    //    assert(fabs(omega[i])>0.0);
 | 
			
		||||
    assert(omega[i]!=Coeff_t(0.0));
 | 
			
		||||
    bs[i] = 0.5*(bpc/omega[i] + bmc);
 | 
			
		||||
    cs[i] = 0.5*(bpc/omega[i] - bmc);
 | 
			
		||||
  }
 | 
			
		||||
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  
 | 
			
		||||
  for(int i=0;i<Ls;i++){
 | 
			
		||||
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
 | 
			
		||||
    //    assert(fabs(bee[i])>0.0);
 | 
			
		||||
    assert(bee[i]!=Coeff_t(0.0));
 | 
			
		||||
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
 | 
			
		||||
    beo[i]=as[i]*bs[i];
 | 
			
		||||
    ceo[i]=-as[i]*cs[i];
 | 
			
		||||
@@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
    
 | 
			
		||||
    if ( i < Ls-1 ) {
 | 
			
		||||
 | 
			
		||||
      assert(bee[i]!=Coeff_t(0.0));
 | 
			
		||||
      assert(bee[0]!=Coeff_t(0.0));
 | 
			
		||||
      
 | 
			
		||||
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
 | 
			
		||||
      
 | 
			
		||||
      leem[i]=mass*cee[Ls-1]/bee[0];
 | 
			
		||||
      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
 | 
			
		||||
      for(int j=0;j<i;j++) {
 | 
			
		||||
	assert(bee[j+1]!=Coeff_t(0.0));
 | 
			
		||||
	leem[i]*= aee[j]/bee[j+1];
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
 | 
			
		||||
      
 | 
			
		||||
@@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  { 
 | 
			
		||||
    Coeff_t delta_d=mass*cee[Ls-1];
 | 
			
		||||
    for(int j=0;j<Ls-1;j++) {
 | 
			
		||||
      //      assert(fabs(bee[j])>0.0);
 | 
			
		||||
      assert(bee[j] != Coeff_t(0.0));
 | 
			
		||||
      delta_d *= cee[j]/bee[j];
 | 
			
		||||
    }
 | 
			
		||||
    dee[Ls-1] += delta_d;
 | 
			
		||||
 
 | 
			
		||||
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
{
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  int LLs = in._grid->_rdimensions[0];
 | 
			
		||||
  st.HaloExchange(in,compressor);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  DhopTotalTime -= usecond();
 | 
			
		||||
  DhopCommTime -= usecond();
 | 
			
		||||
  st.HaloExchange(in,compressor);
 | 
			
		||||
  DhopCommTime += usecond();
 | 
			
		||||
  
 | 
			
		||||
  DhopComputeTime -= usecond();
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
 | 
			
		||||
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime += usecond();
 | 
			
		||||
  DhopTotalTime   += usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=1;
 | 
			
		||||
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
 | 
			
		||||
  conformable(in._grid,out._grid); // drops the cb check
 | 
			
		||||
 | 
			
		||||
@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=1;
 | 
			
		||||
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
 | 
			
		||||
  conformable(in._grid,out._grid); // drops the cb check
 | 
			
		||||
 | 
			
		||||
@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=2;
 | 
			
		||||
  conformable(in._grid,FermionGrid()); // verifies full grid
 | 
			
		||||
  conformable(in._grid,out._grid);
 | 
			
		||||
 | 
			
		||||
@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
 | 
			
		||||
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 | 
			
		||||
{
 | 
			
		||||
  std::vector<int> latt = GridDefaultLatt();          
 | 
			
		||||
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
  RealD NP = _FourDimGrid->_Nprocessors;
 | 
			
		||||
  RealD NN = _FourDimGrid->NodeCount();
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
 | 
			
		||||
	    << DhopCalls   << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
 | 
			
		||||
	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
 | 
			
		||||
	    << DhopCommTime    / DhopCalls << " us" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
 | 
			
		||||
	    << DhopComputeTime / DhopCalls << " us" << std::endl;
 | 
			
		||||
 | 
			
		||||
  // Average the compute time
 | 
			
		||||
  _FourDimGrid->GlobalSum(DhopComputeTime);
 | 
			
		||||
  DhopComputeTime/=NP;
 | 
			
		||||
 | 
			
		||||
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 | 
			
		||||
}
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls       = 0;
 | 
			
		||||
  DhopTotalTime    = 0;
 | 
			
		||||
  DhopCommTime    = 0;
 | 
			
		||||
  DhopComputeTime = 0;
 | 
			
		||||
  Stencil.ZeroCounters();
 | 
			
		||||
  StencilEven.ZeroCounters();
 | 
			
		||||
  StencilOdd.ZeroCounters();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Implement the general interface. Here we use SAME mass on all slices
 | 
			
		||||
 
 | 
			
		||||
@@ -55,6 +55,16 @@ namespace QCD {
 | 
			
		||||
      FermionField _tmp;
 | 
			
		||||
      FermionField &tmp(void) { return _tmp; }
 | 
			
		||||
 | 
			
		||||
      ////////////////////////////////////////
 | 
			
		||||
      // Performance monitoring
 | 
			
		||||
      ////////////////////////////////////////
 | 
			
		||||
      void Report(void);
 | 
			
		||||
      void ZeroCounters(void);
 | 
			
		||||
      double DhopTotalTime;
 | 
			
		||||
      double DhopCalls;
 | 
			
		||||
      double DhopCommTime;
 | 
			
		||||
      double DhopComputeTime;
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      // Implement the abstract base
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 | 
			
		||||
template<class vobj,class cobj>
 | 
			
		||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  double timer0;
 | 
			
		||||
  double timer1;
 | 
			
		||||
  double timer2;
 | 
			
		||||
  double timer3;
 | 
			
		||||
  double timer4;
 | 
			
		||||
  double timer5;
 | 
			
		||||
  double timer6;
 | 
			
		||||
  uint64_t callsi;
 | 
			
		||||
  void ZeroCountersi(void)
 | 
			
		||||
  {
 | 
			
		||||
    timer0=0;
 | 
			
		||||
    timer1=0;
 | 
			
		||||
    timer2=0;
 | 
			
		||||
    timer3=0;
 | 
			
		||||
    timer4=0;
 | 
			
		||||
    timer5=0;
 | 
			
		||||
    timer6=0;
 | 
			
		||||
    callsi=0;
 | 
			
		||||
  }
 | 
			
		||||
  void Reporti(int calls)
 | 
			
		||||
  {
 | 
			
		||||
    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
 | 
			
		||||
    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
 | 
			
		||||
    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
 | 
			
		||||
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
 | 
			
		||||
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> same_node;
 | 
			
		||||
@@ -252,6 +278,7 @@ public:
 | 
			
		||||
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
 | 
			
		||||
    same_node(npoints)
 | 
			
		||||
  { 
 | 
			
		||||
    ZeroCountersi();
 | 
			
		||||
    surface_list.resize(0);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -261,7 +288,6 @@ public:
 | 
			
		||||
    // Here we know the distance is 1 for WilsonStencil
 | 
			
		||||
    for(int point=0;point<this->_npoints;point++){
 | 
			
		||||
      same_node[point] = this->SameNode(point);
 | 
			
		||||
      //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    for(int site = 0 ;site< vol4;site++){
 | 
			
		||||
@@ -282,17 +308,28 @@ public:
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
    this->HaloExchangeOptGather(source,compress);
 | 
			
		||||
    this->CommunicateBegin(reqs);
 | 
			
		||||
    this->CommunicateComplete(reqs);
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    // Asynchronous MPI calls multidirectional, Isend etc...
 | 
			
		||||
    //    this->CommunicateBegin(reqs);
 | 
			
		||||
    //    this->CommunicateComplete(reqs);
 | 
			
		||||
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
 | 
			
		||||
    this->Communicate();
 | 
			
		||||
    double t2=usecond(); timer1 += t2-t1;
 | 
			
		||||
    this->CommsMerge(compress);
 | 
			
		||||
    double t3=usecond(); timer2 += t3-t2;
 | 
			
		||||
    this->CommsMergeSHM(compress);
 | 
			
		||||
    double t4=usecond(); timer3 += t4-t3;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
  {
 | 
			
		||||
    this->Prepare();
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    this->HaloGatherOpt(source,compress);
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    timer0 += t1-t0;
 | 
			
		||||
    callsi++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
@@ -304,7 +341,9 @@ public:
 | 
			
		||||
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
 | 
			
		||||
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 | 
			
		||||
 | 
			
		||||
    this->mpi3synctime_g-=usecond();
 | 
			
		||||
    this->_grid->StencilBarrier();
 | 
			
		||||
    this->mpi3synctime_g+=usecond();
 | 
			
		||||
 | 
			
		||||
    assert(source._grid==this->_grid);
 | 
			
		||||
    this->halogtime-=usecond();
 | 
			
		||||
@@ -323,7 +362,6 @@ public:
 | 
			
		||||
    int dag = compress.dag;
 | 
			
		||||
    int face_idx=0;
 | 
			
		||||
    if ( dag ) { 
 | 
			
		||||
      //	std::cout << " Optimised Dagger compress " <<std::endl;
 | 
			
		||||
      assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
 | 
			
		||||
      assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
 | 
			
		||||
      assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
 | 
			
		||||
 
 | 
			
		||||
@@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  int vol4;
 | 
			
		||||
  vol4=FourDimGrid.oSites();
 | 
			
		||||
  Stencil.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
 | 
			
		||||
  vol4=FourDimRedBlackGrid.oSites();
 | 
			
		||||
  StencilEven.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
   StencilOdd.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
 | 
			
		||||
                       <<" " << StencilEven.surface_list.size()<<std::endl;
 | 
			
		||||
   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
 | 
			
		||||
   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
     
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::Report(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<int> latt = GridDefaultLatt();          
 | 
			
		||||
    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
    RealD NP = _FourDimGrid->_Nprocessors;
 | 
			
		||||
    RealD NN = _FourDimGrid->NodeCount();
 | 
			
		||||
  RealD NP     = _FourDimGrid->_Nprocessors;
 | 
			
		||||
  RealD NN     = _FourDimGrid->NodeCount();
 | 
			
		||||
  RealD volume = Ls;  
 | 
			
		||||
  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
 | 
			
		||||
  if ( DhopCalls > 0 ) {
 | 
			
		||||
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
 | 
			
		||||
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 | 
			
		||||
  }
 | 
			
		||||
  if ( DhopCalls > 0){
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
 | 
			
		||||
  Stencil.ZeroCounters();
 | 
			
		||||
  StencilEven.ZeroCounters();
 | 
			
		||||
  StencilOdd.ZeroCounters();
 | 
			
		||||
  Stencil.ZeroCountersi();
 | 
			
		||||
  StencilEven.ZeroCountersi();
 | 
			
		||||
  StencilOdd.ZeroCountersi();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.HaloExchangeOptGather(in,compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
 | 
			
		||||
  // Rely on async comms; start comms before merge of local data
 | 
			
		||||
  DhopCommTime-=usecond();
 | 
			
		||||
  st.CommunicateBegin(reqs);
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMergeSHM(compressor);
 | 
			
		||||
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  // Perhaps use omp task and region
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  double ctime=0;
 | 
			
		||||
  double ptime=0;
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Ugly explicit thread mapping introduced for OPA reasons.
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
 | 
			
		||||
  { 
 | 
			
		||||
    int tid = omp_get_thread_num();
 | 
			
		||||
    int nthreads = omp_get_num_threads();
 | 
			
		||||
    int me = omp_get_thread_num();
 | 
			
		||||
    int myoff, mywork;
 | 
			
		||||
    int ncomms = CartesianCommunicator::nCommThreads;
 | 
			
		||||
    if (ncomms == -1) ncomms = 1;
 | 
			
		||||
    assert(nthreads > ncomms);
 | 
			
		||||
    if (tid >= ncomms) {
 | 
			
		||||
      double start = usecond();
 | 
			
		||||
      nthreads -= ncomms;
 | 
			
		||||
      int ttid = tid - ncomms;
 | 
			
		||||
      int n = U._grid->oSites();
 | 
			
		||||
      int chunk = n / nthreads;
 | 
			
		||||
      int rem = n % nthreads;
 | 
			
		||||
      int myblock, myn;
 | 
			
		||||
      if (ttid < rem) {
 | 
			
		||||
	myblock = ttid * chunk + ttid;
 | 
			
		||||
	myn = chunk+1;
 | 
			
		||||
      } else {
 | 
			
		||||
	myblock = ttid*chunk + rem;
 | 
			
		||||
	myn = chunk;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
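      // Worked example of the block split above (illustrative only, not part
      // of this commit): n = 10 sites shared by nthreads = 4 compute threads
      // gives chunk = 2, rem = 2, and
      //   ttid 0: myblock = 0, myn = 3   (sites 0,1,2)
      //   ttid 1: myblock = 3, myn = 3   (sites 3,4,5)
      //   ttid 2: myblock = 6, myn = 2   (sites 6,7)
      //   ttid 3: myblock = 8, myn = 2   (sites 8,9)
      // so every site is covered exactly once and block sizes differ by at most one.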
    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
    int sF = LLs * myoff;

    if ( me == 0 ) {
      st.CommunicateComplete(reqs);
      DhopCommTime+=usecond();
    } else { 
      // Interior links in stencil
      if ( me==1 ) DhopComputeTime-=usecond();
      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
      else                  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
      if ( me==1 ) DhopComputeTime+=usecond();
      // do the compute
      if (dag == DaggerYes) {
	for (int ss = myblock; ss < myblock+myn; ++ss) {
	  int sU = ss;
	  int sF = LLs * sU;
	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
	}
      } else {
	for (int ss = myblock; ss < myblock+myn; ++ss) {
	  int sU = ss;
	  int sF = LLs * sU;
	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
	}
      }
      ptime = usecond() - start;
    }
    {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
  DhopCommTime += ctime;
  DhopComputeTime+=ptime;

  // First to enter, last to leave timing
  st.CollateThreads();

  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  // Load imbalance alert. Should use a dynamic-schedule OMP for-loop.
  // Perhaps create a list of only those sites with face work, and
  // load-balance process the list.
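  // A possible load-balanced variant of the surface loop below (a sketch, not
  // part of this commit; assumes the same DhopSite flag convention as above,
  // with the final two arguments meaning interior=0, exterior=1):
  //
  //   #pragma omp parallel for schedule(dynamic,1)
  //   for (int s = 0; s < st.surface_list.size(); s++) {
  //     int sU = st.surface_list[s];
  //     int sF = LLs * sU;
  //     Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
  //   }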
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
#else 
  assert(0);
#endif

}



template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
					 DoubledGaugeField & U,

@@ -165,7 +165,7 @@ class HMCResourceManager {
  // Grids
  //////////////////////////////////////////////////////////////

  void AddGrid(std::string s, GridModule& M) {
  void AddGrid(const std::string s, GridModule& M) {
    // Check for name clashes
    auto search = Grids.find(s);
    if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
      exit(1);
    }
    Grids[s] = std::move(M);
    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
    Grids[s].show_full_decomposition();
    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
  }

  // Add a named grid set, 4d shortcut
  void AddFourDimGrid(std::string s) {
  void AddFourDimGrid(const std::string s) {
    GridFourDimModule<vComplex> Mod;
    AddGrid(s, Mod);
  }

  // Add a named grid set, 4d shortcut + tweak simd lanes
  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
    GridFourDimModule<vComplex> Mod(simd_decomposition);
    AddGrid(s, Mod);
  }


  GridCartesian* GetCartesian(std::string s = "") {

@@ -43,11 +43,12 @@ public:
  std::string, lattice,
  std::string, mpi);

  std::vector<int> getLattice(){return strToVec<int>(lattice);}
  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}
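  // Illustrative use of the accessors above (a sketch; assumes strToVec
  // parses whitespace-separated integers and that the members are filled by
  // the serialisation layer or by hand; the instance name is hypothetical):
  //
  //   GridModuleParameters p;
  //   p.lattice = "16 16 16 32";
  //   p.mpi     = "1 1 1 2";
  //   std::vector<int> L = p.getLattice(); // {16,16,16,32}
  //   std::vector<int> M = p.getMpi();     // {1,1,1,2}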

  void check(){
    if (getLattice().size() != getMpi().size()) {

  void check() const {
    if (getLattice().size() != getMpi().size() ) {
      std::cout << GridLogError
                << "Error in GridModuleParameters: lattice and mpi dimensions "
                   "do not match"
@@ -84,6 +85,8 @@ class GridModule {

  void set_full(GridCartesian* grid) { grid_.reset(grid); }
  void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
  void show_full_decomposition(){ grid_->show_decomposition(); }
  void show_rb_decomposition(){ rbgrid_->show_decomposition(); }

 protected:
  std::unique_ptr<GridCartesian> grid_;
@@ -95,31 +98,72 @@ class GridModule {
// Classes for the user
////////////////////////////////////
// Note: the space time grid should be out of the QCD namespace
template< class vector_type>
class GridFourDimModule : public GridModule {
 public:
  GridFourDimModule() {
template <class vector_type>
class GridFourDimModule : public GridModule
{
public:
  GridFourDimModule()
  {
    using namespace QCD;
    set_full(SpaceTimeGrid::makeFourDimGrid(
        GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
        GridDefaultLatt(), 
        GridDefaultSimd(4, vector_type::Nsimd()),
        GridDefaultMpi()));
    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
  }

  GridFourDimModule(GridModuleParameters Params) {
  GridFourDimModule(const std::vector<int> tweak_simd)
  {
    using namespace QCD;
    if (tweak_simd.size() != 4)
    {
      std::cout << GridLogError
                << "Error in GridFourDimModule: SIMD size different from 4" 
                << std::endl;
      exit(1);
    }

    // Checks that the product agrees with the expectation
    int simd_product = 1;
    for (auto &n : tweak_simd)
      simd_product *= n;
    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_product << std::endl;

    if (simd_product == vector_type::Nsimd())
    {
      set_full(SpaceTimeGrid::makeFourDimGrid(
          GridDefaultLatt(), 
          tweak_simd, 
          GridDefaultMpi()));
      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
    }
    else
    {
      std::cout << GridLogError 
                << "Error in GridFourDimModule: SIMD lanes must multiply to " 
                << vector_type::Nsimd() 
                << std::endl;
    }
  }
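  // Illustrative use of the tweak_simd constructor above (a sketch; assumes
  // vector_type = vComplex with Nsimd() == 4, and the variable name is
  // hypothetical):
  //
  //   GridFourDimModule<vComplex> gauge_mod({2, 2, 1, 1}); // 2*2*1*1 == Nsimd()
  //   gauge_mod.show_full_decomposition();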

  GridFourDimModule(const GridModuleParameters Params)
  {
    using namespace QCD;
    Params.check();
    std::vector<int> lattice_v = Params.getLattice();
    std::vector<int> mpi_v = Params.getMpi();
    if (lattice_v.size() == 4) {
    if (lattice_v.size() == 4)
    {
      set_full(SpaceTimeGrid::makeFourDimGrid(
          lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
          lattice_v, 
          GridDefaultSimd(4, vector_type::Nsimd()),
          mpi_v));
      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
    } else {
    }
    else
    {
      std::cout << GridLogError
          << "Error in GridFourDimModule: lattice dimension different from 4"
          << std::endl;
                << "Error in GridFourDimModule: lattice dimension different from 4"
                << std::endl;
      exit(1);
    }
  }

@@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
  using ObsBase::ObsBase; // for constructors



  // acquire resource
  virtual void initialize(){
    this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  PlaquetteMod(): ObsBase(NoParameters()){}
};


template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
  typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
  using ObsBase::ObsBase; // for constructors



  // acquire resource
  virtual void initialize(){
    this->ObservablePtr.reset(new TopologicalCharge<Impl>());
    this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
  }
  public:
  TopologicalChargeMod(): ObsBase(NoParameters()){}
  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
  TopologicalChargeMod(): ObsBase(){}
};



}// QCD temporarily here



@@ -33,9 +33,45 @@ directory
namespace Grid {
namespace QCD {

struct TopologySmearingParameters : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
    int, steps,
    float, step_size,
    int, meas_interval,
    float, maxTau);

    TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
        steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}

    template < class ReaderClass >
    TopologySmearingParameters(Reader<ReaderClass>& Reader){
        read(Reader, "Smearing", *this);
    }
};



struct TopologyObsParameters : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
      int, interval,
      bool, do_smearing,
      TopologySmearingParameters, Smearing);

    TopologyObsParameters(int interval = 1, bool smearing = false):
        interval(interval), do_smearing(smearing){}

    template <class ReaderClass >
    TopologyObsParameters(Reader<ReaderClass>& Reader){
        read(Reader, "TopologyMeasurement", *this);
    }
};
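// A minimal usage sketch for the two parameter structs above (illustrative
// only; the file name is hypothetical, and the XML section names follow the
// strings passed to read() in the constructors):
//
//   TopologyObsParameters TP(10, true);   // measure every 10 trajectories, smeared
//   TP.Smearing = TopologySmearingParameters(250, 0.01f, 50, 2.5f);
//
// or, reading a file that contains a <TopologyMeasurement> section:
//
//   XmlReader R("observable_params.xml");
//   TopologyObsParameters TP(R);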


// this is only defined for a gauge theory
template <class Impl>
class TopologicalCharge : public HmcObservable<typename Impl::Field> {
    TopologyObsParameters Pars;

 public:
    // this forces Impl to be a gauge-field implementation;
    // if not, the compiler will complain
@@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
    // necessary for HmcObservable compatibility
    typedef typename Impl::Field Field;

    TopologicalCharge(int interval = 1, bool do_smearing = false):
        Pars(interval, do_smearing){}

    TopologicalCharge(TopologyObsParameters P):Pars(P){
        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
    }

    void TrajectoryComplete(int traj,
                            Field &U,
                            GridSerialRNG &sRNG,
                            GridParallelRNG &pRNG) {

    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
    if (traj%Pars.interval == 0){
        // Smearing
        Field Usmear = U;
        int def_prec = std::cout.precision();

    int def_prec = std::cout.precision();
        if (Pars.do_smearing){
            // using wilson flow by default here
            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
            Real T0   = WF.energyDensityPlaquette(Usmear);
            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
        }

    std::cout << GridLogMessage
        << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
        << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
        std::cout << GridLogMessage
            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;

    std::cout.precision(def_prec);
        std::cout.precision(def_prec);
        }
    }

};

@@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
//#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;
#ifndef GRID_QCD_GAUGE_FIX_H
#define GRID_QCD_GAUGE_FIX_H
namespace Grid {
namespace QCD {

template <class Gimpl> 
class FourierAcceleratedGaugeFixer  : public Gimpl {
  public:
 public:
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
@@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }  
};

}
}
#endif

@@ -82,11 +82,11 @@ namespace Optimization {
      double tmp[2]={a,b};
      return vld1q_f64(tmp);
    }
    //Real double // N:tbc
    //Real double
    inline float64x2_t operator()(double a){
      return vdupq_n_f64(a);
    }
    //Integer // N:tbc
    //Integer
    inline uint32x4_t operator()(Integer a){
      return vdupq_n_u32(a);
    }
@@ -124,33 +124,32 @@ namespace Optimization {
  // Nils: Vset untested; not used currently in Grid at all;
  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
  struct Vset{
    // Complex float // N:ok
    // Complex float
    inline float32x4_t operator()(Grid::ComplexF *a){
      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
      return vld1q_f32(tmp);
    }
    // Complex double // N:ok
    // Complex double
    inline float64x2_t operator()(Grid::ComplexD *a){
      double tmp[2]={a[0].imag(),a[0].real()};
      return vld1q_f64(tmp);
    }
    // Real float // N:ok
    // Real float
    inline float32x4_t operator()(float *a){
      float tmp[4]={a[3],a[2],a[1],a[0]};
      return vld1q_f32(tmp);
    }
    // Real double // N:ok
    // Real double
    inline float64x2_t operator()(double *a){
      double tmp[2]={a[1],a[0]};
      return vld1q_f64(tmp);
    }
    // Integer // N:ok
    // Integer
    inline uint32x4_t operator()(Integer *a){
      return vld1q_dup_u32(a);
    }
  };

  // N:leaving as is
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
@@ -249,9 +248,9 @@ namespace Optimization {
      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...

      // no fma, use mul and add
      //float32x4_t r5;
      //r5 = vmulq_f32(r0, a);
      //return vaddq_f32(r4, r5);
      // float32x4_t r5;
      // r5 = vmulq_f32(r0, a);
      // return vaddq_f32(r4, r5);
    }
    // Complex double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
@@ -272,9 +271,9 @@ namespace Optimization {
      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi

      // no fma, use mul and add
      //float64x2_t r5;
      //r5 = vmulq_f64(r0, a);
      //return vaddq_f64(r4, r5);
      // float64x2_t r5;
      // r5 = vmulq_f64(r0, a);
      // return vaddq_f64(r4, r5);
    }
  };

@@ -421,11 +420,6 @@ namespace Optimization {
      }
    }

// working, but no restriction on n
//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };

// restriction on n
    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
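    // Illustrative semantics of the restricted tRotate above (vextq extracts
    // from the concatenation of its two operands starting at lane n):
    //   float32x4_t v = {a0, a1, a2, a3};
    //   tRotate<1>(v) -> {a1, a2, a3, a0}
    //   tRotate<5>(v) -> the same, since 5 % 4 == 1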

@@ -441,7 +435,7 @@ namespace Optimization {
      sb = vcvt_high_f32_f16(h);
      // there is no direct conversion from lower float32x4_t to float64x2_t
      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
      // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
      // workaround for clang
      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
@@ -547,7 +541,7 @@ namespace Optimization {


  //Complex double Reduce
  template<> // N:by Boyle
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
    u128d conv; conv.v = in;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -562,9 +556,7 @@ namespace Optimization {
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
    // FIXME unimplemented
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
    return vaddvq_u32(in);
  }
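  // Note: vaddvq_u32 is the AArch64 across-vector (horizontal) add, so the
  // implementation above maps, e.g., {1,2,3,4} -> 10.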
}

@@ -604,3 +596,4 @@ namespace Optimization {
  typedef Optimization::TimesI      TimesISIMD;

}


@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  // Timing info; ugly; possibly temporary
  /////////////////////////////////////////
  double commtime;
  double mpi3synctime;
  double mpi3synctime_g;
  double shmmergetime;
  double gathertime;
  double gathermtime;
  double halogtime;
@@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  double splicetime;
  double nosplicetime;
  double calls;
  std::vector<double> comm_bytes_thr;
  std::vector<double> comm_time_thr;
  std::vector<double> comm_enter_thr;
  std::vector<double> comm_leave_thr;

  ////////////////////////////////////////
  // Stencil query
@@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  //////////////////////////////////////////
  // Comms packet queue for asynch thread
  //////////////////////////////////////////
  void CommunicateThreaded()
  {
#ifdef GRID_OMP
    // must be called in parallel region
    int mythread = omp_get_thread_num();
    int nthreads = CartesianCommunicator::nCommThreads;
#else
    int mythread = 0;
    int nthreads = 1;
#endif
    if (nthreads == -1) nthreads = 1;
    if (mythread < nthreads) {
      comm_enter_thr[mythread] = usecond();
      for (int i = mythread; i < Packets.size(); i += nthreads) {
	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
						      Packets[i].to_rank,
						      Packets[i].recv_buf,
						      Packets[i].from_rank,
						      Packets[i].bytes,i);
	comm_bytes_thr[mythread] += bytes;
      }
      comm_leave_thr[mythread]= usecond();
      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
    }
  }

  void CollateThreads(void)
  {
    int nthreads = CartesianCommunicator::nCommThreads;
    double first=0.0;
    double last =0.0;

    for(int t=0;t<nthreads;t++) {

      double t0 = comm_enter_thr[t];
      double t1 = comm_leave_thr[t];
      comms_bytes+=comm_bytes_thr[t];

      comm_enter_thr[t] = 0.0;
      comm_leave_thr[t] = 0.0;
      comm_time_thr[t]   = 0.0;
      comm_bytes_thr[t]=0;

      if ( first == 0.0 ) first = t0;                   // first is t0
      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen

      if ( t1 > last ) last = t1;                       // max time seen

    }
    commtime+= last-first;
  }
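  // Worked example of the first-in/last-out accounting above (illustrative):
  // two comms threads entering at t0 = {100us, 120us} and leaving at
  // t1 = {180us, 160us} give first = 100us, last = 180us, so commtime is
  // charged 80us of wall clock rather than the 120us sum of per-thread times.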
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
					  Packets[i].send_buf,
					  Packets[i].to_rank,
					  Packets[i].recv_buf,
					  Packets[i].from_rank,
					  Packets[i].bytes);
						     Packets[i].send_buf,
						     Packets[i].to_rank,
						     Packets[i].recv_buf,
						     Packets[i].from_rank,
						     Packets[i].bytes,i);
    }
  }

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromComplete(reqs[i]);
      _grid->StencilSendToRecvFromComplete(reqs[i],i);
    }
    commtime+=usecond();
  }
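  // Typical overlap pattern built from the pair above (a sketch;
  // "interior_compute" stands in for the caller's bulk work and is
  // hypothetical):
  //
  //   std::vector<std::vector<CommsRequest_t> > reqs;
  //   CommunicateBegin(reqs);     // post asynchronous sends/receives
  //   interior_compute();         // overlapped work needing no halo data
  //   CommunicateComplete(reqs);  // wait before touching received faces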
  void Communicate(void)
  {
#ifdef GRID_OMP
#pragma omp parallel 
    {
      // must be called in parallel region
      int mythread  = omp_get_thread_num();
      int maxthreads= omp_get_max_threads();
      int nthreads = CartesianCommunicator::nCommThreads;
      assert(nthreads <= maxthreads);

      if (nthreads == -1) nthreads = 1;
#else
      int mythread = 0;
      int nthreads = 1;
#endif
      if (mythread < nthreads) {
	for (int i = mythread; i < Packets.size(); i += nthreads) {
	  double start = usecond();
	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
								   Packets[i].to_rank,
								   Packets[i].recv_buf,
								   Packets[i].from_rank,
								   Packets[i].bytes,i);
	  comm_time_thr[mythread] += usecond() - start;
	}
      }
#ifdef GRID_OMP
    }
#endif
  }

  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    Prepare();
    HaloGather(source,compress);
    CommunicateBegin(reqs);
    CommunicateComplete(reqs);
    // Concurrent
    //CommunicateBegin(reqs);
    //CommunicateComplete(reqs);
    // Sequential, possibly threaded
    Communicate();
    CommsMergeSHM(compress); 
    CommsMerge(compress); 
  }
@@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    mpi3synctime_g-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single node
    mpi3synctime_g+=usecond();

    // conformable(source._grid,_grid);
    assert(source._grid==_grid);
@@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    CommsMerge(decompress,Mergers,Decompressions); 
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
    mpi3synctime-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single node
    mpi3synctime+=usecond();
    shmmergetime-=usecond();
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
    shmmergetime+=usecond();
  }

  template<class decompressor>
@@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
		  int checkerboard,
		  const std::vector<int> &directions,
		  const std::vector<int> &distances) 
   :   _permute_type(npoints), _comm_buf_size(npoints)
   : _permute_type(npoints), 
    _comm_buf_size(npoints),
    comm_bytes_thr(npoints), 
    comm_enter_thr(npoints),
    comm_leave_thr(npoints), 
    comm_time_thr(npoints)
  {
    face_table_computed=0;
    _npoints = npoints;
@@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  void ZeroCounters(void) {
    gathertime = 0.;
    commtime = 0.;
    mpi3synctime=0.;
    mpi3synctime_g=0.;
    shmmergetime=0.;
    for(int i=0;i<_npoints;i++){
      comm_time_thr[i]=0;
      comm_bytes_thr[i]=0;
      comm_enter_thr[i]=0;
      comm_leave_thr[i]=0;
    }
    halogtime = 0.;
    mergetime = 0.;
    decompresstime = 0.;
@@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
    double t = 0;
    // if comm_time_thr is set they were all done in parallel so take the max
    // but add up the bytes
    int threaded = 0 ;
    for (int i = 0; i < 8; ++i) {
      if ( comm_time_thr[i]>0.0 ) {
	threaded = 1;
	comms_bytes += comm_bytes_thr[i];
	if (t < comm_time_thr[i]) t = comm_time_thr[i];
      }
    }
    if (threaded) commtime += t;

    _grid->GlobalSum(commtime);    commtime/=NP;
    if ( calls > 0. ) {
@@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
      }
      PRINTIT(mpi3synctime);
      PRINTIT(mpi3synctime_g);
      PRINTIT(shmmergetime);
      PRINTIT(splicetime);
      PRINTIT(nosplicetime);
    }

@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
                 const iVector<vtype,N> * __restrict__ rhs,
                 const iScalar<mtype> * __restrict__ lhs){
    mult(ret,lhs,rhs);
    for(int c1=0;c1<N;c1++){
        mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
    }
}


@@ -219,9 +219,15 @@ void Grid_init(int *argc,char ***argv)
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
    GridCmdOptionInt(arg,MB);
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
    uint64_t MB64 = MB;
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
  }
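  // The 64-bit arithmetic above avoids a 32-bit overflow: with e.g. --shm 4096
  // the old expression MB*1024*1024 wraps (4096*1024*1024 == 2^32 does not fit
  // in a 32-bit int), while MB64*1024LL*1024LL is evaluated in 64 bits.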

  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
    CartesianCommunicator::Hugepages = 1;
  }


  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -304,6 +310,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +324,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dir at a time "<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
@@ -356,10 +363,15 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }

  CartesianCommunicator::nCommThreads = -1;
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
@@ -374,10 +386,13 @@ void Grid_init(int *argc,char ***argv)
		  Grid_default_latt,
		  Grid_default_mpi);

  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
  if ( CartesianCommunicator::Hugepages) {
    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
@@ -393,7 +408,7 @@ void Grid_init(int *argc,char ***argv)

void Grid_finalize(void)
{
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Finalize();
  Grid_unquiesce_nodes();
#endif

@@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  std::vector<int> seeds({1,2,3,4});
@@ -82,6 +85,7 @@ int main (int argc, char ** argv)

  Uorg = Uorg - Umu;
  std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
  std::cout << " Norm "<< norm2(Umu) << std::endl;


  std::cout<< "*****************************************************************" <<std::endl;

@@ -40,12 +40,6 @@ namespace Grid{
				    double, StoppingCondition,
				    int, MaxCGIterations,
				    bool, ApplySmearing);

    //template <class ReaderClass >
    //FermionParameters(Reader<ReaderClass>& Reader){
    //  read(Reader, "Mobius", *this);
    //}

  };


@@ -113,9 +107,17 @@ int main(int argc, char **argv) {
  bool ApplySmearing = MyParams.Mobius.ApplySmearing;


  // Use this if you want to tweak the default decomposition;
  // commented out as it is very architecture specific

  //std::vector<int> simd_lanes({2,2,1,1});

  // Grid from the command line arguments --grid and --mpi
  // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes

  //TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition

  // Grid from the command line
  TheHMC.Resources.AddFourDimGrid("gauge");
  // Possible to create the module by hand,
  // hardcoding parameters or using a Reader

@@ -66,7 +66,14 @@ int main(int argc, char **argv) {
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  TheHMC.Resources.AddObservable<QObs>();
  TopologyObsParameters TopParams;
  TopParams.interval = 5;
  TopParams.do_smearing = true;
  TopParams.Smearing.steps = 200;
  TopParams.Smearing.step_size = 0.01;
  TopParams.Smearing.meas_interval = 50;
  TopParams.Smearing.maxTau = 2.0; 
  TheHMC.Resources.AddObservable<QObs>(TopParams);
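  // Equivalently (a sketch; assumes an XML file with a <TopologyMeasurement>
  // section matching the TopologyObsParameters reader, file name hypothetical):
  //
  //   XmlReader TopReader("topology_obs.xml");
  //   TopologyObsParameters TopParams(TopReader);
  //   TheHMC.Resources.AddObservable<QObs>(TopParams);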
  //////////////////////////////////////////////

  /////////////////////////////////////////////////////////////

@@ -99,21 +99,27 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  result=zero;
  Ds.ZeroCounters();
  CG(HermOp,src,result);
  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  result=zero;
  Ds.ZeroCounters();
  mCG(HermOp,src,result);
  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
  result=zero;
  Ds.ZeroCounters();
  BCGrQ(HermOp,src,result);
  Ds.Report();
  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
