Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-03 21:44:33 +00:00)

Commit: Merge remote-tracking branch 'origin/develop' into temporary-smearing

.gitignore (vendored): 2 changes
@@ -94,7 +94,7 @@ Thumbs.db
 # build directory #
 ###################
-build/*
+build*/*
 
 # IDE related files #
 #####################
 
.travis.yml: 90 lines (new file)
@@ -0,0 +1,90 @@
language: cpp

cache:
  directories:
    - clang

matrix:
  include:
    - os:        osx
      osx_image: xcode7.2
      compiler: clang
    - os:        osx
      osx_image: xcode7.2
      compiler: gcc
      env: VERSION=-5
    - compiler: gcc
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.9
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-5
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz

before_install:
    - export GRIDDIR=`pwd`
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi

install:
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    - echo $PATH
    - which $CC
    - $CC  --version
    - which $CXX
    - $CXX --version
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi

script:
    - ./scripts/reconfigure_script
    - mkdir build
    - cd build
    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
README.md
@@ -1,4 +1,4 @@
-# Grid
+# Grid [](https://travis-ci.org/paboyle/Grid)
 Data parallel C++ mathematical object library
 
 Last update 2015/7/30
benchmarks/Benchmark_dwf.cc
@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <PerfCount.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
@@ -45,6 +46,10 @@ struct scal {
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
bool overlapComms = false;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
@@ -64,6 +69,12 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
 | 
			
		||||
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
 | 
			
		||||
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
			
		||||
  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
 | 
			
		||||
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
@@ -78,7 +89,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  ColourMatrix cm = Complex(1.0,0.0);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
 | 
			
		||||
  LatticeGaugeField Umu(UGrid); 
 | 
			
		||||
  random(RNG4,Umu);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu5d(FGrid); 
 | 
			
		||||
 | 
			
		||||
  // replicate across fifth dimension
 | 
			
		||||
@@ -119,14 +132,21 @@ int main (int argc, char ** argv)
 | 
			
		||||
  
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
 | 
			
		||||
  for(int doasm=1;doasm<2;doasm++){
 | 
			
		||||
 | 
			
		||||
    QCD::WilsonKernelsStatic::AsmOpt=doasm;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
			
		||||
  
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall=1000;
 | 
			
		||||
  {
 | 
			
		||||
  int ncall =10;
 | 
			
		||||
  if (1) {
 | 
			
		||||
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      __SSC_START;
 | 
			
		||||
      Dw.Dhop(src,result,0);
 | 
			
		||||
      __SSC_STOP;
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    
 | 
			
		||||
@@ -140,9 +160,121 @@ int main (int argc, char ** argv)
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
    err = ref-result; 
 | 
			
		||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
    Dw.Report();
 | 
			
		||||
    //    Dw.Report();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  {
 | 
			
		||||
    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
 | 
			
		||||
    LatticeFermionF ssrc(sFGrid);
 | 
			
		||||
    LatticeFermionF sref(sFGrid);
 | 
			
		||||
    LatticeFermionF sresult(sFGrid);
 | 
			
		||||
    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
 | 
			
		||||
  
 | 
			
		||||
    for(int x=0;x<latt4[0];x++){
 | 
			
		||||
    for(int y=0;y<latt4[1];y++){
 | 
			
		||||
    for(int z=0;z<latt4[2];z++){
 | 
			
		||||
    for(int t=0;t<latt4[3];t++){
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
      std::vector<int> site({s,x,y,z,t});
 | 
			
		||||
      SpinColourVectorF tmp;
 | 
			
		||||
      peekSite(tmp,src,site);
 | 
			
		||||
      pokeSite(tmp,ssrc,site);
 | 
			
		||||
    }}}}}
 | 
			
		||||
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      __SSC_START;
 | 
			
		||||
      sDw.Dhop(ssrc,sresult,0);
 | 
			
		||||
      __SSC_STOP;
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
    double flops=1344*volume*ncall;
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
    //  sDw.Report();
 | 
			
		||||
  
 | 
			
		||||
    if(0){
 | 
			
		||||
      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
	sDw.Dhop(ssrc,sresult,0);
 | 
			
		||||
	PerformanceCounter Counter(i);
 | 
			
		||||
	Counter.Start();
 | 
			
		||||
	sDw.Dhop(ssrc,sresult,0);
 | 
			
		||||
	Counter.Stop();
 | 
			
		||||
	Counter.Report();
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    RealF sum=0;
 | 
			
		||||
    for(int x=0;x<latt4[0];x++){
 | 
			
		||||
    for(int y=0;y<latt4[1];y++){
 | 
			
		||||
    for(int z=0;z<latt4[2];z++){
 | 
			
		||||
    for(int t=0;t<latt4[3];t++){
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
      std::vector<int> site({s,x,y,z,t});
 | 
			
		||||
      SpinColourVectorF normal, simd;
 | 
			
		||||
      peekSite(normal,result,site);
 | 
			
		||||
      peekSite(simd,sresult,site);
 | 
			
		||||
      sum=sum+norm2(normal-simd);
 | 
			
		||||
      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
 | 
			
		||||
      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
 | 
			
		||||
      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
 | 
			
		||||
    }}}}}
 | 
			
		||||
    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    if (1) {
 | 
			
		||||
 | 
			
		||||
      LatticeFermionF sr_eo(sFGrid);
 | 
			
		||||
      LatticeFermionF serr(sFGrid);
 | 
			
		||||
 | 
			
		||||
      LatticeFermion ssrc_e (sFrbGrid);
 | 
			
		||||
      LatticeFermion ssrc_o (sFrbGrid);
 | 
			
		||||
      LatticeFermion sr_e   (sFrbGrid);
 | 
			
		||||
      LatticeFermion sr_o   (sFrbGrid);
 | 
			
		||||
 | 
			
		||||
      pickCheckerboard(Even,ssrc_e,ssrc);
 | 
			
		||||
      pickCheckerboard(Odd,ssrc_o,ssrc);
 | 
			
		||||
 | 
			
		||||
      setCheckerboard(sr_eo,ssrc_o);
 | 
			
		||||
      setCheckerboard(sr_eo,ssrc_e);
 | 
			
		||||
      serr = sr_eo-ssrc; 
 | 
			
		||||
      std::cout<<GridLogMessage << "EO src norm diff   "<< norm2(serr)<<std::endl;
 | 
			
		||||
 | 
			
		||||
      sr_e = zero;
 | 
			
		||||
      sr_o = zero;
 | 
			
		||||
 | 
			
		||||
      double t0=usecond();
 | 
			
		||||
      for(int i=0;i<ncall;i++){
 | 
			
		||||
	sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
 | 
			
		||||
      }
 | 
			
		||||
      double t1=usecond();
 | 
			
		||||
 | 
			
		||||
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
      double flops=(1344.0*volume*ncall)/2;
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
 | 
			
		||||
      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
 | 
			
		||||
      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
 | 
			
		||||
      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
 | 
			
		||||
 | 
			
		||||
      pickCheckerboard(Even,ssrc_e,sresult);
 | 
			
		||||
      pickCheckerboard(Odd ,ssrc_o,sresult);
 | 
			
		||||
      ssrc_e = ssrc_e - sr_e;
 | 
			
		||||
      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<<std::endl;
 | 
			
		||||
      ssrc_o = ssrc_o - sr_o;
 | 
			
		||||
      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  { // Naive wilson dag implementation
 | 
			
		||||
@@ -197,7 +329,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
			
		||||
  Dw.DhopOE(src_e,r_o,DaggerNo);
 | 
			
		||||
  Dw.Dhop  (src  ,result,DaggerNo);
 | 
			
		||||
@@ -217,5 +348,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
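The timing blocks in the benchmark above all follow the same pattern: wrap ncall applications of the operator between two usecond() stamps and convert 1344 floating-point operations per five-dimensional site into an Mflop/s figure. A minimal stand-alone sketch of that pattern in plain C++ follows; the apply_dslash callable and the lattice extents are placeholders, not Grid API.

#include <chrono>
#include <cstdio>
#include <functional>
#include <vector>

// Sketch of the timing loop used by the Grid benchmarks above.
// apply_dslash stands in for a call such as Dw.Dhop(src,result,0).
double benchmark_mflops(const std::function<void()> &apply_dslash,
                        const std::vector<int> &latt4, int Ls, int ncall) {
  using clock = std::chrono::high_resolution_clock;

  double volume = Ls;                        // 5d volume = Ls * Lx * Ly * Lz * Lt
  for (int mu = 0; mu < 4; mu++) volume *= latt4[mu];

  auto t0 = clock::now();
  for (int i = 0; i < ncall; i++) apply_dslash();
  auto t1 = clock::now();

  double usecs  = std::chrono::duration<double, std::micro>(t1 - t0).count();
  double flops  = 1344.0 * volume * ncall;   // 1344 flops per site, as in the benchmarks
  double mflops = flops / usecs;             // flops per microsecond equals Mflop/s
  std::printf("called %d times in %f us, %f Mflop/s\n", ncall, usecs, mflops);
  return mflops;
}

Dividing the same figure by the number of processors gives the "mflop/s per node" line printed by the benchmarks.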
							
								
								
									
benchmarks/Benchmark_dwf_ntpf.cc: 154 lines (new file)
@@ -0,0 +1,154 @@
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./benchmarks/Benchmark_dwf.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <PerfCount.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::GammaMatrix Gmu [] = {
 | 
			
		||||
    Gamma::GammaX,
 | 
			
		||||
    Gamma::GammaY,
 | 
			
		||||
    Gamma::GammaZ,
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
bool overlapComms = false;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
 | 
			
		||||
    overlapComms = true;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt4 = GridDefaultLatt();
 | 
			
		||||
  const int Ls=16;
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src   (FGrid); random(RNG5,src);
 | 
			
		||||
  LatticeFermion result(FGrid); result=zero;
 | 
			
		||||
  LatticeFermion    ref(FGrid);    ref=zero;
 | 
			
		||||
  LatticeFermion    tmp(FGrid);
 | 
			
		||||
  LatticeFermion    err(FGrid);
 | 
			
		||||
 | 
			
		||||
  ColourMatrix cm = Complex(1.0,0.0);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu(UGrid); 
 | 
			
		||||
  random(RNG4,Umu);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu5d(FGrid); 
 | 
			
		||||
 | 
			
		||||
  // replicate across fifth dimension
 | 
			
		||||
  for(int ss=0;ss<Umu._grid->oSites();ss++){
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Naive wilson implementation
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,FGrid);
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  {
 | 
			
		||||
    ref = zero;
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 | 
			
		||||
      tmp = U[mu]*Cshift(src,mu+1,1);
 | 
			
		||||
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 | 
			
		||||
 | 
			
		||||
      tmp =adj(U[mu])*src;
 | 
			
		||||
      tmp =Cshift(tmp,mu+1,-1);
 | 
			
		||||
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
 | 
			
		||||
    }
 | 
			
		||||
    ref = -0.5*ref;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
 | 
			
		||||
  typename DomainWallFermionR::ImplParams params; 
 | 
			
		||||
  params.overlapCommsCompute = overlapComms;
 | 
			
		||||
  
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
			
		||||
  
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall =50;
 | 
			
		||||
  if (1) {
 | 
			
		||||
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      Dw.Dhop(src,result,0);
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    
 | 
			
		||||
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
    double flops=1344*volume*ncall;
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
    err = ref-result; 
 | 
			
		||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
    //    Dw.Report();
 | 
			
		||||
  }
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
benchmarks/Benchmark_zmm.cc: 172 lines (new file)
@@ -0,0 +1,172 @@
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_zmm.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <PerfCount.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
 | 
			
		||||
 | 
			
		||||
int main(int argc,char **argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
  std::ofstream os("zmm.dat");
 | 
			
		||||
 | 
			
		||||
  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
 | 
			
		||||
  for(int L=4;L<=32;L+=4){
 | 
			
		||||
    for(int m=1;m<=2;m++){
 | 
			
		||||
      for(int Ls=8;Ls<=16;Ls+=8){
 | 
			
		||||
	std::vector<int> grid({L,L,m*L,m*L});
 | 
			
		||||
	for(int i=0;i<4;i++) { 
 | 
			
		||||
	  std::cout << grid[i]<<"x";
 | 
			
		||||
	}
 | 
			
		||||
	std::cout << Ls<<std::endl;
 | 
			
		||||
	bench(os,grid,Ls);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src (FGrid);
 | 
			
		||||
  LatticeFermion tmp (FGrid);
 | 
			
		||||
  LatticeFermion srce(FrbGrid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion resulto(FrbGrid); resulto=zero;
 | 
			
		||||
  LatticeFermion resulta(FrbGrid); resulta=zero;
 | 
			
		||||
  LatticeFermion junk(FrbGrid); junk=zero;
 | 
			
		||||
  LatticeFermion diff(FrbGrid); 
 | 
			
		||||
  LatticeGaugeField Umu(UGrid);
 | 
			
		||||
 | 
			
		||||
  double mfc, mfa, mfo, mfl1;
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  random(RNG5,src);
 | 
			
		||||
#if 1
 | 
			
		||||
  random(RNG4,Umu);
 | 
			
		||||
#else
 | 
			
		||||
  int mmu=2;
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,UGrid);
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
    if ( mu!=mmu ) U[mu] = zero;
 | 
			
		||||
    if ( mu==mmu ) U[mu] = 1.0;
 | 
			
		||||
    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 pickCheckerboard(Even,srce,src);
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall=50;
 | 
			
		||||
  double t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.DhopOE(srce,resulto,0);
 | 
			
		||||
  }
 | 
			
		||||
  double t1=usecond();
 | 
			
		||||
 | 
			
		||||
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
  double flops=1344*volume/2;
 | 
			
		||||
 | 
			
		||||
  mfc = flops*ncall/(t1-t0);
 | 
			
		||||
  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
 | 
			
		||||
 | 
			
		||||
  QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mfa = flops*ncall/(t1-t0);
 | 
			
		||||
  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
 | 
			
		||||
  /*
 | 
			
		||||
  int dag=DaggerNo;
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<1;i++){
 | 
			
		||||
    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mfo = flops*100/(t1-t0);
 | 
			
		||||
  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
 | 
			
		||||
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<1;i++){
 | 
			
		||||
    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mfl1= flops*100/(t1-t0);
 | 
			
		||||
  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
 | 
			
		||||
  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
 | 
			
		||||
     << mfc<<" "
 | 
			
		||||
     << mfa<<" "
 | 
			
		||||
     << mfo<<" "
 | 
			
		||||
     << mfl1<<std::endl;
 | 
			
		||||
  */
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
    PerformanceCounter Counter(i);
 | 
			
		||||
    Counter.Start();
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
    Counter.Stop();
 | 
			
		||||
    Counter.Report();
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  //resulta = (-0.5) * resulta;
 | 
			
		||||
 | 
			
		||||
  diff = resulto-resulta;
 | 
			
		||||
  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
 | 
			
		||||
  std::cout<<std::endl;
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
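Benchmark_dwf.cc above validates the red-black (even/odd) decomposition by picking the even and odd checkerboards out of a full field, scattering them back with setCheckerboard, and checking that the norm of the difference is zero. A minimal sketch of that round trip, with plain std::vector fields standing in for Grid's lattice types (this is not Grid API):

#include <cassert>
#include <cstdio>
#include <vector>

// Sketch of the pickCheckerboard / setCheckerboard consistency check:
// split a field by site parity, recombine, and verify nothing was lost.
int main() {
  const int sites = 16;
  std::vector<double> full(sites), even, odd, recombined(sites, 0.0);
  for (int s = 0; s < sites; s++) full[s] = 0.1 * s;

  // "pickCheckerboard": copy sites of one parity only
  for (int s = 0; s < sites; s++) (s % 2 == 0 ? even : odd).push_back(full[s]);

  // "setCheckerboard": scatter both halves back into a full field
  for (int s = 0; s < sites; s++)
    recombined[s] = (s % 2 == 0) ? even[s / 2] : odd[s / 2];

  // the "EO src norm diff" printed by the benchmark should be exactly zero
  double diff2 = 0.0;
  for (int s = 0; s < sites; s++)
    diff2 += (recombined[s] - full[s]) * (recombined[s] - full[s]);
  std::printf("EO src norm diff %g\n", diff2);
  assert(diff2 == 0.0);
  return 0;
}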
benchmarks/Makefile.am
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 
 
+Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
+Benchmark_dwf_ntpf_LDADD=-lGrid
 
 
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
@@ -25,3 +29,7 @@ Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
 
 
+Benchmark_zmm_SOURCES=Benchmark_zmm.cc
+Benchmark_zmm_LDADD=-lGrid
 
							
								
								
									
configure (vendored): 109 changes
@@ -626,12 +626,18 @@ ac_subst_vars='am__EXEEXT_FALSE
am__EXEEXT_TRUE
 | 
			
		||||
LTLIBOBJS
 | 
			
		||||
LIBOBJS
 | 
			
		||||
USE_LAPACK_LIB_FALSE
 | 
			
		||||
USE_LAPACK_LIB_TRUE
 | 
			
		||||
USE_LAPACK_FALSE
 | 
			
		||||
USE_LAPACK_TRUE
 | 
			
		||||
BUILD_CHROMA_REGRESSION_FALSE
 | 
			
		||||
BUILD_CHROMA_REGRESSION_TRUE
 | 
			
		||||
BUILD_COMMS_NONE_FALSE
 | 
			
		||||
BUILD_COMMS_NONE_TRUE
 | 
			
		||||
BUILD_COMMS_MPI_FALSE
 | 
			
		||||
BUILD_COMMS_MPI_TRUE
 | 
			
		||||
BUILD_COMMS_SHMEM_FALSE
 | 
			
		||||
BUILD_COMMS_SHMEM_TRUE
 | 
			
		||||
BUILD_ZMM_FALSE
 | 
			
		||||
BUILD_ZMM_TRUE
 | 
			
		||||
EGREP
 | 
			
		||||
@@ -751,7 +757,9 @@ enable_simd
 | 
			
		||||
enable_precision
 | 
			
		||||
enable_comms
 | 
			
		||||
enable_rng
 | 
			
		||||
enable_timers
 | 
			
		||||
enable_chroma
 | 
			
		||||
enable_lapack
 | 
			
		||||
'
 | 
			
		||||
      ac_precious_vars='build_alias
 | 
			
		||||
host_alias
 | 
			
		||||
@@ -1410,7 +1418,9 @@ Optional Features:
 | 
			
		||||
  --enable-comms=none|mpi Select communications
 | 
			
		||||
  --enable-rng=ranlux48|mt19937
 | 
			
		||||
                          Select Random Number Generator to be used
 | 
			
		||||
  --enable-timers=yes|no  Enable system dependent high res timers
 | 
			
		||||
  --enable-chroma         Expect chroma compiled under c++11
 | 
			
		||||
  --enable-lapack         Enable lapack yes/no
 | 
			
		||||
 | 
			
		||||
Some influential environment variables:
 | 
			
		||||
  CXX         C++ compiler command
 | 
			
		||||
@@ -6410,7 +6420,7 @@ if test "${enable_simd+set}" = set; then :
 | 
			
		||||
  enableval=$enable_simd; \
 | 
			
		||||
	ac_SIMD=${enable_simd}
 | 
			
		||||
else
 | 
			
		||||
  ac_SIMD=AVX2
 | 
			
		||||
  ac_SIMD=DEBUG
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -6477,7 +6487,7 @@ $as_echo "#define AVX512 1" >>confdefs.h
 | 
			
		||||
$as_echo "#define IMCI 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
       supported="cross compilation"
 | 
			
		||||
       ac_ZMM=yes;
 | 
			
		||||
       ac_ZMM=no;
 | 
			
		||||
     ;;
 | 
			
		||||
     NEONv8)
 | 
			
		||||
       echo Configuring for experimental ARMv8a support
 | 
			
		||||
@@ -6561,12 +6571,26 @@ $as_echo "#define GRID_COMMS_NONE 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
$as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
     ;;
 | 
			
		||||
     shmem)
 | 
			
		||||
       echo Configuring for SHMEM communications
 | 
			
		||||
 | 
			
		||||
$as_echo "#define GRID_COMMS_SHMEM 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
     ;;
 | 
			
		||||
     *)
 | 
			
		||||
     as_fn_error $? "${ac_COMMS} unsupported --enable-comms option" "$LINENO" 5;
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
 if  test "X${ac_COMMS}X" == "XshmemX" ; then
 | 
			
		||||
  BUILD_COMMS_SHMEM_TRUE=
 | 
			
		||||
  BUILD_COMMS_SHMEM_FALSE='#'
 | 
			
		||||
else
 | 
			
		||||
  BUILD_COMMS_SHMEM_TRUE='#'
 | 
			
		||||
  BUILD_COMMS_SHMEM_FALSE=
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 if  test "X${ac_COMMS}X" == "XmpiX" ; then
 | 
			
		||||
  BUILD_COMMS_MPI_TRUE=
 | 
			
		||||
  BUILD_COMMS_MPI_FALSE='#'
 | 
			
		||||
@@ -6610,6 +6634,34 @@ $as_echo "#define RNG_MT19937 1" >>confdefs.h
 | 
			
		||||
     as_fn_error $? "${ac_RNG} unsupported --enable-rng option" "$LINENO" 5;
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# SDE timing mode
 | 
			
		||||
#
 | 
			
		||||
# Check whether --enable-timers was given.
 | 
			
		||||
if test "${enable_timers+set}" = set; then :
 | 
			
		||||
  enableval=$enable_timers; \
 | 
			
		||||
	ac_TIMERS=${enable_timers}
 | 
			
		||||
else
 | 
			
		||||
  ac_TIMERS=yes
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
case ${ac_TIMERS} in
 | 
			
		||||
     yes)
 | 
			
		||||
 | 
			
		||||
$as_echo "#define TIMERS_ON 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
     ;;
 | 
			
		||||
     no)
 | 
			
		||||
 | 
			
		||||
$as_echo "#define TIMERS_OFF 1" >>confdefs.h
 | 
			
		||||
 | 
			
		||||
     ;;
 | 
			
		||||
     *)
 | 
			
		||||
     as_fn_error $? "${ac_TIMERS} unsupported --enable-timers option" "$LINENO" 5;
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Chroma regression tests
 | 
			
		||||
#
 | 
			
		||||
@@ -6642,6 +6694,46 @@ else
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Lapack
 | 
			
		||||
#
 | 
			
		||||
# Check whether --enable-lapack was given.
 | 
			
		||||
if test "${enable_lapack+set}" = set; then :
 | 
			
		||||
  enableval=$enable_lapack; ac_LAPACK=${enable_lapack}
 | 
			
		||||
else
 | 
			
		||||
  ac_LAPACK=no
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
case ${ac_LAPACK} in
 | 
			
		||||
     yes)
 | 
			
		||||
       echo Enabling lapack
 | 
			
		||||
     ;;
 | 
			
		||||
     no)
 | 
			
		||||
       echo Disabling lapack
 | 
			
		||||
     ;;
 | 
			
		||||
     *)
 | 
			
		||||
       echo Enabling lapack at ${ac_LAPACK}
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
 if  test "X${ac_LAPACK}X" != "XnoX" ; then
 | 
			
		||||
  USE_LAPACK_TRUE=
 | 
			
		||||
  USE_LAPACK_FALSE='#'
 | 
			
		||||
else
 | 
			
		||||
  USE_LAPACK_TRUE='#'
 | 
			
		||||
  USE_LAPACK_FALSE=
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 if  test "X${ac_LAPACK}X" != "XyesX" ; then
 | 
			
		||||
  USE_LAPACK_LIB_TRUE=
 | 
			
		||||
  USE_LAPACK_LIB_FALSE='#'
 | 
			
		||||
else
 | 
			
		||||
  USE_LAPACK_LIB_TRUE='#'
 | 
			
		||||
  USE_LAPACK_LIB_FALSE=
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###################################################################
 | 
			
		||||
# Checks for doxygen support
 | 
			
		||||
# if present enables the "make doxyfile" command
 | 
			
		||||
@@ -6809,6 +6901,10 @@ if test -z "${BUILD_ZMM_TRUE}" && test -z "${BUILD_ZMM_FALSE}"; then
 | 
			
		||||
  as_fn_error $? "conditional \"BUILD_ZMM\" was never defined.
 | 
			
		||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
			
		||||
fi
 | 
			
		||||
if test -z "${BUILD_COMMS_SHMEM_TRUE}" && test -z "${BUILD_COMMS_SHMEM_FALSE}"; then
 | 
			
		||||
  as_fn_error $? "conditional \"BUILD_COMMS_SHMEM\" was never defined.
 | 
			
		||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
			
		||||
fi
 | 
			
		||||
if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
 | 
			
		||||
  as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 | 
			
		||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
			
		||||
@@ -6821,6 +6917,14 @@ if test -z "${BUILD_CHROMA_REGRESSION_TRUE}" && test -z "${BUILD_CHROMA_REGRESSI
 | 
			
		||||
  as_fn_error $? "conditional \"BUILD_CHROMA_REGRESSION\" was never defined.
 | 
			
		||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
			
		||||
fi
 | 
			
		||||
if test -z "${USE_LAPACK_TRUE}" && test -z "${USE_LAPACK_FALSE}"; then
 | 
			
		||||
  as_fn_error $? "conditional \"USE_LAPACK\" was never defined.
 | 
			
		||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
			
		||||
fi
 | 
			
		||||
if test -z "${USE_LAPACK_LIB_TRUE}" && test -z "${USE_LAPACK_LIB_FALSE}"; then
 | 
			
		||||
  as_fn_error $? "conditional \"USE_LAPACK_LIB\" was never defined.
 | 
			
		||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
: "${CONFIG_STATUS=./config.status}"
 | 
			
		||||
ac_write_fail=0
 | 
			
		||||
@@ -8167,6 +8271,7 @@ The following features are enabled:
 | 
			
		||||
- communications type           : ${ac_COMMS}
 | 
			
		||||
- default precision             : ${ac_PRECISION}
 | 
			
		||||
- RNG choice                    : ${ac_RNG}
 | 
			
		||||
- LAPACK	                : ${ac_LAPACK}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
"
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
configure.ac: 49 changes
@@ -71,7 +71,7 @@ AC_CHECK_FUNCS([gettimeofday])
 | 
			
		||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
 | 
			
		||||
	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 | 
			
		||||
	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
 | 
			
		||||
	[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
 | 
			
		||||
 | 
			
		||||
supported=no
 | 
			
		||||
 | 
			
		||||
@@ -124,7 +124,7 @@ case ${ac_SIMD} in
 | 
			
		||||
       echo Configuring for IMCI
 | 
			
		||||
       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
 | 
			
		||||
       supported="cross compilation"
 | 
			
		||||
       ac_ZMM=yes;
 | 
			
		||||
       ac_ZMM=no;
 | 
			
		||||
     ;;
 | 
			
		||||
     NEONv8)
 | 
			
		||||
       echo Configuring for experimental ARMv8a support 
 | 
			
		||||
@@ -178,11 +178,16 @@ case ${ac_COMMS} in
 | 
			
		||||
       echo Configuring for MPI communications
 | 
			
		||||
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
 | 
			
		||||
     ;;
 | 
			
		||||
     shmem)
 | 
			
		||||
       echo Configuring for SHMEM communications
 | 
			
		||||
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
 | 
			
		||||
     ;;
 | 
			
		||||
     *)
 | 
			
		||||
     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 | 
			
		||||
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 | 
			
		||||
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 | 
			
		||||
 | 
			
		||||
@@ -203,6 +208,25 @@ case ${ac_RNG} in
 | 
			
		||||
     AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# SDE timing mode
 | 
			
		||||
#
 | 
			
		||||
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
 | 
			
		||||
	[Enable system dependent high res timers])],\
 | 
			
		||||
	[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
 | 
			
		||||
case ${ac_TIMERS} in
 | 
			
		||||
     yes)
 | 
			
		||||
     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
 | 
			
		||||
     ;;
 | 
			
		||||
     no)
 | 
			
		||||
     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
 | 
			
		||||
     ;;
 | 
			
		||||
     *)
 | 
			
		||||
     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Chroma regression tests
 | 
			
		||||
#
 | 
			
		||||
@@ -222,6 +246,26 @@ esac
 | 
			
		||||
 | 
			
		||||
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Lapack
 | 
			
		||||
#
 | 
			
		||||
AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
 | 
			
		||||
 | 
			
		||||
case ${ac_LAPACK} in
 | 
			
		||||
     yes)
 | 
			
		||||
       echo Enabling lapack
 | 
			
		||||
     ;;
 | 
			
		||||
     no)
 | 
			
		||||
       echo Disabling lapack
 | 
			
		||||
     ;;
 | 
			
		||||
     *)
 | 
			
		||||
       echo Enabling lapack at ${ac_LAPACK}
 | 
			
		||||
     ;;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
 | 
			
		||||
AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
 | 
			
		||||
 | 
			
		||||
###################################################################
 | 
			
		||||
# Checks for doxygen support
 | 
			
		||||
# if present enables the "make doxyfile" command
 | 
			
		||||
@@ -265,6 +309,7 @@ The following features are enabled:
 | 
			
		||||
- communications type           : ${ac_COMMS}
 | 
			
		||||
- default precision             : ${ac_PRECISION}
 | 
			
		||||
- RNG choice                    : ${ac_RNG} 
 | 
			
		||||
- LAPACK	                : ${ac_LAPACK} 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
"
 | 
			
		||||
 
 | 
			
		||||
lib/AlignedAllocator.h
@@ -36,11 +36,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
#include <mm_malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
extern "C" { 
 | 
			
		||||
#include <mpp/shmem.h>
 | 
			
		||||
extern void * shmem_align(size_t, size_t);
 | 
			
		||||
extern void  shmem_free(void *);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -72,21 +79,59 @@ public:
 | 
			
		||||
 | 
			
		||||
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 | 
			
		||||
 | 
			
		||||
  pointer allocate(size_type __n, const void* = 0)
 | 
			
		||||
  pointer allocate(size_type __n, const void* _p= 0)
 | 
			
		||||
  { 
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
 | 
			
		||||
    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PARANOID_SYMMETRIC_HEAP
 | 
			
		||||
#ifdef PARANOID_SYMMETRIC_HEAP
 | 
			
		||||
    static void * bcast;
 | 
			
		||||
    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
    bcast = (void *) ptr;
 | 
			
		||||
    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
 | 
			
		||||
 | 
			
		||||
    if ( bcast != ptr ) {
 | 
			
		||||
      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
 | 
			
		||||
      BACKTRACEFILE();
 | 
			
		||||
      exit(0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    assert( bcast == (void *) ptr);
 | 
			
		||||
 | 
			
		||||
#endif 
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 | 
			
		||||
#else
 | 
			
		||||
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
    _Tp tmp;
 | 
			
		||||
#undef FIRST_TOUCH_OPTIMISE
 | 
			
		||||
#ifdef FIRST_TOUCH_OPTIMISE
 | 
			
		||||
#pragma omp parallel for 
 | 
			
		||||
  for(int i=0;i<__n;i++){
 | 
			
		||||
    ptr[i]=tmp;
 | 
			
		||||
  }
 | 
			
		||||
#endif 
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void deallocate(pointer __p, size_type) { 
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
    shmem_free((void *)__p);
 | 
			
		||||
#else
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
    _mm_free((void *)__p); 
 | 
			
		||||
#else
 | 
			
		||||
    free((void *)__p);
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
  void construct(pointer __p, const _Tp& __val) { };
 | 
			
		||||
 
 | 
			
		||||
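The allocator hunk above chooses shmem_align under GRID_COMMS_SHMEM and otherwise falls back to _mm_malloc or memalign with 128-byte alignment. A stand-alone sketch of the non-SHMEM fallback, assuming HAVE_MM_MALLOC_H is supplied by configure as in the hunk (glibc memalign path shown; this is a sketch, not the Grid allocator itself):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>   // _mm_malloc / _mm_free
#else
#include <malloc.h>      // memalign (glibc)
#endif

// 128-byte aligned allocation, mirroring the fallback logic in the hunk above.
void *grid_alloc(std::size_t bytes) {
#ifdef HAVE_MM_MALLOC_H
  return _mm_malloc(bytes, 128);
#else
  return memalign(128, bytes);
#endif
}

void grid_free(void *p) {
#ifdef HAVE_MM_MALLOC_H
  _mm_free(p);
#else
  free(p);
#endif
}

int main() {
  double *buf = static_cast<double *>(grid_alloc(1024 * sizeof(double)));
  std::printf("allocated %p, 128-byte aligned: %d\n", static_cast<void *>(buf),
              static_cast<int>(reinterpret_cast<std::uintptr_t>(buf) % 128 == 0));
  grid_free(buf);
  return 0;
}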
							
								
								
									
lib/Config.h.in: 180 changes
@@ -1,180 +0,0 @@
/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
 | 
			
		||||
 | 
			
		||||
/* AVX Intrinsics */
 | 
			
		||||
#undef AVX1
 | 
			
		||||
 | 
			
		||||
/* AVX2 Intrinsics */
 | 
			
		||||
#undef AVX2
 | 
			
		||||
 | 
			
		||||
/* AVX512 Intrinsics for Knights Landing */
 | 
			
		||||
#undef AVX512
 | 
			
		||||
 | 
			
		||||
/* AVX Intrinsics with FMA4 */
 | 
			
		||||
#undef AVXFMA4
 | 
			
		||||
 | 
			
		||||
/* EMPTY_SIMD only for DEBUGGING */
 | 
			
		||||
#undef EMPTY_SIMD
 | 
			
		||||
 | 
			
		||||
/* GRID_COMMS_MPI */
 | 
			
		||||
#undef GRID_COMMS_MPI
 | 
			
		||||
 | 
			
		||||
/* GRID_COMMS_NONE */
 | 
			
		||||
#undef GRID_COMMS_NONE
 | 
			
		||||
 | 
			
		||||
/* GRID_DEFAULT_PRECISION is DOUBLE */
 | 
			
		||||
#undef GRID_DEFAULT_PRECISION_DOUBLE
 | 
			
		||||
 | 
			
		||||
/* GRID_DEFAULT_PRECISION is SINGLE */
 | 
			
		||||
#undef GRID_DEFAULT_PRECISION_SINGLE
 | 
			
		||||
 | 
			
		||||
/* Support Altivec instructions */
 | 
			
		||||
#undef HAVE_ALTIVEC
 | 
			
		||||
 | 
			
		||||
/* Support AVX (Advanced Vector Extensions) instructions */
 | 
			
		||||
#undef HAVE_AVX
 | 
			
		||||
 | 
			
		||||
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
 | 
			
		||||
#undef HAVE_AVX2
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
 | 
			
		||||
   don't. */
 | 
			
		||||
#undef HAVE_DECL_BE64TOH
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
 | 
			
		||||
   */
 | 
			
		||||
#undef HAVE_DECL_NTOHLL
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <endian.h> header file. */
 | 
			
		||||
#undef HAVE_ENDIAN_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <execinfo.h> header file. */
 | 
			
		||||
#undef HAVE_EXECINFO_H
 | 
			
		||||
 | 
			
		||||
/* Support FMA3 (Fused Multiply-Add) instructions */
 | 
			
		||||
#undef HAVE_FMA
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the `gettimeofday' function. */
 | 
			
		||||
#undef HAVE_GETTIMEOFDAY
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <gmp.h> header file. */
 | 
			
		||||
#undef HAVE_GMP_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <inttypes.h> header file. */
 | 
			
		||||
#undef HAVE_INTTYPES_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <malloc.h> header file. */
 | 
			
		||||
#undef HAVE_MALLOC_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <malloc/malloc.h> header file. */
 | 
			
		||||
#undef HAVE_MALLOC_MALLOC_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <memory.h> header file. */
 | 
			
		||||
#undef HAVE_MEMORY_H
 | 
			
		||||
 | 
			
		||||
/* Support mmx instructions */
 | 
			
		||||
#undef HAVE_MMX
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <mm_malloc.h> header file. */
 | 
			
		||||
#undef HAVE_MM_MALLOC_H
 | 
			
		||||
 | 
			
		||||
/* Support SSE (Streaming SIMD Extensions) instructions */
 | 
			
		||||
#undef HAVE_SSE
 | 
			
		||||
 | 
			
		||||
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
 | 
			
		||||
#undef HAVE_SSE2
 | 
			
		||||
 | 
			
		||||
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
 | 
			
		||||
#undef HAVE_SSE3
 | 
			
		||||
 | 
			
		||||
/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
 | 
			
		||||
#undef HAVE_SSE4_1
 | 
			
		||||
 | 
			
		||||
/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
 | 
			
		||||
#undef HAVE_SSE4_2
 | 
			
		||||
 | 
			
		||||
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
 | 
			
		||||
#undef HAVE_SSSE3
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <stdint.h> header file. */
 | 
			
		||||
#undef HAVE_STDINT_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <stdlib.h> header file. */
 | 
			
		||||
#undef HAVE_STDLIB_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <strings.h> header file. */
 | 
			
		||||
#undef HAVE_STRINGS_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <string.h> header file. */
 | 
			
		||||
#undef HAVE_STRING_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <sys/stat.h> header file. */
 | 
			
		||||
#undef HAVE_SYS_STAT_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <sys/types.h> header file. */
 | 
			
		||||
#undef HAVE_SYS_TYPES_H
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <unistd.h> header file. */
 | 
			
		||||
#undef HAVE_UNISTD_H
 | 
			
		||||
 | 
			
		||||
/* IMCI Intrinsics for Knights Corner */
 | 
			
		||||
#undef IMCI
 | 
			
		||||
 | 
			
		||||
/* NEON ARMv8 Experimental support */
 | 
			
		||||
#undef NEONv8
 | 
			
		||||
 | 
			
		||||
/* Name of package */
 | 
			
		||||
#undef PACKAGE
 | 
			
		||||
 | 
			
		||||
/* Define to the address where bug reports for this package should be sent. */
 | 
			
		||||
#undef PACKAGE_BUGREPORT
 | 
			
		||||
 | 
			
		||||
/* Define to the full name of this package. */
 | 
			
		||||
#undef PACKAGE_NAME
 | 
			
		||||
 | 
			
		||||
/* Define to the full name and version of this package. */
 | 
			
		||||
#undef PACKAGE_STRING
 | 
			
		||||
 | 
			
		||||
/* Define to the one symbol short name of this package. */
 | 
			
		||||
#undef PACKAGE_TARNAME
 | 
			
		||||
 | 
			
		||||
/* Define to the home page for this package. */
 | 
			
		||||
#undef PACKAGE_URL
 | 
			
		||||
 | 
			
		||||
/* Define to the version of this package. */
 | 
			
		||||
#undef PACKAGE_VERSION
 | 
			
		||||
 | 
			
		||||
/* RNG_MT19937 */
 | 
			
		||||
#undef RNG_MT19937
 | 
			
		||||
 | 
			
		||||
/* RNG_RANLUX */
 | 
			
		||||
#undef RNG_RANLUX
 | 
			
		||||
 | 
			
		||||
/* SSE4 Intrinsics */
 | 
			
		||||
#undef SSE4
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the ANSI C header files. */
 | 
			
		||||
#undef STDC_HEADERS
 | 
			
		||||
 | 
			
		||||
/* Version number of package */
 | 
			
		||||
#undef VERSION
 | 
			
		||||
 | 
			
		||||
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
 | 
			
		||||
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
 | 
			
		||||
   #define below would cause a syntax error. */
 | 
			
		||||
#undef _UINT32_T
 | 
			
		||||
 | 
			
		||||
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
 | 
			
		||||
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
 | 
			
		||||
   #define below would cause a syntax error. */
 | 
			
		||||
#undef _UINT64_T
 | 
			
		||||
 | 
			
		||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
 | 
			
		||||
#undef size_t
 | 
			
		||||
 | 
			
		||||
/* Define to the type of an unsigned integer type of width exactly 32 bits if
 | 
			
		||||
   such a type exists and the standard includes do not define it. */
 | 
			
		||||
#undef uint32_t
 | 
			
		||||
 | 
			
		||||
/* Define to the type of an unsigned integer type of width exactly 64 bits if
 | 
			
		||||
   such a type exists and the standard includes do not define it. */
 | 
			
		||||
#undef uint64_t
 | 
			
		||||
@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef GRID_COMMS_MPI
 | 
			
		||||
#include <cshift/Cshift_mpi.h>
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
 | 
			
		||||
#endif 
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -62,10 +62,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <serialisation/Serialisation.h>
 | 
			
		||||
#include <Config.h>
 | 
			
		||||
#include <Timer.h>
 | 
			
		||||
#include <PerfCount.h>
 | 
			
		||||
#include <Log.h>
 | 
			
		||||
#include <AlignedAllocator.h>
 | 
			
		||||
#include <Simd.h>
 | 
			
		||||
#include <Threads.h>
 | 
			
		||||
#include <Lexicographic.h>
 | 
			
		||||
#include <Communicator.h> 
 | 
			
		||||
#include <Cartesian.h>    
 | 
			
		||||
#include <Tensors.h>      
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
lib/Init.cc: 49 changes
@@ -45,12 +45,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <algorithm>
 | 
			
		||||
#include <iterator>
 | 
			
		||||
 | 
			
		||||
#define __X86_64
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_EXECINFO_H
 | 
			
		||||
#include <execinfo.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
@@ -150,6 +144,10 @@ void GridParseLayout(char **argv,int argc,
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
 | 
			
		||||
    std::vector<int> ompthreads(0);
 | 
			
		||||
#ifndef GRID_OMP
 | 
			
		||||
    std::cout << GridLogWarning << "'--threads' option used but Grid was"
 | 
			
		||||
              << " not compiled with thread support" << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
 | 
			
		||||
    GridCmdOptionIntVector(arg,ompthreads);
 | 
			
		||||
    assert(ompthreads.size()==1);
 | 
			
		||||
@@ -174,9 +172,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 | 
			
		||||
/////////////////////////////////////////////////////////
 | 
			
		||||
void Grid_init(int *argc,char ***argv)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_COMMS_MPI
 | 
			
		||||
  MPI_Init(argc,argv);
 | 
			
		||||
#endif
 | 
			
		||||
  CartesianCommunicator::Init(argc,argv);
 | 
			
		||||
 | 
			
		||||
  // Parse command line args.
 | 
			
		||||
 | 
			
		||||
  GridLogger::StopWatch.Start();
 | 
			
		||||
@@ -194,9 +191,10 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--omp n         : default number of OMP threads"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
 | 
			
		||||
    exit(EXIT_SUCCESS);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
 | 
			
		||||
@@ -213,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    Grid_quiesce_nodes();
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
 | 
			
		||||
    QCD::WilsonFermionStatic::HandOptDslash=1;
 | 
			
		||||
    QCD::WilsonFermion5DStatic::HandOptDslash=1;
 | 
			
		||||
    QCD::WilsonKernelsStatic::HandOpt=1;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
			
		||||
    LebesgueOrder::UseLebesgueOrder=1;
 | 
			
		||||
@@ -287,13 +284,7 @@ void Grid_finalize(void)
 | 
			
		||||
  Grid_unquiesce_nodes();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
double usecond(void) {
 | 
			
		||||
  struct timeval tv;
 | 
			
		||||
  gettimeofday(&tv,NULL);
 | 
			
		||||
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define _NBACKTRACE (256)
 | 
			
		||||
void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
			
		||||
 | 
			
		||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
@@ -305,11 +296,11 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
  // Linux/Posix
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
  // And x86 64bit
 | 
			
		||||
    ucontext_t * uc= (ucontext_t *)ptr;
 | 
			
		||||
#ifdef __x86_64__
 | 
			
		||||
  ucontext_t * uc= (ucontext_t *)ptr;
 | 
			
		||||
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
			
		||||
  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 | 
			
		||||
#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
 | 
			
		||||
 | 
			
		||||
  REG(rdi);
 | 
			
		||||
  REG(rsi);
 | 
			
		||||
  REG(rbp);
 | 
			
		||||
@@ -330,17 +321,15 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
  REG(r14);
 | 
			
		||||
  REG(r15);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef HAVE_EXECINFO_H
 | 
			
		||||
  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);
 | 
			
		||||
  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
 | 
			
		||||
  for (int i = 0; i < symbols; i++){
 | 
			
		||||
    printf ("%s\n", strings[i]);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  BACKTRACE();
 | 
			
		||||
  exit(0);
 | 
			
		||||
  return;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_FPE
 | 
			
		||||
#define _GNU_SOURCE
 | 
			
		||||
#include <fenv.h>
 | 
			
		||||
#endif
 | 
			
		||||
void Grid_debug_handler_init(void)
 | 
			
		||||
{
 | 
			
		||||
  struct sigaction sa,osa;
 | 
			
		||||
@@ -349,5 +338,9 @@ void Grid_debug_handler_init(void)
 | 
			
		||||
  sa.sa_flags    = SA_SIGINFO;
 | 
			
		||||
  sigaction(SIGSEGV,&sa,NULL);
 | 
			
		||||
  sigaction(SIGTRAP,&sa,NULL);
 | 
			
		||||
#ifdef GRID_FPE
 | 
			
		||||
  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
 | 
			
		||||
  sigaction(SIGFPE,&sa,NULL);
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
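All of the options listed above are consumed inside Grid_init itself, so a driver program only has to bracket its work with the init/finalize pair. A minimal sketch, assuming only the Grid_init/Grid_finalize entry points shown in this diff; the flags in the comment are the ones advertised by the help text above, and the lattice set-up is elided.

#include <Grid.h>

int main (int argc, char ** argv)
{
  // e.g. ./a.out --grid 8.8.8.8 --mpi 1.1.1.1 --threads 4 --log Error,Warning,Message
  Grid::Grid_init(&argc,&argv);

  // ... build grids and fields, run the measurement ...

  Grid::Grid_finalize();
  return 0;
}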
							
								
								
									
32  lib/Lexicographic.h  (new file)
@@ -0,0 +1,32 @@
#ifndef GRID_LEXICOGRAPHIC_H
#define GRID_LEXICOGRAPHIC_H


namespace Grid{

  class Lexicographic {
  public:

    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
	coor[d] = index % dims[d];
	index   = index / dims[d];
      }
    }

    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=0;d<nd;d++){
	index = index+stride*coor[d];
	stride=stride*dims[d];
      }
    }

  };

}
#endif
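CoorFromIndex and IndexFromCoor are inverses for any index in [0, prod(dims)), with dimension 0 the fastest-running direction. A small round-trip check, assuming only the class above (the function name and dimensions are illustrative):

#include <cassert>
#include <vector>
// #include <Lexicographic.h>   // the new header shown above

void lexicographic_roundtrip_check(void)
{
  std::vector<int> dims = {4,4,4,8};            // lattice-like dimensions
  std::vector<int> coor;
  for(int index=0; index<4*4*4*8; index++){
    Grid::Lexicographic::CoorFromIndex(coor,index,dims);
    int back=0;
    Grid::Lexicographic::IndexFromCoor(coor,back,dims);
    assert(back == index);                      // the mapping round-trips exactly
  }
}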
@@ -73,13 +73,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void)
{
  int me=0;
#ifdef GRID_COMMS_MPI
  int me;
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
#endif
#ifdef GRID_COMMS_SHMEM
  me = shmem_my_pe();
#endif
  if ( me ) {
    std::cout.setstate(std::ios::badbit);
  }
#endif
}

void Grid_unquiesce_nodes(void)
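The quiescing trick above relies only on iostream state: once badbit is set, every subsequent insertion into std::cout is discarded until the state is cleared again, which is presumably all Grid_unquiesce_nodes has to do. A standalone illustration of the mechanism, independent of MPI or SHMEM (the rank value is faked):

#include <iostream>

int main(void)
{
  std::cout << "visible on every rank" << std::endl;

  int me = 1;                                      // pretend this is a non-zero rank
  if ( me ) std::cout.setstate(std::ios::badbit);  // quiesce: output is now swallowed

  std::cout << "silenced" << std::endl;            // produces nothing while badbit is set

  std::cout.clear();                               // unquiesce
  std::cout << "visible again" << std::endl;
  return 0;
}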
							
								
								
									
191  lib/Log.h
							@@ -32,75 +32,80 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_LOG_H
 | 
			
		||||
#define GRID_LOG_H
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  // Dress the output; use std::chrono for time stamping via the StopWatch class
 | 
			
		||||
#ifdef HAVE_EXECINFO_H
 | 
			
		||||
#include <execinfo.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    namespace Grid {
 | 
			
		||||
 | 
			
		||||
// Dress the output; use std::chrono for time stamping via the StopWatch class
 | 
			
		||||
int Rank(void); // used for early stage debug before library init
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  class Colours{
 | 
			
		||||
  protected:
 | 
			
		||||
    bool is_active;
 | 
			
		||||
  public:
 | 
			
		||||
    std::map<std::string, std::string> colour;
 | 
			
		||||
class Colours{
 | 
			
		||||
protected:
 | 
			
		||||
  bool is_active;
 | 
			
		||||
public:
 | 
			
		||||
  std::map<std::string, std::string> colour;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    Colours(bool activate=false){
 | 
			
		||||
      Active(activate);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    void Active(bool activate){
 | 
			
		||||
      is_active=activate;
 | 
			
		||||
      
 | 
			
		||||
      if (is_active){
 | 
			
		||||
	colour["BLACK"]  ="\033[30m";
 | 
			
		||||
	colour["RED"]    ="\033[31m";
 | 
			
		||||
	colour["GREEN"]  ="\033[32m";
 | 
			
		||||
	colour["YELLOW"] ="\033[33m";
 | 
			
		||||
	colour["BLUE"]   ="\033[34m";
 | 
			
		||||
	colour["PURPLE"] ="\033[35m";
 | 
			
		||||
	colour["CYAN"]   ="\033[36m";
 | 
			
		||||
	colour["WHITE"]  ="\033[37m";
 | 
			
		||||
	colour["NORMAL"] ="\033[0;39m";
 | 
			
		||||
      } else {
 | 
			
		||||
      colour["BLACK"] ="";
 | 
			
		||||
      colour["RED"]   ="";
 | 
			
		||||
      colour["GREEN"] ="";
 | 
			
		||||
      colour["YELLOW"]="";
 | 
			
		||||
      colour["BLUE"]  ="";
 | 
			
		||||
      colour["PURPLE"]="";
 | 
			
		||||
      colour["CYAN"]  ="";
 | 
			
		||||
      colour["WHITE"] ="";
 | 
			
		||||
      colour["NORMAL"]="";
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
    };
 | 
			
		||||
    
 | 
			
		||||
  Colours(bool activate=false){
 | 
			
		||||
    Active(activate);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  void Active(bool activate){
 | 
			
		||||
    is_active=activate;
 | 
			
		||||
 | 
			
		||||
  class Logger {
 | 
			
		||||
  protected:
 | 
			
		||||
    Colours &Painter;
 | 
			
		||||
    int active;
 | 
			
		||||
    std::string name, topName;
 | 
			
		||||
    std::string COLOUR;
 | 
			
		||||
    if (is_active){
 | 
			
		||||
     colour["BLACK"]  ="\033[30m";
 | 
			
		||||
     colour["RED"]    ="\033[31m";
 | 
			
		||||
     colour["GREEN"]  ="\033[32m";
 | 
			
		||||
     colour["YELLOW"] ="\033[33m";
 | 
			
		||||
     colour["BLUE"]   ="\033[34m";
 | 
			
		||||
     colour["PURPLE"] ="\033[35m";
 | 
			
		||||
     colour["CYAN"]   ="\033[36m";
 | 
			
		||||
     colour["WHITE"]  ="\033[37m";
 | 
			
		||||
     colour["NORMAL"] ="\033[0;39m";
 | 
			
		||||
   } else {
 | 
			
		||||
    colour["BLACK"] ="";
 | 
			
		||||
    colour["RED"]   ="";
 | 
			
		||||
    colour["GREEN"] ="";
 | 
			
		||||
    colour["YELLOW"]="";
 | 
			
		||||
    colour["BLUE"]  ="";
 | 
			
		||||
    colour["PURPLE"]="";
 | 
			
		||||
    colour["CYAN"]  ="";
 | 
			
		||||
    colour["WHITE"] ="";
 | 
			
		||||
    colour["NORMAL"]="";
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
    static GridStopWatch StopWatch;
 | 
			
		||||
    static std::ostream devnull;
 | 
			
		||||
 | 
			
		||||
    std::string background() {return Painter.colour["NORMAL"];}
 | 
			
		||||
    std::string evidence() {return Painter.colour["YELLOW"];}
 | 
			
		||||
    std::string colour() {return Painter.colour[COLOUR];}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
    Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
 | 
			
		||||
      : active(on),
 | 
			
		||||
	name(nm),
 | 
			
		||||
	topName(topNm),
 | 
			
		||||
	Painter(col_class),
 | 
			
		||||
	COLOUR(col){} ;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Logger {
 | 
			
		||||
protected:
 | 
			
		||||
  Colours &Painter;
 | 
			
		||||
  int active;
 | 
			
		||||
  std::string name, topName;
 | 
			
		||||
  std::string COLOUR;
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
  static GridStopWatch StopWatch;
 | 
			
		||||
  static std::ostream devnull;
 | 
			
		||||
 | 
			
		||||
  std::string background() {return Painter.colour["NORMAL"];}
 | 
			
		||||
  std::string evidence() {return Painter.colour["YELLOW"];}
 | 
			
		||||
  std::string colour() {return Painter.colour[COLOUR];}
 | 
			
		||||
 | 
			
		||||
  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
 | 
			
		||||
  : active(on),
 | 
			
		||||
  name(nm),
 | 
			
		||||
  topName(topNm),
 | 
			
		||||
  Painter(col_class),
 | 
			
		||||
  COLOUR(col){} ;
 | 
			
		||||
  
 | 
			
		||||
  void Active(int on) {active = on;};
 | 
			
		||||
  int  isActive(void) {return active;};
 | 
			
		||||
@@ -108,36 +113,68 @@ namespace Grid {
 | 
			
		||||
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 | 
			
		||||
 | 
			
		||||
    if ( log.active ) {
 | 
			
		||||
            StopWatch.Stop();
 | 
			
		||||
            GridTime now = StopWatch.Elapsed();
 | 
			
		||||
            StopWatch.Start();
 | 
			
		||||
            stream << log.background()<< log.topName << log.background()<< " : ";
 | 
			
		||||
            stream << log.colour() <<std::setw(10) << std::left << log.name << log.background() << " : ";
 | 
			
		||||
            stream << log.evidence()<< now << log.background() << " : " << log.colour();
 | 
			
		||||
            return stream;
 | 
			
		||||
        } else { 
 | 
			
		||||
            return devnull;
 | 
			
		||||
        }
 | 
			
		||||
      StopWatch.Stop();
 | 
			
		||||
      GridTime now = StopWatch.Elapsed();
 | 
			
		||||
      StopWatch.Start();
 | 
			
		||||
      stream << log.background()<< log.topName << log.background()<< " : ";
 | 
			
		||||
      stream << log.colour() <<std::setw(10) << std::left << log.name << log.background() << " : ";
 | 
			
		||||
      stream << log.evidence()<< now << log.background() << " : " << log.colour();
 | 
			
		||||
      return stream;
 | 
			
		||||
    } else { 
 | 
			
		||||
      return devnull;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class GridLogger: public Logger {
 | 
			
		||||
public:
 | 
			
		||||
  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
 | 
			
		||||
    Logger("Grid", on, nm, col_class, col_key){};
 | 
			
		||||
  Logger("Grid", on, nm, col_class, col_key){};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void GridLogConfigure(std::vector<std::string> &logstreams);
 | 
			
		||||
 | 
			
		||||
  extern GridLogger GridLogError;
 | 
			
		||||
  extern GridLogger GridLogWarning;
 | 
			
		||||
  extern GridLogger GridLogMessage;
 | 
			
		||||
  extern GridLogger GridLogDebug  ;
 | 
			
		||||
  extern GridLogger GridLogPerformance;
 | 
			
		||||
  extern GridLogger GridLogIterative  ;
 | 
			
		||||
  extern GridLogger GridLogIntegrator  ;
 | 
			
		||||
  extern Colours    GridLogColours;
 | 
			
		||||
extern GridLogger GridLogError;
 | 
			
		||||
extern GridLogger GridLogWarning;
 | 
			
		||||
extern GridLogger GridLogMessage;
 | 
			
		||||
extern GridLogger GridLogDebug  ;
 | 
			
		||||
extern GridLogger GridLogPerformance;
 | 
			
		||||
extern GridLogger GridLogIterative  ;
 | 
			
		||||
extern GridLogger GridLogIntegrator  ;
 | 
			
		||||
extern Colours    GridLogColours;
 | 
			
		||||
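Each of these extern loggers is used by streaming it as the first item of an output expression; the operator<< defined above stamps the line with the logger name, the elapsed StopWatch time and the colour, or routes everything to devnull when that stream is disabled via --log. Typical call sites, as seen throughout Init.cc and the solvers, look like the sketch below (the message strings are illustrative):

#include <Grid.h>   // brings in Log.h and the extern logger instances

void log_examples(void)
{
  using namespace Grid;
  // Each stream is independently enabled or disabled from the --log list.
  std::cout << GridLogMessage   << "Grid is initialised"                      << std::endl;
  std::cout << GridLogWarning   << "falling back to a single thread"          << std::endl;
  std::cout << GridLogIterative << "iteration " << 10 << " residual " << 1.0e-8 << std::endl;
}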
 | 
			
		||||
 | 
			
		||||
#define _NBACKTRACE (256)
 | 
			
		||||
extern void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
			
		||||
 | 
			
		||||
#define BACKTRACEFILE() {\
 | 
			
		||||
char string[20];					\
 | 
			
		||||
std::sprintf(string,"backtrace.%d",Rank());				\
 | 
			
		||||
std::FILE * fp = std::fopen(string,"w");				\
 | 
			
		||||
BACKTRACEFP(fp)\
 | 
			
		||||
std::fclose(fp);	    \
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_EXECINFO_H
 | 
			
		||||
#define BACKTRACEFP(fp) { \
 | 
			
		||||
int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
 | 
			
		||||
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
 | 
			
		||||
for (int i = 0; i < symbols; i++){\
 | 
			
		||||
  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
 | 
			
		||||
}\
 | 
			
		||||
}
 | 
			
		||||
#else 
 | 
			
		||||
#define BACKTRACEFP(fp) { \
 | 
			
		||||
std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
 | 
			
		||||
std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
 | 
			
		||||
std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
 | 
			
		||||
std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define BACKTRACE() BACKTRACEFP(stdout) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
										
											
File diff suppressed because one or more lines are too long
@@ -6,6 +6,10 @@ if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
endif

if BUILD_COMMS_SHMEM
  extra_sources+=communicator/Communicator_shmem.cc
endif

if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
endif
							
								
								
									
										
BIN  lib/Old/Endeavour.tgz  (new binary file, not shown)
							@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 | 
			
		||||
 | 
			
		||||
#define RawConfig(A,B) (A<<8|B)
 | 
			
		||||
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." },
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." },
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." },
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." },
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......"},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS...."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS....."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS..."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS.."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS"},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS......."},
 | 
			
		||||
  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS....."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......"},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS...."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS..."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS."},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......"},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS...."}
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." , INSTRUCTIONS},
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." , CACHE_REFERENCES},
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
 | 
			
		||||
    // 4
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS  },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS  },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
 | 
			
		||||
    // 11
 | 
			
		||||
#else
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS....",INSTRUCTIONS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
 | 
			
		||||
    // 11
 | 
			
		||||
#endif
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS.......",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS.....",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
 | 
			
		||||
    //15
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......",INSTRUCTIONS},
 | 
			
		||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS....",INSTRUCTIONS}
 | 
			
		||||
    //19
 | 
			
		||||
  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 | 
			
		||||
#endif
 | 
			
		||||
};
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										112
									
								
								lib/PerfCount.h
									
									
									
									
									
								
							
							
						
						
									
										112
									
								
								lib/PerfCount.h
									
									
									
									
									
								
							@@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <ctime>
 | 
			
		||||
#include <chrono>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
#include <sys/ioctl.h>
 | 
			
		||||
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
@@ -43,8 +43,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#else
 | 
			
		||||
#include <sys/syscall.h>
 | 
			
		||||
#endif
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 | 
			
		||||
@@ -58,6 +58,49 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef TIMERS_OFF
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
inline uint64_t cyclecount(void){ 
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 | 
			
		||||
#define __SSC_STOP  __SSC_MARK(0x110)
 | 
			
		||||
#define __SSC_START __SSC_MARK(0x111)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
#define __SSC_MARK(mark) 
 | 
			
		||||
#define __SSC_STOP  
 | 
			
		||||
#define __SSC_START 
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * cycle counters arch dependent
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#ifdef __bgq__
 | 
			
		||||
inline uint64_t cyclecount(void){ 
 | 
			
		||||
   uint64_t tmp;
 | 
			
		||||
   asm volatile ("mfspr %0,0x10C" : "=&r" (tmp)  );
 | 
			
		||||
   return tmp;
 | 
			
		||||
}
 | 
			
		||||
#elif defined __x86_64__
 | 
			
		||||
#include <x86intrin.h>
 | 
			
		||||
inline uint64_t cyclecount(void){ 
 | 
			
		||||
  return __rdtsc();
 | 
			
		||||
  //  unsigned int dummy;
 | 
			
		||||
  // return __rdtscp(&dummy);
 | 
			
		||||
}
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
inline uint64_t cyclecount(void){ 
 | 
			
		||||
   return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
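cyclecount() therefore always compiles to something: the real cycle register on BG/Q and x86-64, and a zero stub when TIMERS_OFF is defined or on other architectures, so callers can difference two readings unconditionally. A hedged sketch of that calling pattern; the loop body is just a stand-in for a real kernel.

#include <cstdint>
#include <cstdio>

// Assumes the Grid::cyclecount() defined above (rdtsc on x86-64, mfspr on BG/Q, 0 otherwise).
void time_kernel(void)
{
  uint64_t t0 = Grid::cyclecount();
  double s = 0.0;
  for (int i = 0; i < 1000000; i++) s += 1.0e-6 * i;   // stand-in for the code under test
  uint64_t t1 = Grid::cyclecount();
  std::printf("kernel took %llu cycles (result %g)\n",
              (unsigned long long)(t1 - t0), s);
}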
 | 
			
		||||
class PerformanceCounter {
 | 
			
		||||
private:
 | 
			
		||||
@@ -67,6 +110,7 @@ private:
 | 
			
		||||
    uint32_t type;
 | 
			
		||||
    uint64_t config;
 | 
			
		||||
    const char *name;
 | 
			
		||||
    int normalisation;
 | 
			
		||||
  } PerformanceCounterConfig; 
 | 
			
		||||
  
 | 
			
		||||
  static const PerformanceCounterConfig PerformanceCounterConfigs [];
 | 
			
		||||
@@ -74,26 +118,12 @@ private:
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  enum PerformanceCounterType {
 | 
			
		||||
    CPUCYCLES=0,
 | 
			
		||||
    INSTRUCTIONS,
 | 
			
		||||
    //    STALL_CYCLES,
 | 
			
		||||
    CACHE_REFERENCES,
 | 
			
		||||
    CACHE_MISSES,
 | 
			
		||||
    L1D_READ_MISS,
 | 
			
		||||
    L1D_READ_ACCESS,
 | 
			
		||||
    L1D_WRITE_MISS,
 | 
			
		||||
    L1D_WRITE_ACCESS,
 | 
			
		||||
    L1D_PREFETCH_MISS,
 | 
			
		||||
    L1D_PREFETCH_ACCESS,
 | 
			
		||||
    LL_READ_MISS,
 | 
			
		||||
    //    LL_READ_ACCESS,
 | 
			
		||||
    LL_WRITE_MISS,
 | 
			
		||||
    LL_WRITE_ACCESS,
 | 
			
		||||
    LL_PREFETCH_MISS,
 | 
			
		||||
    LL_PREFETCH_ACCESS,
 | 
			
		||||
    L1I_READ_MISS,
 | 
			
		||||
    L1I_READ_ACCESS,
 | 
			
		||||
    PERFORMANCE_COUNTER_NUM_TYPES
 | 
			
		||||
    CACHE_REFERENCES=0,
 | 
			
		||||
    CACHE_MISSES=1,
 | 
			
		||||
    CPUCYCLES=2,
 | 
			
		||||
    INSTRUCTIONS=3,
 | 
			
		||||
    L1D_READ_ACCESS=4,
 | 
			
		||||
    PERFORMANCE_COUNTER_NUM_TYPES=19
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
@@ -101,7 +131,9 @@ public:
 | 
			
		||||
  int PCT;
 | 
			
		||||
 | 
			
		||||
  long long count;
 | 
			
		||||
  long long cycles;
 | 
			
		||||
  int fd;
 | 
			
		||||
  int cyclefd;
 | 
			
		||||
  unsigned long long elapsed;
 | 
			
		||||
  uint64_t begin;
 | 
			
		||||
 | 
			
		||||
@@ -114,7 +146,9 @@ public:
 | 
			
		||||
    assert(_pct>=0);
 | 
			
		||||
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
 | 
			
		||||
    fd=-1;
 | 
			
		||||
    cyclefd=-1;
 | 
			
		||||
    count=0;
 | 
			
		||||
    cycles=0;
 | 
			
		||||
    PCT =_pct;
 | 
			
		||||
    Open();
 | 
			
		||||
#endif
 | 
			
		||||
@@ -139,6 +173,15 @@ public:
 | 
			
		||||
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 | 
			
		||||
      perror("Error is");
 | 
			
		||||
    }
 | 
			
		||||
    int norm = PerformanceCounterConfigs[PCT].normalisation;
 | 
			
		||||
    pe.type  = PerformanceCounterConfigs[norm].type;
 | 
			
		||||
    pe.config= PerformanceCounterConfigs[norm].config;
 | 
			
		||||
    name = PerformanceCounterConfigs[norm].name;
 | 
			
		||||
    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
 | 
			
		||||
    if (cyclefd == -1) {
 | 
			
		||||
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 | 
			
		||||
      perror("Error is");
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -146,10 +189,12 @@ public:
 | 
			
		||||
  {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    if ( fd!= -1) {
 | 
			
		||||
      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
			
		||||
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
			
		||||
      ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
			
		||||
      ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
			
		||||
      ::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
 | 
			
		||||
      ::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
 | 
			
		||||
    }
 | 
			
		||||
    begin  =__rdtsc();
 | 
			
		||||
    begin  =cyclecount();
 | 
			
		||||
#else
 | 
			
		||||
    begin = 0;
 | 
			
		||||
#endif
 | 
			
		||||
@@ -157,12 +202,15 @@ public:
 | 
			
		||||
 | 
			
		||||
  void Stop(void) {
 | 
			
		||||
    count=0;
 | 
			
		||||
    cycles=0;
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    if ( fd!= -1) {
 | 
			
		||||
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
			
		||||
      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
			
		||||
      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
 | 
			
		||||
      ::read(fd, &count, sizeof(long long));
 | 
			
		||||
      ::read(cyclefd, &cycles, sizeof(long long));
 | 
			
		||||
    }
 | 
			
		||||
    elapsed = __rdtsc() - begin;
 | 
			
		||||
    elapsed = cyclecount() - begin;
 | 
			
		||||
#else
 | 
			
		||||
    elapsed = 0;
 | 
			
		||||
#endif
 | 
			
		||||
@@ -170,16 +218,20 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
  void Report(void) {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 | 
			
		||||
    int N = PerformanceCounterConfigs[PCT].normalisation;
 | 
			
		||||
    const char * sn = PerformanceCounterConfigs[N].name ;
 | 
			
		||||
    const char * sc = PerformanceCounterConfigs[PCT].name;
 | 
			
		||||
      std::printf("tsc = %llu %s = %llu  %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, 
 | 
			
		||||
		  sc, count, sc,sn, (double)count/(double)cycles);
 | 
			
		||||
#else
 | 
			
		||||
    printf("%llu cycles \n", elapsed );
 | 
			
		||||
    std::printf("%llu cycles \n", elapsed );
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ~PerformanceCounter()
 | 
			
		||||
  {
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
    close(fd);
 | 
			
		||||
    ::close(fd);    ::close(cyclefd);
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
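With the second (normalising) counter wired in, the usual pattern is to wrap the region of interest in Start/Stop and let Report print the raw count, the normalising count and their ratio. A sketch assuming the enum values and methods shown above; it only does real counting on Linux perf_event, as the #ifdef guards indicate.

// Hypothetical use of the PerformanceCounter class above.
void profile_region(void)
{
  Grid::PerformanceCounter counter(Grid::PerformanceCounter::CACHE_MISSES);

  counter.Start();
  // ... code under test ...
  counter.Stop();

  counter.Report();   // prints tsc, the normalising count and the miss rate
}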
 | 
			
		||||
 
 | 
			
		||||
@@ -42,10 +42,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
 | 
			
		||||
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 | 
			
		||||
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 | 
			
		||||
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 | 
			
		||||
#define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
 | 
			
		||||
 | 
			
		||||
#define RotateBit (0x100)
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef uint32_t Integer;
 | 
			
		||||
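The selector macros simply pack small lane indices into an immediate: _MM_SELECT_FOUR_FOUR(A,B,C,D) is (A<<6)|(B<<4)|(C<<2)|D, the standard 2-bits-per-lane shuffle control. A compile-time check of two familiar values (0xE4 is the identity permutation 3,2,1,0; 0x1B is the full reversal):

#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))   // as defined above

static_assert(_MM_SELECT_FOUR_FOUR(3,2,1,0) == 0xE4, "identity shuffle immediate");
static_assert(_MM_SELECT_FOUR_FOUR(0,1,2,3) == 0x1B, "full reversal immediate");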
 
 | 
			
		||||
							
								
								
									
1594  lib/Stencil.h  (file diff suppressed because it is too large)
27  lib/Timer.h
@@ -39,11 +39,18 @@ namespace Grid {

  // Dress the output; use std::chrono

// C++11 time facilities better?
double usecond(void);
inline double usecond(void) {
  struct timeval tv;
#ifdef TIMERS_ON
  gettimeofday(&tv,NULL);
#endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}

typedef  std::chrono::system_clock          GridClock;
typedef  std::chrono::time_point<GridClock> GridTimePoint;
typedef  std::chrono::milliseconds          GridTime;
typedef  std::chrono::microseconds          GridUsecs;

inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
{
@@ -55,29 +62,39 @@ class GridStopWatch {
private:
  bool running;
  GridTimePoint start;
  GridTime accumulator;
  GridUsecs accumulator;
public:
  GridStopWatch () {
    Reset();
  }
  void     Start(void) {
    assert(running == false);
#ifdef TIMERS_ON
    start = GridClock::now();
#endif
    running = true;
  }
  void     Stop(void)  {
    assert(running == true);
    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start);
#ifdef TIMERS_ON
    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
    running = false;
  };
  void     Reset(void){
    running = false;
#ifdef TIMERS_ON
    start = GridClock::now();
    accumulator = std::chrono::duration_cast<GridTime>(start-start);
#endif
    accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
  }
  GridTime Elapsed(void) {
    assert(running == false);
    return accumulator;
    return std::chrono::duration_cast<GridTime>( accumulator );
  }
  uint64_t useconds(void){
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
};
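Accumulation now happens in microseconds and is only converted to GridTime (milliseconds) on readout, so repeated Start/Stop pairs no longer round each interval down to whole milliseconds. Typical usage, in the style of the solver timers below; the function wrapper is only for illustration.

#include <Grid.h>

void timed_section(void)
{
  Grid::GridStopWatch SolverTimer;

  SolverTimer.Start();
  // ... matrix application or solver iteration ...
  SolverTimer.Stop();

  std::cout << Grid::GridLogMessage << "Time elapsed: " << SolverTimer.Elapsed()
            << " (" << SolverTimer.useconds() << " us)" << std::endl;
}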
@@ -147,6 +147,56 @@ namespace Grid {
 | 
			
		||||
      }
 | 
			
		||||
      Orthogonalise();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
 | 
			
		||||
    {
 | 
			
		||||
      // Run a Lanczos with sloppy convergence
 | 
			
		||||
	const int Nstop = nn;
 | 
			
		||||
	const int Nk = nn+20;
 | 
			
		||||
	const int Np = nn+20;
 | 
			
		||||
	const int Nm = Nk+Np;
 | 
			
		||||
	const int MaxIt= 10000;
 | 
			
		||||
	RealD resid = 1.0e-3;
 | 
			
		||||
 | 
			
		||||
	Chebyshev<FineField> Cheb(0.5,64.0,21);
 | 
			
		||||
	ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
 | 
			
		||||
	//	IRL.lock = 1;
 | 
			
		||||
 | 
			
		||||
	FineField noise(FineGrid); gaussian(RNG,noise);
 | 
			
		||||
	FineField tmp(FineGrid); 
 | 
			
		||||
	std::vector<RealD>     eval(Nm);
 | 
			
		||||
	std::vector<FineField> evec(Nm,FineGrid);
 | 
			
		||||
 | 
			
		||||
	int Nconv;
 | 
			
		||||
	IRL.calc(eval,evec,
 | 
			
		||||
		 noise,
 | 
			
		||||
		 Nconv);
 | 
			
		||||
 | 
			
		||||
    	// pull back nn vectors
 | 
			
		||||
	for(int b=0;b<nn;b++){
 | 
			
		||||
 | 
			
		||||
	  subspace[b]   = evec[b];
 | 
			
		||||
 | 
			
		||||
	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 | 
			
		||||
 | 
			
		||||
	  hermop.Op(subspace[b],tmp); 
 | 
			
		||||
	  std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	  noise = tmp -  sqrt(eval[b])*subspace[b] ;
 | 
			
		||||
 | 
			
		||||
	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	  noise = tmp +  eval[b]*subspace[b] ;
 | 
			
		||||
 | 
			
		||||
	  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
	Orthogonalise();
 | 
			
		||||
	for(int b=0;b<nn;b++){
 | 
			
		||||
	  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
 | 
			
		||||
 | 
			
		||||
      RealD scale;
 | 
			
		||||
@@ -200,7 +250,7 @@ namespace Grid {
 | 
			
		||||
    ////////////////////
 | 
			
		||||
    Geometry         geom;
 | 
			
		||||
    GridBase *       _grid; 
 | 
			
		||||
    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil; 
 | 
			
		||||
    CartesianStencil<siteVector,siteVector> Stencil; 
 | 
			
		||||
 | 
			
		||||
    std::vector<CoarseMatrix> A;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -222,6 +222,7 @@ namespace Grid {
 | 
			
		||||
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
 | 
			
		||||
      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
			
		||||
	Field tmp(in._grid);
 | 
			
		||||
//	std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 | 
			
		||||
 | 
			
		||||
	_Mat.Meooe(in,tmp);
 | 
			
		||||
	_Mat.MooeeInv(tmp,out);
 | 
			
		||||
@@ -251,10 +252,10 @@ namespace Grid {
 | 
			
		||||
      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
			
		||||
	Field tmp(in._grid);
 | 
			
		||||
 | 
			
		||||
	_Mat.Meooe(in,tmp);
 | 
			
		||||
	_Mat.MooeeInv(tmp,out);
 | 
			
		||||
	_Mat.Meooe(out,tmp);
 | 
			
		||||
	_Mat.MooeeInv(tmp,out);
 | 
			
		||||
	_Mat.Meooe(in,out);
 | 
			
		||||
	_Mat.MooeeInv(out,tmp);
 | 
			
		||||
	_Mat.Meooe(tmp,out);
 | 
			
		||||
	_Mat.MooeeInv(out,tmp);
 | 
			
		||||
 | 
			
		||||
	return axpy_norm(out,-1.0,tmp,in);
 | 
			
		||||
      }
 | 
			
		||||
@@ -270,6 +271,35 @@ namespace Grid {
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<class Matrix,class Field>
 | 
			
		||||
      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
 | 
			
		||||
    protected:
 | 
			
		||||
      Matrix &_Mat;
 | 
			
		||||
    public:
 | 
			
		||||
      SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
 | 
			
		||||
 | 
			
		||||
      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
			
		||||
	Field tmp(in._grid);
 | 
			
		||||
 | 
			
		||||
	_Mat.MooeeInv(in,out);
 | 
			
		||||
	_Mat.Meooe(out,tmp);
 | 
			
		||||
	_Mat.MooeeInv(tmp,out);
 | 
			
		||||
	_Mat.Meooe(out,tmp);
 | 
			
		||||
 | 
			
		||||
	return axpy_norm(out,-1.0,tmp,in);
 | 
			
		||||
      }
 | 
			
		||||
      virtual  RealD MpcDag   (const Field &in, Field &out){
 | 
			
		||||
	Field tmp(in._grid);
 | 
			
		||||
 | 
			
		||||
	_Mat.MeooeDag(in,out);
 | 
			
		||||
	_Mat.MooeeInvDag(out,tmp);
 | 
			
		||||
	_Mat.MeooeDag(tmp,out);
 | 
			
		||||
	_Mat.MooeeInvDag(out,tmp);
 | 
			
		||||
 | 
			
		||||
	return axpy_norm(out,-1.0,tmp,in);
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
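Both red-black operators expose the same Mpc/MpcDag interface (schematically Mpc = 1 - Mee^-1 Meo Moo^-1 Moe on one checkerboard), so they can be handed to an operator function such as the conjugate gradient used elsewhere in this library. The sketch below assumes the ConjugateGradient(tolerance, max-iterations) constructor seen in these sources; FermionMatrix and FermionField are placeholders, and this is not the full red-black solve, which also needs source preparation and solution reconstruction.

// Hedged sketch: wiring a Schur-decomposed operator into a Krylov solver.
template<class FermionMatrix, class FermionField>
void schur_solve_sketch(FermionMatrix &M, const FermionField &src_o, FermionField &sol_o)
{
  Grid::SchurDiagMooeeOperator<FermionMatrix,FermionField> HermOp(M); // even-odd preconditioned operator
  Grid::ConjugateGradient<FermionField> CG(1.0e-8,10000);             // tolerance, max iterations
  CG(HermOp,src_o,sol_o);                                             // CG drives MpcDagMpc internally
}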
 | 
			
		||||
 | 
			
		||||
    /////////////////////////////////////////////////////////////
 | 
			
		||||
    // Base classes for functions of operators
 | 
			
		||||
 
 | 
			
		||||
@@ -58,13 +58,14 @@ namespace Grid {
 | 
			
		||||
      Field Mtmp(in._grid);
 | 
			
		||||
      AtoN = in;
 | 
			
		||||
      out = AtoN*Coeffs[0];
 | 
			
		||||
      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
 | 
			
		||||
      //      std::cout <<"0 " <<norm2(out)<<std::endl;
 | 
			
		||||
//            std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
 | 
			
		||||
//            std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
 | 
			
		||||
      for(int n=1;n<Coeffs.size();n++){
 | 
			
		||||
	Mtmp = AtoN;
 | 
			
		||||
	Linop.HermOp(Mtmp,AtoN);
 | 
			
		||||
	out=out+AtoN*Coeffs[n];
 | 
			
		||||
	//	std::cout << n<<" " <<norm2(out)<<std::endl;
 | 
			
		||||
//            std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
 | 
			
		||||
//		std::cout << n<<" " <<norm2(out)<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
@@ -82,7 +83,8 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
    void csv(std::ostream &out){
 | 
			
		||||
      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
 | 
			
		||||
	RealD diff = hi-lo;
 | 
			
		||||
      for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
 | 
			
		||||
	RealD f = approx(x);
 | 
			
		||||
	out<< x<<" "<<f<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
@@ -99,10 +101,24 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    Chebyshev(){};
 | 
			
		||||
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
 | 
			
		||||
    Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// CJ: the one we need for Lanczos
 | 
			
		||||
    void Init(RealD _lo,RealD _hi,int _order)
 | 
			
		||||
    {
 | 
			
		||||
      lo=_lo;
 | 
			
		||||
      hi=_hi;
 | 
			
		||||
      order=_order;
 | 
			
		||||
      
 | 
			
		||||
      if(order < 2) exit(-1);
 | 
			
		||||
      Coeffs.resize(order);
 | 
			
		||||
      Coeffs.assign(0.,order);
 | 
			
		||||
      Coeffs[order-1] = 1.;
 | 
			
		||||
    };
 | 
			
		||||
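This order-only Init zeroes every coefficient except the last, so the resulting polynomial is the single Chebyshev term T_{order-1} mapped onto [lo,hi]; that is exactly the filter CreateSubspaceLanczos requests above with Chebyshev<FineField> Cheb(0.5,64.0,21). The function-pointer Init below instead fits an arbitrary function. A hedged sketch of both constructions follows; Field stands in for a concrete lattice field type and the windows are illustrative.

static Grid::RealD inverse_func(Grid::RealD x) { return 1.0/x; }

template<class Field>
void chebyshev_examples(void)
{
  // Pure T_20 filter on the spectral window [0.5,64], as used by the Lanczos subspace setup.
  Grid::Chebyshev<Field> Filter(0.5,64.0,21);

  // Chebyshev approximation to 1/x on the same window, 40 terms.
  Grid::Chebyshev<Field> Inverse(0.5,64.0,40,inverse_func);

  // Either object is applied through operator()(Linop,in,out), or tabulated with csv(std::cout).
}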
 | 
			
		||||
    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
 | 
			
		||||
    {
 | 
			
		||||
      lo=_lo;
 | 
			
		||||
@@ -182,6 +198,8 @@ namespace Grid {
 | 
			
		||||
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
			
		||||
 | 
			
		||||
      GridBase *grid=in._grid;
 | 
			
		||||
//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
 | 
			
		||||
//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
 | 
			
		||||
 | 
			
		||||
      int vol=grid->gSites();
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -16,9 +16,13 @@
 | 
			
		||||
#define INCLUDED_ALG_REMEZ_H
 | 
			
		||||
 | 
			
		||||
#include <stddef.h>
 | 
			
		||||
#include <Config.h>
 | 
			
		||||
 | 
			
		||||
//#include <algorithms/approx/bigfloat.h>
 | 
			
		||||
#ifdef HAVE_GMP_H
 | 
			
		||||
#include <algorithms/approx/bigfloat.h>
 | 
			
		||||
#else
 | 
			
		||||
#include <algorithms/approx/bigfloat_double.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
 | 
			
		||||
#define SUM_MAX 10 // Maximum number of terms in exponential
 | 
			
		||||
 
 | 
			
		||||
@@ -84,7 +84,7 @@ public:
 | 
			
		||||
	return;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
 | 
			
		||||
      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
 | 
			
		||||
 | 
			
		||||
      GridStopWatch LinalgTimer;
 | 
			
		||||
      GridStopWatch MatrixTimer;
 | 
			
		||||
@@ -101,8 +101,8 @@ public:
 | 
			
		||||
	MatrixTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	LinalgTimer.Start();
 | 
			
		||||
	RealD    qqck = norm2(mmp);
 | 
			
		||||
	ComplexD dck  = innerProduct(p,mmp);
 | 
			
		||||
	//	RealD    qqck = norm2(mmp);
 | 
			
		||||
	//	ComplexD dck  = innerProduct(p,mmp);
 | 
			
		||||
      
 | 
			
		||||
	a      = c/d;
 | 
			
		||||
	b_pred = a*(a*qq-d)/c;
 | 
			
		||||
@@ -115,7 +115,7 @@ public:
 | 
			
		||||
	p  = p*b+r;
 | 
			
		||||
	  
 | 
			
		||||
	LinalgTimer.Stop();
 | 
			
		||||
	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
			
		||||
	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
 | 
			
		||||
	
 | 
			
		||||
	// Stopping condition
 | 
			
		||||
	if ( cp <= rsq ) { 
 | 
			
		||||
@@ -132,9 +132,9 @@ public:
 | 
			
		||||
 | 
			
		||||
	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 | 
			
		||||
		   <<" computed residual "<<sqrt(cp/ssq)
 | 
			
		||||
		   <<" true residual     "<<true_residual
 | 
			
		||||
		   <<" target "<<Tolerance;
 | 
			
		||||
	  std::cout<<" Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 | 
			
		||||
		   <<" true residual "    <<true_residual
 | 
			
		||||
		   <<" target "<<Tolerance<<std::endl;
 | 
			
		||||
	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 | 
			
		||||
	  std::cout<<std::endl;
 | 
			
		||||
	  
 | 
			
		||||
	  assert(true_residual/Tolerance < 1000.0);
 | 
			
		||||
 
 | 
			
		||||
@@ -274,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 | 
			
		||||
  }
 | 
			
		||||
  // ugly hack
 | 
			
		||||
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
 | 
			
		||||
  assert(0);
 | 
			
		||||
//  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 
 | 
			
		||||
@@ -38,32 +38,34 @@ template<class Field>
 | 
			
		||||
class SortEigen {
 | 
			
		||||
 private:
 | 
			
		||||
  
 | 
			
		||||
//hacking for testing for now
 | 
			
		||||
 private:
 | 
			
		||||
  static bool less_lmd(RealD left,RealD right){
 | 
			
		||||
    return fabs(left) < fabs(right);
 | 
			
		||||
    return left > right;
 | 
			
		||||
  }  
 | 
			
		||||
  static bool less_pair(std::pair<RealD,Field>& left,
 | 
			
		||||
		 std::pair<RealD,Field>& right){
 | 
			
		||||
    return fabs(left.first) < fabs(right.first);
 | 
			
		||||
  static bool less_pair(std::pair<RealD,Field const*>& left,
 | 
			
		||||
                        std::pair<RealD,Field const*>& right){
 | 
			
		||||
    return left.first > (right.first);
 | 
			
		||||
  }  
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
 public:
 | 
			
		||||
 | 
			
		||||
  void push(DenseVector<RealD>& lmd,
 | 
			
		||||
	    DenseVector<Field>& evec,int N) {
 | 
			
		||||
            DenseVector<Field>& evec,int N) {
 | 
			
		||||
    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
 | 
			
		||||
    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
 | 
			
		||||
    
 | 
			
		||||
    DenseVector<std::pair<RealD, Field> > emod;
 | 
			
		||||
    typename DenseVector<std::pair<RealD, Field> >::iterator it;
 | 
			
		||||
    
 | 
			
		||||
    for(int i=0;i<lmd.size();++i){
 | 
			
		||||
      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
 | 
			
		||||
    }
 | 
			
		||||
    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
 | 
			
		||||
    for(int i=0;i<lmd.size();++i)
 | 
			
		||||
      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
 | 
			
		||||
 | 
			
		||||
    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
 | 
			
		||||
 | 
			
		||||
    it=emod.begin();
 | 
			
		||||
    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
 | 
			
		||||
    for(int i=0;i<N;++i){
 | 
			
		||||
      lmd[i]=it->first;
 | 
			
		||||
      evec[i]=it->second;
 | 
			
		||||
      evec[i]=*(it->second);
 | 
			
		||||
      ++it;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -29,6 +29,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifndef GRID_IRL_H
 | 
			
		||||
#define GRID_IRL_H
 | 
			
		||||
 | 
			
		||||
#include <string.h> //memset
 | 
			
		||||
#ifdef USE_LAPACK
 | 
			
		||||
#include <lapacke.h>
 | 
			
		||||
#endif
 | 
			
		||||
#include <algorithms/iterative/DenseMatrix.h>
 | 
			
		||||
#include <algorithms/iterative/EigenSort.h>
 | 
			
		||||
 | 
			
		||||
@@ -49,6 +53,7 @@ public:
 | 
			
		||||
    int Niter;
 | 
			
		||||
    int converged;
 | 
			
		||||
 | 
			
		||||
    int Nstop;   // Number of evecs checked for convergence
 | 
			
		||||
    int Nk;      // Number of converged sought
 | 
			
		||||
    int Np;      // Np -- Number of spare vecs in kryloc space
 | 
			
		||||
    int Nm;      // Nm -- total number of vectors
 | 
			
		||||
@@ -57,6 +62,8 @@ public:
 | 
			
		||||
 | 
			
		||||
    SortEigen<Field> _sort;
 | 
			
		||||
 | 
			
		||||
//    GridCartesian &_fgrid;
 | 
			
		||||
 | 
			
		||||
    LinearOperatorBase<Field> &_Linop;
 | 
			
		||||
 | 
			
		||||
    OperatorFunction<Field>   &_poly;
 | 
			
		||||
@@ -67,7 +74,27 @@ public:
 | 
			
		||||
    void init(void){};
 | 
			
		||||
    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
 | 
			
		||||
 | 
			
		||||
    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 | 
			
		||||
    ImplicitlyRestartedLanczos(
 | 
			
		||||
				LinearOperatorBase<Field> &Linop, // op
 | 
			
		||||
			       OperatorFunction<Field> & poly,   // polynmial
 | 
			
		||||
			       int _Nstop, // sought vecs
 | 
			
		||||
			       int _Nk, // sought vecs
 | 
			
		||||
			       int _Nm, // spare vecs
 | 
			
		||||
			       RealD _eresid, // resid in lmdue deficit 
 | 
			
		||||
			       int _Niter) : // Max iterations
 | 
			
		||||
      _Linop(Linop),
 | 
			
		||||
      _poly(poly),
 | 
			
		||||
      Nstop(_Nstop),
 | 
			
		||||
      Nk(_Nk),
 | 
			
		||||
      Nm(_Nm),
 | 
			
		||||
      eresid(_eresid),
 | 
			
		||||
      Niter(_Niter)
 | 
			
		||||
    { 
 | 
			
		||||
      Np = Nm-Nk; assert(Np>0);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    ImplicitlyRestartedLanczos(
 | 
			
		||||
				LinearOperatorBase<Field> &Linop, // op
 | 
			
		||||
			       OperatorFunction<Field> & poly,   // polynmial
 | 
			
		||||
			       int _Nk, // sought vecs
 | 
			
		||||
			       int _Nm, // spare vecs
 | 
			
		||||
@@ -75,6 +102,7 @@ public:
 | 
			
		||||
			       int _Niter) : // Max iterations
 | 
			
		||||
      _Linop(Linop),
 | 
			
		||||
      _poly(poly),
 | 
			
		||||
      Nstop(_Nk),
 | 
			
		||||
      Nk(_Nk),
 | 
			
		||||
      Nm(_Nm),
 | 
			
		||||
      eresid(_eresid),
 | 
			
		||||
@@ -142,10 +170,11 @@ public:
 | 
			
		||||
      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 | 
			
		||||
                                 // 7. vk+1 := wk/βk+1
 | 
			
		||||
 | 
			
		||||
//	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
 | 
			
		||||
      const RealD tiny = 1.0e-20;
 | 
			
		||||
      if ( beta < tiny ) { 
 | 
			
		||||
	std::cout << " beta is tiny "<<beta<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
     }
 | 
			
		||||
      lmd[k] = alph;
 | 
			
		||||
      lme[k]  = beta;
 | 
			
		||||
 | 
			
		||||
@@ -219,15 +248,122 @@ public:
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#ifdef USE_LAPACK
 | 
			
		||||
    void diagonalize_lapack(DenseVector<RealD>& lmd,
 | 
			
		||||
		     DenseVector<RealD>& lme, 
 | 
			
		||||
		     int N1,
 | 
			
		||||
		     int N2,
 | 
			
		||||
		     DenseVector<RealD>& Qt,
 | 
			
		||||
		     GridBase *grid){
 | 
			
		||||
  const int size = Nm;
 | 
			
		||||
//  tevals.resize(size);
 | 
			
		||||
//  tevecs.resize(size);
 | 
			
		||||
  int NN = N1;
 | 
			
		||||
  double evals_tmp[NN];
 | 
			
		||||
  double evec_tmp[NN][NN];
 | 
			
		||||
  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
 | 
			
		||||
//  double AA[NN][NN];
 | 
			
		||||
  double DD[NN];
 | 
			
		||||
  double EE[NN];
 | 
			
		||||
  for (int i = 0; i< NN; i++)
 | 
			
		||||
    for (int j = i - 1; j <= i + 1; j++)
 | 
			
		||||
      if ( j < NN && j >= 0 ) {
 | 
			
		||||
        if (i==j) DD[i] = lmd[i];
 | 
			
		||||
        if (i==j) evals_tmp[i] = lmd[i];
 | 
			
		||||
        if (j==(i-1)) EE[j] = lme[j];
 | 
			
		||||
      }
 | 
			
		||||
  int evals_found;
 | 
			
		||||
  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
 | 
			
		||||
  int liwork =  3+NN*10 ;
 | 
			
		||||
  int iwork[liwork];
 | 
			
		||||
  double work[lwork];
 | 
			
		||||
  int isuppz[2*NN];
 | 
			
		||||
  char jobz = 'V'; // calculate evals & evecs
 | 
			
		||||
  char range = 'I'; // calculate all evals
 | 
			
		||||
  //    char range = 'A'; // calculate all evals
 | 
			
		||||
  char uplo = 'U'; // refer to upper half of original matrix
 | 
			
		||||
  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
 | 
			
		||||
  int ifail[NN];
 | 
			
		||||
  int info;
 | 
			
		||||
//  int total = QMP_get_number_of_nodes();
 | 
			
		||||
//  int node = QMP_get_node_number();
 | 
			
		||||
//  GridBase *grid = evec[0]._grid;
 | 
			
		||||
  int total = grid->_Nprocessors;
 | 
			
		||||
  int node = grid->_processor;
 | 
			
		||||
  int interval = (NN/total)+1;
 | 
			
		||||
  double vl = 0.0, vu = 0.0;
 | 
			
		||||
  int il = interval*node+1 , iu = interval*(node+1);
 | 
			
		||||
  if (iu > NN)  iu=NN;
 | 
			
		||||
  double tol = 0.0;
 | 
			
		||||
    if (1) {
 | 
			
		||||
      memset(evals_tmp,0,sizeof(double)*NN);
 | 
			
		||||
      if ( il <= NN){
 | 
			
		||||
        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
 | 
			
		||||
        LAPACK_dstegr(&jobz, &range, &NN,
 | 
			
		||||
            (double*)DD, (double*)EE,
 | 
			
		||||
            &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
 | 
			
		||||
            &tol, // tolerance
 | 
			
		||||
            &evals_found, evals_tmp, (double*)evec_tmp, &NN,
 | 
			
		||||
            isuppz,
 | 
			
		||||
            work, &lwork, iwork, &liwork,
 | 
			
		||||
            &info);
 | 
			
		||||
        for (int i = iu-1; i>= il-1; i--){
 | 
			
		||||
          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
 | 
			
		||||
          evals_tmp[i] = evals_tmp[i - (il-1)];
 | 
			
		||||
          if (il>1) evals_tmp[i-(il-1)]=0.;
 | 
			
		||||
          for (int j = 0; j< NN; j++){
 | 
			
		||||
            evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
 | 
			
		||||
            if (il>1) evec_tmp[i-(il-1)][j]=0.;
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      {
 | 
			
		||||
//        QMP_sum_double_array(evals_tmp,NN);
 | 
			
		||||
//        QMP_sum_double_array((double *)evec_tmp,NN*NN);
 | 
			
		||||
         grid->GlobalSumVector(evals_tmp,NN);
 | 
			
		||||
         grid->GlobalSumVector((double*)evec_tmp,NN*NN);
 | 
			
		||||
      }
 | 
			
		||||
    } 
 | 
			
		||||
// Cheating a bit here: it would be better to sort rather than simply reverse, but the routine's documentation says the evals come back in increasing order, while the QR path produces them in decreasing order.
 | 
			
		||||
  for(int i=0;i<NN;i++){
 | 
			
		||||
    for(int j=0;j<NN;j++)
      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
    lmd [NN-1-i]=evals_tmp[i];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#endif
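// A sketch (not the Grid API) of the bookkeeping used in diagonalize_lapack above:
// each rank asks dstegr only for the eigenvalue indices [il,iu], writes them into an
// otherwise-zero buffer, a global element-wise sum reassembles the spectrum (each index
// is owned by exactly one rank), and the ascending LAPACK ordering is finally reversed
// to match the descending order produced by the QR path.
#include <vector>
#include <algorithm>

struct IndexInterval { int il, iu; };                 // 1-based, as LAPACK expects

inline IndexInterval rank_interval(int NN, int nranks, int rank){
  int interval = (NN/nranks) + 1;
  int il = interval*rank + 1;
  int iu = std::min(interval*(rank+1), NN);
  return {il, iu};
}

// After the global sum, merged[i] holds the i-th smallest eigenvalue; copy it out in
// descending order, as the loop above does for lmd and the rows of Qt.
inline void reverse_into(const std::vector<double> &merged, std::vector<double> &lmd){
  const int NN = merged.size();
  for (int i=0; i<NN; i++) lmd[NN-1-i] = merged[i];
}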
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    void diagonalize(DenseVector<RealD>& lmd,
 | 
			
		||||
		     DenseVector<RealD>& lme, 
 | 
			
		||||
		     int Nm2,
 | 
			
		||||
		     int Nm,
 | 
			
		||||
		     DenseVector<RealD>& Qt)
 | 
			
		||||
		     int N2,
 | 
			
		||||
		     int N1,
 | 
			
		||||
		     DenseVector<RealD>& Qt,
 | 
			
		||||
		     GridBase *grid)
 | 
			
		||||
    {
 | 
			
		||||
      int Niter = 100*Nm;
 | 
			
		||||
 | 
			
		||||
#ifdef USE_LAPACK
 | 
			
		||||
    const int check_lapack=0; // just use lapack if 0, check against lapack if 1
 | 
			
		||||
 | 
			
		||||
    if(!check_lapack)
 | 
			
		||||
	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
 | 
			
		||||
 | 
			
		||||
	DenseVector <RealD> lmd2(N1);
 | 
			
		||||
	DenseVector <RealD> lme2(N1);
 | 
			
		||||
	DenseVector<RealD> Qt2(N1*N1);
 | 
			
		||||
         for(int k=0; k<N1; ++k){
 | 
			
		||||
	    lmd2[k] = lmd[k];
 | 
			
		||||
	    lme2[k] = lme[k];
 | 
			
		||||
	  }
 | 
			
		||||
         for(int k=0; k<N1*N1; ++k)
 | 
			
		||||
	Qt2[k] = Qt[k];
 | 
			
		||||
 | 
			
		||||
//	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
      int Niter = 100*N1;
 | 
			
		||||
      int kmin = 1;
 | 
			
		||||
      int kmax = Nk;
 | 
			
		||||
      int kmax = N2;
 | 
			
		||||
      // (this should be more sophisticated)
 | 
			
		||||
 | 
			
		||||
      for(int iter=0; iter<Niter; ++iter){
 | 
			
		||||
@@ -239,7 +375,7 @@ public:
 | 
			
		||||
	// (Dsh: shift)
 | 
			
		||||
	
 | 
			
		||||
	// transformation
 | 
			
		||||
	qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
 | 
			
		||||
	qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
 | 
			
		||||
	
 | 
			
		||||
	// Convergence criterion (redef of kmin and kmax)
 | 
			
		||||
	for(int j=kmax-1; j>= kmin; --j){
 | 
			
		||||
@@ -250,6 +386,23 @@ public:
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	Niter = iter;
 | 
			
		||||
#ifdef USE_LAPACK
 | 
			
		||||
    if(check_lapack){
 | 
			
		||||
	const double SMALL=1e-8;
 | 
			
		||||
	diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
 | 
			
		||||
	DenseVector <RealD> lmd3(N2);
 | 
			
		||||
         for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
 | 
			
		||||
        _sort.push(lmd3,N2);
 | 
			
		||||
        _sort.push(lmd2,N2);
 | 
			
		||||
         for(int k=0; k<N2; ++k){
 | 
			
		||||
	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
 | 
			
		||||
//	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
         for(int k=0; k<N1*N1; ++k){
 | 
			
		||||
//	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
	return;
 | 
			
		||||
 | 
			
		||||
      continued:
 | 
			
		||||
@@ -265,6 +418,7 @@ public:
 | 
			
		||||
      abort();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#if 1
 | 
			
		||||
    static RealD normalise(Field& v) 
 | 
			
		||||
    {
 | 
			
		||||
      RealD nn = norm2(v);
 | 
			
		||||
@@ -326,6 +480,7 @@ until convergence
 | 
			
		||||
      {
 | 
			
		||||
 | 
			
		||||
	GridBase *grid = evec[0]._grid;
 | 
			
		||||
	assert(grid == src._grid);
 | 
			
		||||
 | 
			
		||||
	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 | 
			
		||||
	std::cout << " -- Nm = " << Nm << std::endl;
 | 
			
		||||
@@ -356,11 +511,21 @@ until convergence
 | 
			
		||||
	// (uniform vector) Why not src??
 | 
			
		||||
	//	evec[0] = 1.0;
 | 
			
		||||
	evec[0] = src;
 | 
			
		||||
	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
 | 
			
		||||
// << src._grid  << std::endl;
 | 
			
		||||
	normalise(evec[0]);
 | 
			
		||||
	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
 | 
			
		||||
// << evec[0]._grid << std::endl;
 | 
			
		||||
	
 | 
			
		||||
	// Initial Nk steps
 | 
			
		||||
	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
 | 
			
		||||
//	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
 | 
			
		||||
//	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 | 
			
		||||
	RitzMatrix(evec,Nk);
 | 
			
		||||
	for(int k=0; k<Nk; ++k){
 | 
			
		||||
//	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
 | 
			
		||||
//	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Restarting loop begins
 | 
			
		||||
	for(int iter = 0; iter<Niter; ++iter){
 | 
			
		||||
@@ -382,20 +547,24 @@ until convergence
 | 
			
		||||
	    lme2[k] = lme[k+k1-1];
 | 
			
		||||
	  }
 | 
			
		||||
	  setUnit_Qt(Nm,Qt);
 | 
			
		||||
	  diagonalize(eval2,lme2,Nm,Nm,Qt);
 | 
			
		||||
	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
 | 
			
		||||
 | 
			
		||||
	  // sorting
 | 
			
		||||
	  _sort.push(eval2,Nm);
 | 
			
		||||
	  
 | 
			
		||||
	  // Implicitly shifted QR transformations
 | 
			
		||||
	  setUnit_Qt(Nm,Qt);
 | 
			
		||||
	  for(int ip=k2; ip<Nm; ++ip) 
 | 
			
		||||
	  for(int ip=k2; ip<Nm; ++ip){ 
 | 
			
		||||
	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 | 
			
		||||
	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
 | 
			
		||||
		
 | 
			
		||||
	}
 | 
			
		||||
    
 | 
			
		||||
	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 | 
			
		||||
	  
 | 
			
		||||
	  for(int j=k1-1; j<k2+1; ++j){
 | 
			
		||||
	    for(int k=0; k<Nm; ++k){
 | 
			
		||||
	      B[j].checkerboard = evec[k].checkerboard;
 | 
			
		||||
	      B[j] += Qt[k+Nm*j] * evec[k];
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
@@ -418,21 +587,25 @@ until convergence
 | 
			
		||||
	    lme2[k] = lme[k];
 | 
			
		||||
	  }
 | 
			
		||||
	  setUnit_Qt(Nm,Qt);
 | 
			
		||||
	  diagonalize(eval2,lme2,Nk,Nm,Qt);
 | 
			
		||||
	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
 | 
			
		||||
	  
 | 
			
		||||
	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
 | 
			
		||||
	  
 | 
			
		||||
	  for(int j = 0; j<Nk; ++j){
 | 
			
		||||
	    for(int k = 0; k<Nk; ++k){
 | 
			
		||||
	      B[j].checkerboard = evec[k].checkerboard;
 | 
			
		||||
	      B[j] += Qt[k+j*Nm] * evec[k];
 | 
			
		||||
	    }
 | 
			
		||||
//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
//	_sort.push(eval2,B,Nk);
 | 
			
		||||
 | 
			
		||||
	  Nconv = 0;
 | 
			
		||||
	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 | 
			
		||||
	  for(int i=0; i<Nk; ++i){
 | 
			
		||||
 | 
			
		||||
	    _poly(_Linop,B[i],v);
 | 
			
		||||
//	    _poly(_Linop,B[i],v);
 | 
			
		||||
	    _Linop.HermOp(B[i],v);
 | 
			
		||||
	    
 | 
			
		||||
	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 | 
			
		||||
	    RealD vden = norm2(B[i]);
 | 
			
		||||
@@ -440,11 +613,13 @@ until convergence
 | 
			
		||||
	    v -= eval2[i]*B[i];
 | 
			
		||||
	    RealD vv = norm2(v);
 | 
			
		||||
	    
 | 
			
		||||
	    std::cout.precision(13);
 | 
			
		||||
	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
			
		||||
	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 | 
			
		||||
	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
 | 
			
		||||
	    
 | 
			
		||||
	    if(vv<eresid*eresid){
 | 
			
		||||
	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
 | 
			
		||||
	    if((vv<eresid*eresid) && (i == Nconv) ){
 | 
			
		||||
	      Iconv[Nconv] = i;
 | 
			
		||||
	      ++Nconv;
 | 
			
		||||
	    }
 | 
			
		||||
@@ -455,7 +630,7 @@ until convergence
 | 
			
		||||
 | 
			
		||||
	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
 | 
			
		||||
 | 
			
		||||
	  if( Nconv>=Nk ){
 | 
			
		||||
	  if( Nconv>=Nstop ){
 | 
			
		||||
	    goto converged;
 | 
			
		||||
	  }
 | 
			
		||||
	} // end of iter loop
 | 
			
		||||
@@ -464,21 +639,20 @@ until convergence
 | 
			
		||||
	abort();
 | 
			
		||||
	
 | 
			
		||||
      converged:
 | 
			
		||||
	// Sorting
 | 
			
		||||
       // Sorting
 | 
			
		||||
       eval.resize(Nconv);
 | 
			
		||||
       evec.resize(Nconv,grid);
 | 
			
		||||
       for(int i=0; i<Nconv; ++i){
 | 
			
		||||
         eval[i] = eval2[Iconv[i]];
 | 
			
		||||
         evec[i] = B[Iconv[i]];
 | 
			
		||||
       }
 | 
			
		||||
      _sort.push(eval,evec,Nconv);
 | 
			
		||||
 | 
			
		||||
	eval.clear();
 | 
			
		||||
	evec.clear();
 | 
			
		||||
	for(int i=0; i<Nconv; ++i){
 | 
			
		||||
	  eval.push_back(eval2[Iconv[i]]);
 | 
			
		||||
	  evec.push_back(B[Iconv[i]]);
 | 
			
		||||
	}
 | 
			
		||||
	_sort.push(eval,evec,Nconv);
 | 
			
		||||
	
 | 
			
		||||
	std::cout << "\n Converged\n Summary :\n";
 | 
			
		||||
	std::cout << " -- Iterations  = "<< Nconv  << "\n";
 | 
			
		||||
	std::cout << " -- beta(k)     = "<< beta_k << "\n";
 | 
			
		||||
	std::cout << " -- Nconv       = "<< Nconv  << "\n";
 | 
			
		||||
      }
 | 
			
		||||
      std::cout << "\n Converged\n Summary :\n";
 | 
			
		||||
      std::cout << " -- Iterations  = "<< Nconv  << "\n";
 | 
			
		||||
      std::cout << " -- beta(k)     = "<< beta_k << "\n";
 | 
			
		||||
      std::cout << " -- Nconv       = "<< Nconv  << "\n";
 | 
			
		||||
     }
 | 
			
		||||
 | 
			
		||||
    /////////////////////////////////////////////////
 | 
			
		||||
    // Adapted from Rudy's lanczos factor routine
 | 
			
		||||
@@ -1025,6 +1199,7 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
@@ -47,6 +47,10 @@ namespace Grid {
 | 
			
		||||
    int mmax;
 | 
			
		||||
    int nstep;
 | 
			
		||||
    int steps;
 | 
			
		||||
    GridStopWatch PrecTimer;
 | 
			
		||||
    GridStopWatch MatTimer;
 | 
			
		||||
    GridStopWatch LinalgTimer;
 | 
			
		||||
 | 
			
		||||
    LinearFunction<Field> &Preconditioner;
 | 
			
		||||
 | 
			
		||||
   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
 | 
			
		||||
@@ -68,14 +72,24 @@ namespace Grid {
 | 
			
		||||
      
 | 
			
		||||
      Field r(src._grid);
 | 
			
		||||
 | 
			
		||||
        PrecTimer.Reset();
 | 
			
		||||
         MatTimer.Reset();
 | 
			
		||||
      LinalgTimer.Reset();
 | 
			
		||||
 | 
			
		||||
      GridStopWatch SolverTimer;
 | 
			
		||||
      SolverTimer.Start();
 | 
			
		||||
 | 
			
		||||
      steps=0;
 | 
			
		||||
      for(int k=0;k<MaxIterations;k++){
 | 
			
		||||
 | 
			
		||||
	cp=GCRnStep(Linop,src,psi,rsq);
 | 
			
		||||
 | 
			
		||||
	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 | 
			
		||||
	std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 | 
			
		||||
 | 
			
		||||
	if(cp<rsq) {
 | 
			
		||||
 | 
			
		||||
	  SolverTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	  Linop.HermOp(psi,r);
 | 
			
		||||
	  axpy(r,-1.0,src,r);
 | 
			
		||||
	  RealD tr = norm2(r);
 | 
			
		||||
@@ -83,6 +97,11 @@ namespace Grid {
 | 
			
		||||
		   << " computed residual "<<sqrt(cp/ssq)
 | 
			
		||||
	           << " true residual "    <<sqrt(tr/ssq)
 | 
			
		||||
	           << " target "           <<Tolerance <<std::endl;
 | 
			
		||||
 | 
			
		||||
	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
 | 
			
		||||
	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
 | 
			
		||||
	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
 | 
			
		||||
	  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 | 
			
		||||
	  return;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
@@ -90,6 +109,7 @@ namespace Grid {
 | 
			
		||||
      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
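// An outline of the solver control flow above as a hedged sketch with plain
// std::vector<double> (not the Grid types): each outer pass runs one preconditioned
// n-step GCR cycle, and when the recursive residual passes the target the true
// residual b - A x is recomputed before success is declared, the same safeguard
// taken above.
#include <vector>
#include <functional>
#include <cmath>
#include <cstdio>

using Vec = std::vector<double>;
static double norm2v(const Vec &v){ double s=0; for(double x : v) s += x*x; return s; }

bool restarted_gcr(const std::function<void(const Vec&,Vec&)> &A,          // y = A x
                   const std::function<double(const Vec&,Vec&)> &cycle,    // one GCR cycle, returns cp
                   const Vec &b, Vec &x, double tol, int max_restarts){
  double ssq = norm2v(b), rsq = tol*tol*ssq;
  for (int k=0; k<max_restarts; k++){
    double cp = cycle(b, x);                          // recursive residual after the cycle
    if (cp < rsq){
      Vec Ax(b.size()), r(b.size());
      A(x, Ax);
      for (size_t i=0; i<r.size(); i++) r[i] = b[i] - Ax[i];   // true residual
      std::printf("computed %g  true %g  target %g\n",
                  std::sqrt(cp/ssq), std::sqrt(norm2v(r)/ssq), tol);
      return true;
    }
  }
  return false;                                       // did not converge; caller aborts, as above
}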
 | 
			
		||||
 | 
			
		||||
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
 | 
			
		||||
 | 
			
		||||
      RealD cp;
 | 
			
		||||
@@ -116,24 +136,25 @@ namespace Grid {
 | 
			
		||||
      // initial guess x0 is taken as nonzero.
 | 
			
		||||
      // r0=src-A x0 = src
 | 
			
		||||
      //////////////////////////////////
 | 
			
		||||
      MatTimer.Start();
 | 
			
		||||
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
 | 
			
		||||
      MatTimer.Stop();
 | 
			
		||||
      r=src-Az;
 | 
			
		||||
      
 | 
			
		||||
      /////////////////////
 | 
			
		||||
      // p = Prec(r)
 | 
			
		||||
      /////////////////////
 | 
			
		||||
      PrecTimer.Start();
 | 
			
		||||
      Preconditioner(r,z);
 | 
			
		||||
      PrecTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
 | 
			
		||||
      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
 | 
			
		||||
      
 | 
			
		||||
      MatTimer.Start();
 | 
			
		||||
      Linop.HermOp(z,tmp); 
 | 
			
		||||
      MatTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
 | 
			
		||||
      ttmp=tmp;
 | 
			
		||||
      tmp=tmp-r;
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 | 
			
		||||
      /*
 | 
			
		||||
      std::cout<<GridLogMessage<<r<<std::endl;
 | 
			
		||||
      std::cout<<GridLogMessage<<z<<std::endl;
 | 
			
		||||
@@ -141,7 +162,9 @@ namespace Grid {
 | 
			
		||||
      std::cout<<GridLogMessage<<tmp<<std::endl;
 | 
			
		||||
      */
 | 
			
		||||
 | 
			
		||||
      MatTimer.Start();
 | 
			
		||||
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
 | 
			
		||||
      MatTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      //p[0],q[0],qq[0] 
 | 
			
		||||
      p[0]= z;
 | 
			
		||||
@@ -165,18 +188,22 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
	cp = axpy_norm(r,-a,q[peri_k],r);  
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
 | 
			
		||||
	if((k==nstep-1)||(cp<rsq)){
 | 
			
		||||
	  return cp;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 
 | 
			
		||||
 | 
			
		||||
	PrecTimer.Start();
 | 
			
		||||
	Preconditioner(r,z);// solve Az = r
 | 
			
		||||
	PrecTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	MatTimer.Start();
 | 
			
		||||
	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	Linop.HermOp(z,tmp);
 | 
			
		||||
	MatTimer.Stop();
 | 
			
		||||
        tmp=tmp-r;
 | 
			
		||||
	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 | 
			
		||||
	std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 | 
			
		||||
 | 
			
		||||
	q[peri_kp]=Az;
 | 
			
		||||
	p[peri_kp]=z;
 | 
			
		||||
 
 | 
			
		||||
@@ -102,6 +102,8 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      pickCheckerboard(Even,src_e,in);
 | 
			
		||||
      pickCheckerboard(Odd ,src_o,in);
 | 
			
		||||
      pickCheckerboard(Even,sol_e,out);
 | 
			
		||||
      pickCheckerboard(Odd ,sol_o,out);
 | 
			
		||||
    
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
 | 
			
		||||
 
 | 
			
		||||
@@ -115,27 +115,11 @@ public:
 | 
			
		||||
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
 | 
			
		||||
      return idx;
 | 
			
		||||
    }
 | 
			
		||||
    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
 | 
			
		||||
      int nd= dims.size();
 | 
			
		||||
      coor.resize(nd);
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	coor[d] = index % dims[d];
 | 
			
		||||
	index   = index / dims[d];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
 | 
			
		||||
      CoorFromIndex(coor,Oindex,_rdimensions);
 | 
			
		||||
    }
 | 
			
		||||
    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
 | 
			
		||||
      int nd=dims.size();
 | 
			
		||||
      int stride=1;
 | 
			
		||||
      index=0;
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	index = index+stride*coor[d];
 | 
			
		||||
	stride=stride*dims[d];
 | 
			
		||||
      }
 | 
			
		||||
      Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
 | 
			
		||||
    }
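// The refactor above replaces per-class copies of the index<->coordinate helpers with the
// shared Lexicographic utility. A minimal sketch of what the two mappings do (first
// dimension runs fastest); illustrative only, using plain std::vector.
#include <vector>

inline void coor_from_index(std::vector<int> &coor, int index, const std::vector<int> &dims){
  coor.resize(dims.size());
  for (size_t d=0; d<dims.size(); d++){
    coor[d] = index % dims[d];
    index   = index / dims[d];
  }
}
inline void index_from_coor(const std::vector<int> &coor, int &index, const std::vector<int> &dims){
  index = 0;
  int stride = 1;
  for (size_t d=0; d<dims.size(); d++){
    index  += stride*coor[d];
    stride *= dims[d];
  }
}
// e.g. dims = {4,4,4,8}: index 5 maps to coor {1,1,0,0}, and index_from_coor inverts it.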
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////////////
 | 
			
		||||
    // SIMD lane addressing
 | 
			
		||||
    //////////////////////////////////////////////////////////
 | 
			
		||||
@@ -147,13 +131,32 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
 | 
			
		||||
    {
 | 
			
		||||
      CoorFromIndex(coor,lane,_simd_layout);
 | 
			
		||||
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
 | 
			
		||||
    }
 | 
			
		||||
    inline int PermuteDim(int dimension){
 | 
			
		||||
      return _simd_layout[dimension]>1;
 | 
			
		||||
    }
 | 
			
		||||
    inline int PermuteType(int dimension){
 | 
			
		||||
      int permute_type=0;
 | 
			
		||||
      //
 | 
			
		||||
      // FIXME:
 | 
			
		||||
      //
 | 
			
		||||
      // Best way to encode this would be to present a mask 
 | 
			
		||||
      // for which simd dimensions are rotated, and the rotation
 | 
			
		||||
      // size. If there is only one simd dimension rotated, this is just 
 | 
			
		||||
      // a permute. 
 | 
			
		||||
      //
 | 
			
		||||
      // Cases: PermuteType == 1,2,4,8
 | 
			
		||||
      // Distance should be either 0,1,2..
 | 
			
		||||
      //
 | 
			
		||||
      if ( _simd_layout[dimension] > 2 ) { 
 | 
			
		||||
	for(int d=0;d<_ndimension;d++){
 | 
			
		||||
	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
 | 
			
		||||
	}
 | 
			
		||||
	permute_type = RotateBit; // How to specify distance; this is not just direction.
 | 
			
		||||
	return permute_type;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      for(int d=_ndimension-1;d>dimension;d--){
 | 
			
		||||
	if (_simd_layout[d]>1 ) permute_type++;
 | 
			
		||||
      }
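// A restatement of the permute-type encoding just computed, as a free function (hedged,
// illustrative; not the Grid member). With at most two SIMD lanes per dimension the type
// is simply the number of vectorised dimensions above the shifted one, so the permute
// distance is 2^type lanes; a wider layout in this dimension falls back to a rotate.
#include <vector>
#include <cassert>

inline int permute_type_of(const std::vector<int> &simd_layout, int dimension, int RotateBit){
  const int nd = simd_layout.size();
  if (simd_layout[dimension] > 2){
    for (int d=0; d<nd; d++)
      if (d != dimension) assert(simd_layout[d] == 1);   // only one rotated dimension allowed
    return RotateBit;                                    // rotation distance is carried separately
  }
  int type = 0;
  for (int d=nd-1; d>dimension; d--)
    if (simd_layout[d] > 1) type++;
  return type;
}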
 | 
			
		||||
@@ -163,12 +166,12 @@ public:
 | 
			
		||||
    // Array sizing queries
 | 
			
		||||
    ////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    inline int iSites(void) { return _isites; };
 | 
			
		||||
    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
 | 
			
		||||
    inline int oSites(void) { return _osites; };
 | 
			
		||||
    inline int lSites(void) { return _isites*_osites; }; 
 | 
			
		||||
    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
 | 
			
		||||
    inline int Nd    (void) { return _ndimension;};
 | 
			
		||||
    inline int iSites(void) const { return _isites; };
 | 
			
		||||
    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
 | 
			
		||||
    inline int oSites(void) const { return _osites; };
 | 
			
		||||
    inline int lSites(void) const { return _isites*_osites; }; 
 | 
			
		||||
    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
 | 
			
		||||
    inline int Nd    (void) const { return _ndimension;};
 | 
			
		||||
 | 
			
		||||
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
 | 
			
		||||
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
 | 
			
		||||
@@ -179,7 +182,10 @@ public:
 | 
			
		||||
    // Global addressing
 | 
			
		||||
    ////////////////////////////////////////////////////////////////
 | 
			
		||||
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
 | 
			
		||||
      CoorFromIndex(gcoor,gidx,_gdimensions);
 | 
			
		||||
      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
 | 
			
		||||
    }
 | 
			
		||||
    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
 | 
			
		||||
      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
 | 
			
		||||
    }
 | 
			
		||||
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
 | 
			
		||||
      gidx=0;
 | 
			
		||||
 
 | 
			
		||||
@@ -170,9 +170,15 @@ public:
 | 
			
		||||
	// Use a reduced simd grid
 | 
			
		||||
	_simd_layout[d] = simd_layout[d];
 | 
			
		||||
	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
 | 
			
		||||
	assert(_rdimensions[d]>0);
 | 
			
		||||
 | 
			
		||||
	// all elements of a simd vector must have same checkerboard.
 | 
			
		||||
	if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0); 
 | 
			
		||||
	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
 | 
			
		||||
	if ( _simd_layout[d]>1 ) {
 | 
			
		||||
	  if ( d != _checker_dim ) { 
 | 
			
		||||
	    assert( (_rdimensions[d]&0x1) == 0 );
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	_osites *= _rdimensions[d];
 | 
			
		||||
	_isites *= _simd_layout[d];
 | 
			
		||||
 
 | 
			
		||||
@@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef GRID_COMMS_MPI
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
#include <mpp/shmem.h>
 | 
			
		||||
#endif
 | 
			
		||||
namespace Grid {
 | 
			
		||||
class CartesianCommunicator {
 | 
			
		||||
  public:    
 | 
			
		||||
@@ -53,6 +56,8 @@ class CartesianCommunicator {
 | 
			
		||||
    typedef int CommsRequest_t;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    static void Init(int *argc, char ***argv);
 | 
			
		||||
 | 
			
		||||
    // Constructor
 | 
			
		||||
    CartesianCommunicator(const std::vector<int> &pdimensions_in);
 | 
			
		||||
 | 
			
		||||
@@ -81,6 +86,7 @@ class CartesianCommunicator {
 | 
			
		||||
    void GlobalSumVector(RealD *,int N);
 | 
			
		||||
 | 
			
		||||
    void GlobalSum(uint32_t &);
 | 
			
		||||
    void GlobalSum(uint64_t &);
 | 
			
		||||
 | 
			
		||||
    void GlobalSum(ComplexF &c)
 | 
			
		||||
    {
 | 
			
		||||
@@ -115,12 +121,11 @@ class CartesianCommunicator {
 | 
			
		||||
			int recv_from_rank,
 | 
			
		||||
			int bytes);
 | 
			
		||||
 | 
			
		||||
    void RecvFrom(void *recv,
 | 
			
		||||
		  int recv_from_rank,
 | 
			
		||||
		  int bytes);
 | 
			
		||||
    void SendTo(void *xmit,
 | 
			
		||||
		int xmit_to_rank,
 | 
			
		||||
		int bytes);
 | 
			
		||||
    void SendRecvPacket(void *xmit,
 | 
			
		||||
			void *recv,
 | 
			
		||||
			int xmit_to_rank,
 | 
			
		||||
			int recv_from_rank,
 | 
			
		||||
			int bytes);
 | 
			
		||||
 | 
			
		||||
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
			 void *xmit,
 | 
			
		||||
 
 | 
			
		||||
@@ -31,6 +31,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  // Should error check all MPI calls.
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  int flag;
 | 
			
		||||
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
 | 
			
		||||
  if ( !flag ) {
 | 
			
		||||
    MPI_Init(argc,argv);
 | 
			
		||||
  }
 | 
			
		||||
}
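// The MPI_Initialized guard above lets Grid coexist with an application (or another
// library) that has already initialised MPI. The same pattern in isolation, using only
// standard MPI calls:
#include <mpi.h>

static void init_mpi_once(int *argc, char ***argv){
  int flag = 0;
  MPI_Initialized(&flag);          // legal to call before MPI_Init
  if (!flag) MPI_Init(argc, argv);
}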
 | 
			
		||||
 | 
			
		||||
  int Rank(void) {
 | 
			
		||||
    int pe;
 | 
			
		||||
    MPI_Comm_rank(MPI_COMM_WORLD,&pe);
 | 
			
		||||
    return pe;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{
 | 
			
		||||
@@ -59,6 +72,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
@@ -108,21 +125,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
			
		||||
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
			
		||||
  SendToRecvFromComplete(reqs);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::RecvFrom(void *recv,
 | 
			
		||||
				     int from,
 | 
			
		||||
				     int bytes) 
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int sender,
 | 
			
		||||
					   int receiver,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Status stat;
 | 
			
		||||
  int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendTo(void *xmit,
 | 
			
		||||
				   int dest,
 | 
			
		||||
				   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int rank = _processor; // used for tag; must know who it comes from
 | 
			
		||||
  int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  assert(sender != receiver);
 | 
			
		||||
  int tag = sender;
 | 
			
		||||
  if ( _processor == sender ) {
 | 
			
		||||
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
 | 
			
		||||
  }
 | 
			
		||||
  if ( _processor == receiver ) { 
 | 
			
		||||
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include "Grid.h"
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char *** arv)
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int Rank(void ){ return 0; };
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{
 | 
			
		||||
  _processors = processors;
 | 
			
		||||
@@ -47,17 +53,14 @@ void CartesianCommunicator::GlobalSum(float &){}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &){}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint32_t &){}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint64_t &){}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(double *,int N){}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::RecvFrom(void *recv,
 | 
			
		||||
				     int recv_from_rank,
 | 
			
		||||
				     int bytes) 
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendTo(void *xmit,
 | 
			
		||||
				   int xmit_to_rank,
 | 
			
		||||
				   int bytes)
 | 
			
		||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int xmit_to_rank,
 | 
			
		||||
					   int recv_from_rank,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
 334  lib/communicator/Communicator_shmem.cc  Normal file
@@ -0,0 +1,334 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/communicator/Communicator_shmem.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include "Grid.h"
 | 
			
		||||
#include <mpp/shmem.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  // Should error check all MPI calls.
 | 
			
		||||
#define SHMEM_VET(addr) 
 | 
			
		||||
 | 
			
		||||
#define SHMEM_VET_DEBUG(addr) {				\
 | 
			
		||||
  if ( ! shmem_addr_accessible(addr,_processor) ) {\
 | 
			
		||||
    std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
 | 
			
		||||
    BACKTRACEFILE();		   \
 | 
			
		||||
  }\
 | 
			
		||||
}
 | 
			
		||||
int Rank(void) {
 | 
			
		||||
  return shmem_my_pe();
 | 
			
		||||
}
 | 
			
		||||
typedef struct HandShake_t { 
 | 
			
		||||
  uint64_t seq_local;
 | 
			
		||||
  uint64_t seq_remote;
 | 
			
		||||
} HandShake;
 | 
			
		||||
 | 
			
		||||
static Vector< HandShake > XConnections;
 | 
			
		||||
static Vector< HandShake > RConnections;
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  shmem_init();
 | 
			
		||||
  XConnections.resize(shmem_n_pes());
 | 
			
		||||
  RConnections.resize(shmem_n_pes());
 | 
			
		||||
  for(int pe =0 ; pe<shmem_n_pes();pe++){
 | 
			
		||||
    XConnections[pe].seq_local = 0;
 | 
			
		||||
    XConnections[pe].seq_remote= 0;
 | 
			
		||||
    RConnections[pe].seq_local = 0;
 | 
			
		||||
    RConnections[pe].seq_remote= 0;
 | 
			
		||||
  }
 | 
			
		||||
  shmem_barrier_all();
 | 
			
		||||
}
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
  std::vector<int> periodic(_ndimension,1);
 | 
			
		||||
 | 
			
		||||
  _Nprocessors=1;
 | 
			
		||||
  _processors = processors;
 | 
			
		||||
  _processor_coor.resize(_ndimension);
 | 
			
		||||
 | 
			
		||||
  _processor = shmem_my_pe();
 | 
			
		||||
  
 | 
			
		||||
  Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<_ndimension;i++){
 | 
			
		||||
    _Nprocessors*=_processors[i];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int Size = shmem_n_pes(); 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  assert(Size==_Nprocessors);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
			
		||||
  static long long source ;
 | 
			
		||||
  static long long dest   ;
 | 
			
		||||
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
			
		||||
  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
  //  int nreduce=1;
 | 
			
		||||
  //  int pestart=0;
 | 
			
		||||
  //  int logStride=0;
 | 
			
		||||
 | 
			
		||||
  source = u;
 | 
			
		||||
  dest   = 0;
 | 
			
		||||
  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
  shmem_barrier_all(); // necessary?
 | 
			
		||||
  u = dest;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
			
		||||
  static long long source ;
 | 
			
		||||
  static long long dest   ;
 | 
			
		||||
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
			
		||||
  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
  //  int nreduce=1;
 | 
			
		||||
  //  int pestart=0;
 | 
			
		||||
  //  int logStride=0;
 | 
			
		||||
 | 
			
		||||
  source = u;
 | 
			
		||||
  dest   = 0;
 | 
			
		||||
  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
  shmem_barrier_all(); // necessary?
 | 
			
		||||
  u = dest;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  static float source ;
 | 
			
		||||
  static float dest   ;
 | 
			
		||||
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
			
		||||
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
  source = f;
 | 
			
		||||
  dest   =0.0;
 | 
			
		||||
  shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
  f = dest;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
			
		||||
{
 | 
			
		||||
  static float source ;
 | 
			
		||||
  static float dest   = 0 ;
 | 
			
		||||
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
			
		||||
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
  if ( shmem_addr_accessible(f,_processor)  ){
 | 
			
		||||
    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    dest   =0.0;
 | 
			
		||||
    source = f[i];
 | 
			
		||||
    shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
    f[i] = dest;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  static double source;
 | 
			
		||||
  static double dest  ;
 | 
			
		||||
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
			
		||||
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
  source = d;
 | 
			
		||||
  dest   = 0;
 | 
			
		||||
  shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
  d = dest;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
			
		||||
{
 | 
			
		||||
  static double source ;
 | 
			
		||||
  static double dest   ;
 | 
			
		||||
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
			
		||||
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
 | 
			
		||||
  if ( shmem_addr_accessible(d,_processor)  ){
 | 
			
		||||
    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    source = d[i];
 | 
			
		||||
    dest   =0.0;
 | 
			
		||||
    shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
			
		||||
    d[i] = dest;
 | 
			
		||||
  }
 | 
			
		||||
}
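// All the reductions above follow the same OpenSHMEM idiom: the source and destination
// handed to *_sum_to_all must live in symmetric memory, hence the static scratch
// variables. A hedged sketch of that pattern for a single double, using the same calls
// as above (strictly, the pSync array should also be pre-set to _SHMEM_SYNC_VALUE before
// first use).
#include <mpp/shmem.h>

static double shmem_global_sum(double local, int npes){
  static double src, dst;                                  // symmetric (static) storage
  static double wrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long   sync[_SHMEM_REDUCE_SYNC_SIZE];
  src = local;
  dst = 0.0;
  shmem_double_sum_to_all(&dst, &src, 1, 0, 0, npes, wrk, sync);
  return dst;
}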
 | 
			
		||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<int> coor = _processor_coor;
 | 
			
		||||
 | 
			
		||||
  assert(std::abs(shift) <_processors[dim]);
 | 
			
		||||
 | 
			
		||||
  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
 | 
			
		||||
  Lexicographic::IndexFromCoor(coor,source,_processors);
 | 
			
		||||
 | 
			
		||||
  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
 | 
			
		||||
  Lexicographic::IndexFromCoor(coor,dest,_processors);
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  int rank;
 | 
			
		||||
  Lexicographic::IndexFromCoor(coor,rank,_processors);
 | 
			
		||||
  return rank;
 | 
			
		||||
}
 | 
			
		||||
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  Lexicographic::CoorFromIndex(coor,rank,_processors);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
			
		||||
					   int dest,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int from,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  SHMEM_VET(xmit);
 | 
			
		||||
  SHMEM_VET(recv);
 | 
			
		||||
  std::vector<CommsRequest_t> reqs(0);
 | 
			
		||||
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
			
		||||
  SendToRecvFromComplete(reqs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int sender,
 | 
			
		||||
					   int receiver,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  static uint64_t seq;
 | 
			
		||||
 | 
			
		||||
  assert(recv!=xmit);
 | 
			
		||||
  volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
 | 
			
		||||
  volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
 | 
			
		||||
 | 
			
		||||
  if ( _processor == sender ) {
 | 
			
		||||
 | 
			
		||||
    printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
 | 
			
		||||
    // Check he has posted a receive
 | 
			
		||||
    while(SendSeq->seq_remote == SendSeq->seq_local);
 | 
			
		||||
 | 
			
		||||
    printf("Sender receive %d posted\n",sender,receiver);
 | 
			
		||||
 | 
			
		||||
    // Advance our send count
 | 
			
		||||
    seq = ++(SendSeq->seq_local);
 | 
			
		||||
    
 | 
			
		||||
    // Send this packet 
 | 
			
		||||
    SHMEM_VET(recv);
 | 
			
		||||
    shmem_putmem(recv,xmit,bytes,receiver);
 | 
			
		||||
    shmem_fence();
 | 
			
		||||
 | 
			
		||||
    printf("Sender sent payload %d\n",seq);
 | 
			
		||||
    //Notify him we're done
 | 
			
		||||
    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
 | 
			
		||||
    shmem_fence();
 | 
			
		||||
    printf("Sender ringing door bell  %d\n",seq);
 | 
			
		||||
  }
 | 
			
		||||
  if ( _processor == receiver ) {
 | 
			
		||||
 | 
			
		||||
    printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
 | 
			
		||||
    // Post a receive
 | 
			
		||||
    seq = ++(RecvSeq->seq_local);
 | 
			
		||||
    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
 | 
			
		||||
 | 
			
		||||
    printf("Receiver Opening letter box %d\n",seq);
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    // Now wait until he has advanced our reception counter
 | 
			
		||||
    while(RecvSeq->seq_remote != RecvSeq->seq_local);
 | 
			
		||||
 | 
			
		||||
    printf("Receiver Got the mail %d\n",seq);
 | 
			
		||||
  }
 | 
			
		||||
}
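// The point-to-point path above is a one-sided rendezvous built from two monotonically
// increasing sequence counters per connection; a hedged outline of the protocol (the
// real code keeps the counters in the symmetric XConnections/RConnections vectors):
//
//   receiver:  seq = ++recv.seq_local;                          // open the letter box
//              shmem_putmem(seq -> sender's send.seq_remote);
//              spin while recv.seq_remote != recv.seq_local;    // until the payload lands
//
//   sender:    spin while send.seq_remote == send.seq_local;    // until a receive is posted
//              seq = ++send.seq_local;
//              shmem_putmem(payload -> receiver); shmem_fence();
//              shmem_putmem(seq -> receiver's recv.seq_remote); // ring the door bell
//
// The fence orders the payload put before the counter put to the same target PE, which
// is what makes the receiver's final spin a valid completion test.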
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
						void *recv,
 | 
			
		||||
						int from,
 | 
			
		||||
						int bytes)
 | 
			
		||||
{
 | 
			
		||||
  SHMEM_VET(xmit);
 | 
			
		||||
  SHMEM_VET(recv);
 | 
			
		||||
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
 | 
			
		||||
  shmem_putmem(recv,xmit,bytes,dest);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  //  shmem_quiet();      // I'm done
 | 
			
		||||
  shmem_barrier_all();// He's done too
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::Barrier(void)
 | 
			
		||||
{
 | 
			
		||||
  shmem_barrier_all();
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
  static uint32_t word;
 | 
			
		||||
  uint32_t *array = (uint32_t *) data;
 | 
			
		||||
  assert( (bytes % 4)==0);
 | 
			
		||||
  int words = bytes/4;
 | 
			
		||||
 | 
			
		||||
  if ( shmem_addr_accessible(data,_processor)  ){
 | 
			
		||||
    shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    word = array[w];
 | 
			
		||||
    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
 | 
			
		||||
    if ( shmem_my_pe() != root ) {
 | 
			
		||||
      array[w] = word;
 | 
			
		||||
    }
 | 
			
		||||
    shmem_barrier_all();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
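// When the user buffer is not symmetric, the broadcast above bounces the data one 32-bit
// word at a time through a static (symmetric) scratch word. A hedged sketch of that
// fallback with the same OpenSHMEM broadcast call (pSync sizing is an assumption of the
// installed SHMEM headers):
#include <mpp/shmem.h>
#include <stdint.h>

static void broadcast_words(int root, uint32_t *array, int words){
  static long     psync[_SHMEM_BCAST_SYNC_SIZE];            // broadcast-sized pSync
  static uint32_t word;                                      // symmetric bounce buffer
  for (int w=0; w<words; w++){
    word = array[w];
    shmem_broadcast32(&word, &word, 1, root, 0, 0, shmem_n_pes(), psync);
    if (shmem_my_pe() != root) array[w] = word;
    shmem_barrier_all();
  }
}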
 | 
			
		||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
			
		||||
  static uint32_t word;
 | 
			
		||||
  uint32_t *array = (uint32_t *) data;
 | 
			
		||||
  assert( (bytes % 4)==0);
 | 
			
		||||
  int words = bytes/4;
 | 
			
		||||
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    word = array[w];
 | 
			
		||||
    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
 | 
			
		||||
    if ( shmem_my_pe() != root ) {
 | 
			
		||||
      array[w]= word;
 | 
			
		||||
    }
 | 
			
		||||
    shmem_barrier_all();
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -35,7 +35,7 @@ class SimpleCompressor {
 | 
			
		||||
public:
 | 
			
		||||
  void Point(int) {};
 | 
			
		||||
 | 
			
		||||
  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
 | 
			
		||||
  vobj operator() (const vobj &arg) {
 | 
			
		||||
    return arg;
 | 
			
		||||
  }
 | 
			
		||||
};
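// The change above reduces the compressor call to a pure per-element transform. A hedged
// sketch of an alternative compressor obeying the same single-argument interface
// (illustrative only; the narrowing type "half" and its conversion are assumptions):
template <class vobj, class half>
class TruncatingCompressor {
public:
  void Point(int) {}
  half operator()(const vobj &arg) {
    return half(arg);                 // placeholder narrowing conversion
  }
};
// Gather_plane_simple then only ever evaluates buffer[...] = compress(rhs._odata[...]).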
 | 
			
		||||
@@ -56,24 +56,24 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
 | 
			
		||||
  
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
 | 
			
		||||
  int stride=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  if ( cbmask == 0x3 ) { 
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int o  = n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
	int bo = n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
	buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
			
		||||
	int o  = n*stride;
 | 
			
		||||
	int bo = n*e2;
 | 
			
		||||
	buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
     int bo=0;
 | 
			
		||||
     for(int n=0;n<e1;n++){
 | 
			
		||||
       for(int b=0;b<e2;b++){
 | 
			
		||||
	 int o  = n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
	 int o  = n*stride;
 | 
			
		||||
	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
	 if ( ocb &cbmask ) {
 | 
			
		||||
	   buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
			
		||||
	   buffer[off+bo++]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
	 }
 | 
			
		||||
       }
 | 
			
		||||
     }
 | 
			
		||||
@@ -97,16 +97,16 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
  
 | 
			
		||||
  int n1=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  int n2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
  if ( cbmask ==0x3){
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
 | 
			
		||||
	int o=n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
	int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
 | 
			
		||||
	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
			
		||||
	int o      =   n*n1;
 | 
			
		||||
	int offset = b+n*n2;
 | 
			
		||||
	cobj temp =compress(rhs._odata[so+o+b]);
 | 
			
		||||
	extract<cobj>(temp,pointers,offset);
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
@@ -121,7 +121,7 @@ PARALLEL_NESTED_LOOP2
 | 
			
		||||
	int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
			
		||||
	  cobj temp =compress(rhs._odata[so+o+b]);
 | 
			
		||||
	  extract<cobj>(temp,pointers,offset);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
@@ -243,13 +243,13 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
 | 
			
		||||
  int stride = rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  if(cbmask == 0x3 ){
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
 
 | 
			
		||||
        int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
			
		||||
        int o =n*stride+b;
 | 
			
		||||
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
			
		||||
	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 | 
			
		||||
      }
 | 
			
		||||
@@ -259,7 +259,7 @@ PARALLEL_NESTED_LOOP2
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
 
 | 
			
		||||
        int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
			
		||||
        int o =n*stride+b;
 | 
			
		||||
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
 | 
			
		||||
        if ( ocb&cbmask ) {
 | 
			
		||||
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
			
		||||
@@ -285,11 +285,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block [dimension];
 | 
			
		||||
  int stride = rhs._grid->_slice_stride[dimension];
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
  for(int n=0;n<e1;n++){
 | 
			
		||||
  for(int b=0;b<e2;b++){
 | 
			
		||||
 | 
			
		||||
      int o  =n*rhs._grid->_slice_stride[dimension];
 | 
			
		||||
      int o  =n*stride;
 | 
			
		||||
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
      if ( ocb&cbmask ) {
 | 
			
		||||
	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
 | 
			
		||||
@@ -323,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
			
		||||
  int rd = grid->_rdimensions[dimension];
 | 
			
		||||
  int ld = grid->_ldimensions[dimension];
 | 
			
		||||
  int gd = grid->_gdimensions[dimension];
 | 
			
		||||
  int ly = grid->_simd_layout[dimension];
 | 
			
		||||
 | 
			
		||||
  // Map to always positive shift modulo global full dimension.
 | 
			
		||||
  shift = (shift+fd)%fd;
 | 
			
		||||
@@ -331,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
			
		||||
  // the permute type
 | 
			
		||||
  int permute_dim =grid->PermuteDim(dimension);
 | 
			
		||||
  int permute_type=grid->PermuteType(dimension);
 | 
			
		||||
  int permute_type_dist;
 | 
			
		||||
 | 
			
		||||
  for(int x=0;x<rd;x++){       
 | 
			
		||||
 | 
			
		||||
@@ -342,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
			
		||||
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
			
		||||
    int sx     = (x+sshift)%rd;
 | 
			
		||||
 | 
			
		||||
    // FIXME : This must change where we have a 
 | 
			
		||||
    // Rotate slice.
 | 
			
		||||
    
 | 
			
		||||
    // Document how this works; why didn't I do this when I first wrote it...
    // wrap is how many times sshift wraps past rd (integer sshift/rd).
    //  num is sshift mod rd.
 | 
			
		||||
    // 
 | 
			
		||||
    int permute_slice=0;
 | 
			
		||||
    if(permute_dim){
 | 
			
		||||
      int wrap = sshift/rd;
 | 
			
		||||
      int  num = sshift%rd;
 | 
			
		||||
 | 
			
		||||
      if ( x< rd-num ) permute_slice=wrap;
 | 
			
		||||
      else permute_slice = 1-wrap;
 | 
			
		||||
      else permute_slice = (wrap+1)%ly;
 | 
			
		||||
 | 
			
		||||
      if ( (ly>2) && (permute_slice) ) {
 | 
			
		||||
	assert(permute_type & RotateBit);
 | 
			
		||||
	permute_type_dist = permute_type|permute_slice;
 | 
			
		||||
      } else {
 | 
			
		||||
	permute_type_dist = permute_type;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
 | 
			
		||||
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
 | 
			
		||||
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
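// The wrap/num bookkeeping above, restated as a free function (hedged, illustrative).
// For the slice at reduced coordinate x, shifted by sshift, with ly SIMD lanes in this
// direction:
inline int permute_slice_of(int x, int sshift, int rd, int ly){
  int wrap = sshift / rd;                // how many reduced volumes the shift crosses
  int num  = sshift % rd;
  if (x < rd - num) return wrap;         // stays within the same lane
  return (wrap + 1) % ly;                // crosses a lane boundary: one rotation further
}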
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -191,8 +191,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 | 
			
		||||
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
 | 
			
		||||
  int words = sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
			
		||||
  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
			
		||||
  std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
 | 
			
		||||
  std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
 | 
			
		||||
 | 
			
		||||
  int bytes = buffer_size*sizeof(scalar_object);
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_object *>  pointers(Nsimd); // 
 | 
			
		||||
 
 | 
			
		||||
@@ -55,7 +55,13 @@ extern int GridCshiftPermuteMap[4][16];
 | 
			
		||||
// Basic expressions used in Expression Template
 | 
			
		||||
////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
class LatticeBase {};
 | 
			
		||||
class LatticeBase
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    virtual ~LatticeBase(void) = default;
 | 
			
		||||
    GridBase *_grid;
 | 
			
		||||
};
 | 
			
		||||
    
 | 
			
		||||
class LatticeExpressionBase {};
 | 
			
		||||
 | 
			
		||||
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
 | 
			
		||||
@@ -88,8 +94,6 @@ template<class vobj>
 | 
			
		||||
class Lattice : public LatticeBase
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
    GridBase *_grid;
 | 
			
		||||
    int checkerboard;
 | 
			
		||||
    Vector<vobj> _odata;
 | 
			
		||||
    
 | 
			
		||||
@@ -177,8 +181,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  }
 | 
			
		||||
  //GridFromExpression is tricky to do
 | 
			
		||||
  template<class Op,class T1>
 | 
			
		||||
    Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
 | 
			
		||||
 | 
			
		||||
    Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
 | 
			
		||||
    _grid = nullptr;
 | 
			
		||||
    GridFromExpression(_grid,expr);
 | 
			
		||||
    assert(_grid!=nullptr);
 | 
			
		||||
 | 
			
		||||
@@ -199,7 +203,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class Op,class T1, class T2>
 | 
			
		||||
  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
 | 
			
		||||
  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
 | 
			
		||||
    _grid = nullptr;
 | 
			
		||||
    GridFromExpression(_grid,expr);
 | 
			
		||||
    assert(_grid!=nullptr);
 | 
			
		||||
 | 
			
		||||
@@ -220,7 +225,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class Op,class T1, class T2, class T3>
 | 
			
		||||
  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
 | 
			
		||||
  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
 | 
			
		||||
    _grid = nullptr;
 | 
			
		||||
    GridFromExpression(_grid,expr);
 | 
			
		||||
    assert(_grid!=nullptr);
 | 
			
		||||
 | 
			
		||||
@@ -240,7 +246,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    // Constructor requires "grid" passed.
 | 
			
		||||
    // what about a default grid?
 | 
			
		||||
    //////////////////////////////////////////////////////////////////
 | 
			
		||||
    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
 | 
			
		||||
    Lattice(GridBase *grid) : _odata(grid->oSites()) {
 | 
			
		||||
        _grid = grid;
 | 
			
		||||
    //        _odata.reserve(_grid->oSites());
 | 
			
		||||
    //        _odata.resize(_grid->oSites());
 | 
			
		||||
    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
 | 
			
		||||
@@ -248,6 +255,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
        checkerboard=0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    virtual ~Lattice(void) = default;
 | 
			
		||||
    
 | 
			
		||||
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
 
 | 
			
		||||
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    // Peek a scalar object from the SIMD array
 | 
			
		||||
    //////////////////////////////////////////////////////////
 | 
			
		||||
    template<class vobj,class sobj>
 | 
			
		||||
    void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
 | 
			
		||||
    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
 | 
			
		||||
        
 | 
			
		||||
      GridBase *grid=l._grid;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -152,7 +152,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
			
		||||
  assert(grid!=NULL);
 | 
			
		||||
 | 
			
		||||
  // FIXME
 | 
			
		||||
  std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
			
		||||
  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
@@ -178,7 +178,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
    GridBase::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
    int r = coor[orthogdim];
 | 
			
		||||
    lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
			
		||||
  }  
 | 
			
		||||
 
 | 
			
		||||
@@ -75,7 +75,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    std::seed_seq src;
 | 
			
		||||
    
 | 
			
		||||
    fixedSeed(std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
 | 
			
		||||
    fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
 | 
			
		||||
 | 
			
		||||
    result_type operator () (void){
 | 
			
		||||
 | 
			
		||||
@@ -119,9 +119,10 @@ namespace Grid {
 | 
			
		||||
    typedef uint32_t     RngStateType;
 | 
			
		||||
    static const int     RngStateCount = std::mt19937::state_size;
 | 
			
		||||
#endif
 | 
			
		||||
    std::vector<RngEngine>             _generators;
 | 
			
		||||
    std::vector<std::uniform_real_distribution<RealD> > _uniform;
 | 
			
		||||
    std::vector<std::normal_distribution<RealD> >       _gaussian;
 | 
			
		||||
    std::vector<RngEngine>                             _generators;
 | 
			
		||||
    std::vector<std::uniform_real_distribution<RealD>> _uniform;
 | 
			
		||||
    std::vector<std::normal_distribution<RealD>>       _gaussian;
 | 
			
		||||
    std::vector<std::discrete_distribution<int32_t>>     _bernoulli;
 | 
			
		||||
 | 
			
		||||
    void GetState(std::vector<RngStateType> & saved,int gen) {
 | 
			
		||||
      saved.resize(RngStateCount);
 | 
			
		||||
@@ -161,6 +162,7 @@ namespace Grid {
 | 
			
		||||
      _generators.resize(1);
 | 
			
		||||
      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
 | 
			
		||||
      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
 | 
			
		||||
      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
 | 
			
		||||
      _seeded=0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -242,7 +244,7 @@ namespace Grid {
 | 
			
		||||
      std::random_device rd;
 | 
			
		||||
      Seed(rd);
 | 
			
		||||
    }
 | 
			
		||||
    void SeedFixedIntegers(std::vector<int> &seeds){
 | 
			
		||||
    void SeedFixedIntegers(const std::vector<int> &seeds){
 | 
			
		||||
      fixedSeed src(seeds);
 | 
			
		||||
      Seed(src);
 | 
			
		||||
    }
 | 
			
		||||
@@ -266,6 +268,7 @@ namespace Grid {
 | 
			
		||||
      _generators.resize(_vol);
 | 
			
		||||
      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
 | 
			
		||||
      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
 | 
			
		||||
      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
 | 
			
		||||
      _seeded=0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -354,7 +357,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      std::random_device rd;
 | 
			
		||||
      Seed(rd);
 | 
			
		||||
    }
 | 
			
		||||
    void SeedFixedIntegers(std::vector<int> &seeds){
 | 
			
		||||
    void SeedFixedIntegers(const std::vector<int> &seeds){
 | 
			
		||||
      fixedSeed src(seeds);
 | 
			
		||||
      Seed(src);
 | 
			
		||||
    }
 | 
			
		||||
@@ -369,13 +372,21 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    rng.fill(l,rng._gaussian);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
 | 
			
		||||
    rng.fill(l,rng._bernoulli);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
 | 
			
		||||
    rng.fill(l,rng._uniform);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
 | 
			
		||||
    rng.fill(l,rng._gaussian);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
 | 
			
		||||
    rng.fill(l,rng._bernoulli);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -115,9 +115,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
    int sc;
 | 
			
		||||
    std::vector<int> coor_c(_ndimension);
 | 
			
		||||
    std::vector<int> coor_f(_ndimension);
 | 
			
		||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
			
		||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    for(int i=0;i<nbasis;i++) {
 | 
			
		||||
      
 | 
			
		||||
@@ -160,9 +160,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    std::vector<int> coor_c(_ndimension);
 | 
			
		||||
    std::vector<int> coor_f(_ndimension);
 | 
			
		||||
 | 
			
		||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
			
		||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    // z = A x + y
 | 
			
		||||
    fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
 | 
			
		||||
@@ -225,9 +225,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
			
		||||
    std::vector<int> coor_c(_ndimension);
 | 
			
		||||
    std::vector<int> coor_f(_ndimension);
 | 
			
		||||
 | 
			
		||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
			
		||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
 | 
			
		||||
 | 
			
		||||
@@ -311,9 +311,9 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
    std::vector<int> coor_c(_ndimension);
 | 
			
		||||
    std::vector<int> coor_f(_ndimension);
 | 
			
		||||
 | 
			
		||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
			
		||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    for(int i=0;i<nbasis;i++) {
 | 
			
		||||
      if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
 | 
			
		||||
@@ -325,6 +325,126 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 | 
			
		||||
// Simd layouts need not match since we use peek/poke Local
 | 
			
		||||
template<class vobj,class vvobj>
 | 
			
		||||
void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vvobj::scalar_object ssobj;
 | 
			
		||||
 | 
			
		||||
  sobj s;
 | 
			
		||||
  ssobj ss;
 | 
			
		||||
 | 
			
		||||
  GridBase *ig = in._grid;
 | 
			
		||||
  GridBase *og = out._grid;
 | 
			
		||||
 | 
			
		||||
  int ni = ig->_ndimension;
 | 
			
		||||
  int no = og->_ndimension;
 | 
			
		||||
 | 
			
		||||
  assert(ni == no);
 | 
			
		||||
 | 
			
		||||
  for(int d=0;d<no;d++){
 | 
			
		||||
    assert(ig->_processors[d]  == og->_processors[d]);
 | 
			
		||||
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<ig->lSites();idx++){
 | 
			
		||||
    std::vector<int> lcoor(ni);
 | 
			
		||||
    ig->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    peekLocalSite(s,in,lcoor);
 | 
			
		||||
    ss=s;
 | 
			
		||||
    pokeLocalSite(ss,out,lcoor);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
  int nl = lg->_ndimension;
 | 
			
		||||
  int nh = hg->_ndimension;
 | 
			
		||||
 | 
			
		||||
  assert(nl+1 == nh);
 | 
			
		||||
  assert(orthog<nh);
 | 
			
		||||
  assert(orthog>=0);
 | 
			
		||||
  assert(hg->_processors[orthog]==1);
 | 
			
		||||
 | 
			
		||||
  int dl; dl = 0;
 | 
			
		||||
  for(int d=0;d<nh;d++){
 | 
			
		||||
    if ( d != orthog) {
 | 
			
		||||
      assert(lg->_processors[dl]  == hg->_processors[d]);
 | 
			
		||||
      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
 | 
			
		||||
      dl++;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    dl=0;
 | 
			
		||||
    hcoor[orthog] = slice;
 | 
			
		||||
    for(int d=0;d<nh;d++){
 | 
			
		||||
      if ( d!=orthog ) { 
 | 
			
		||||
	hcoor[d]=lcoor[dl++];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    peekLocalSite(s,lowDim,lcoor);
 | 
			
		||||
    pokeLocalSite(s,higherDim,hcoor);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
  int nl = lg->_ndimension;
 | 
			
		||||
  int nh = hg->_ndimension;
 | 
			
		||||
 | 
			
		||||
  assert(nl+1 == nh);
 | 
			
		||||
  assert(orthog<nh);
 | 
			
		||||
  assert(orthog>=0);
 | 
			
		||||
  assert(hg->_processors[orthog]==1);
 | 
			
		||||
 | 
			
		||||
  int dl; dl = 0;
 | 
			
		||||
  for(int d=0;d<nh;d++){
 | 
			
		||||
    if ( d != orthog) {
 | 
			
		||||
      assert(lg->_processors[dl]  == hg->_processors[d]);
 | 
			
		||||
      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
 | 
			
		||||
      dl++;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    dl=0;
 | 
			
		||||
    hcoor[orthog] = slice;
 | 
			
		||||
    for(int d=0;d<nh;d++){
 | 
			
		||||
      if ( d!=orthog ) { 
 | 
			
		||||
	hcoor[d]=lcoor[dl++];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    peekLocalSite(s,higherDim,hcoor);
 | 
			
		||||
    pokeLocalSite(s,lowDim,lcoor);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 | 
			
		||||
 
 | 
			
		||||
@@ -146,7 +146,7 @@ class BinaryIO {
 | 
			
		||||
    csum = 0;
 | 
			
		||||
    std::vector<int> lcoor;
 | 
			
		||||
    for(int l=0;l<grid->lSites();l++){
 | 
			
		||||
      grid->CoorFromIndex(lcoor,l,grid->_ldimensions);
 | 
			
		||||
      Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions);
 | 
			
		||||
      peekLocalSite(siteObj,lat,lcoor);
 | 
			
		||||
      munge(siteObj,fileObj,csum);
 | 
			
		||||
    }
 | 
			
		||||
@@ -168,6 +168,7 @@ class BinaryIO {
 | 
			
		||||
    GridBase *grid = Umu._grid;
 | 
			
		||||
 | 
			
		||||
    std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
 | 
			
		||||
    GridStopWatch timer; timer.Start();
 | 
			
		||||
 | 
			
		||||
    int ieee32big = (format == std::string("IEEE32BIG"));
 | 
			
		||||
    int ieee32    = (format == std::string("IEEE32"));
 | 
			
		||||
@@ -182,6 +183,7 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
    Umu = zero;
 | 
			
		||||
    uint32_t csum=0;
 | 
			
		||||
    uint64_t bytes=0;
 | 
			
		||||
    fobj file_object;
 | 
			
		||||
    sobj munged;
 | 
			
		||||
    
 | 
			
		||||
@@ -194,7 +196,7 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
      if ( grid->IsBoss() ) {
 | 
			
		||||
	fin.read((char *)&file_object,sizeof(file_object));
 | 
			
		||||
	
 | 
			
		||||
	bytes += sizeof(file_object);
 | 
			
		||||
	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
@@ -205,6 +207,10 @@ class BinaryIO {
 | 
			
		||||
      // The boss who read the file has their value poked
 | 
			
		||||
      pokeSite(munged,Umu,site);
 | 
			
		||||
    }}}}
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -224,13 +230,14 @@ class BinaryIO {
 | 
			
		||||
    // Serialise through node zero
 | 
			
		||||
    //////////////////////////////////////////////////
 | 
			
		||||
    std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
 | 
			
		||||
    GridStopWatch timer; timer.Start();
 | 
			
		||||
 | 
			
		||||
    std::ofstream fout;
 | 
			
		||||
    if ( grid->IsBoss() ) {
 | 
			
		||||
      fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
			
		||||
      fout.seekp(offset);
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    uint64_t bytes=0;
 | 
			
		||||
    uint32_t csum=0;
 | 
			
		||||
    fobj file_object;
 | 
			
		||||
    sobj unmunged;
 | 
			
		||||
@@ -253,9 +260,14 @@ class BinaryIO {
 | 
			
		||||
	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 | 
			
		||||
 | 
			
		||||
	// NB could gather an xstrip as an optimisation.
 | 
			
		||||
	fout.write((char *)&file_object,sizeof(file_object));
 | 
			
		||||
	bytes+=sizeof(file_object);
 | 
			
		||||
      }
 | 
			
		||||
    }}}}
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
@@ -265,6 +277,7 @@ class BinaryIO {
 | 
			
		||||
    typedef typename GridSerialRNG::RngStateType RngStateType;
 | 
			
		||||
    const int RngStateCount = GridSerialRNG::RngStateCount;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    GridBase *grid = parallel._grid;
 | 
			
		||||
    int gsites = grid->_gsites;
 | 
			
		||||
 | 
			
		||||
@@ -310,7 +323,7 @@ class BinaryIO {
 | 
			
		||||
      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
			
		||||
      fout.write((char *)&saved[0],bytes);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    grid->Broadcast(0,(void *)&csum,sizeof(csum));
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
  static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG ¶llel,std::string file,int offset)
 | 
			
		||||
@@ -360,6 +373,8 @@ class BinaryIO {
 | 
			
		||||
      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    grid->Broadcast(0,(void *)&csum,sizeof(csum));
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -398,7 +413,7 @@ class BinaryIO {
 | 
			
		||||
    int IOnode = 1;
 | 
			
		||||
    for(int d=0;d<grid->_ndimension;d++) {
 | 
			
		||||
 | 
			
		||||
      if ( d==0 ) parallel[d] = 0;
 | 
			
		||||
      if ( d == 0 ) parallel[d] = 0;
 | 
			
		||||
      if (parallel[d]) {
 | 
			
		||||
	range[d] = grid->_ldimensions[d];
 | 
			
		||||
	start[d] = grid->_processor_coor[d]*range[d];
 | 
			
		||||
@@ -426,6 +441,9 @@ class BinaryIO {
 | 
			
		||||
      std::cout << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    GridStopWatch timer; timer.Start();
 | 
			
		||||
    uint64_t bytes=0;
 | 
			
		||||
 | 
			
		||||
    int myrank = grid->ThisRank();
 | 
			
		||||
    int iorank = grid->RankFromProcessorCoor(ioproc);
 | 
			
		||||
 | 
			
		||||
@@ -439,9 +457,9 @@ class BinaryIO {
 | 
			
		||||
    // available (how short sighted is that?)
 | 
			
		||||
    //////////////////////////////////////////////////////////
 | 
			
		||||
    Umu = zero;
 | 
			
		||||
    uint32_t csum=0;
 | 
			
		||||
    static uint32_t csum=0;
 | 
			
		||||
    fobj fileObj;
 | 
			
		||||
    sobj siteObj;
 | 
			
		||||
    static sobj siteObj; // Static to place in symmetric region for SHMEM
 | 
			
		||||
 | 
			
		||||
      // need to implement these loops in Nd independent way with a lexico conversion
 | 
			
		||||
    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
			
		||||
@@ -451,7 +469,7 @@ class BinaryIO {
 | 
			
		||||
      std::vector<int> lsite(nd);
 | 
			
		||||
      std::vector<int> iosite(nd);
 | 
			
		||||
 | 
			
		||||
      grid->CoorFromIndex(tsite,tlex,range);
 | 
			
		||||
      Lexicographic::CoorFromIndex(tsite,tlex,range);
 | 
			
		||||
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
			
		||||
@@ -461,7 +479,7 @@ class BinaryIO {
 | 
			
		||||
      /////////////////////////
 | 
			
		||||
      // Get the rank of owner of data
 | 
			
		||||
      /////////////////////////
 | 
			
		||||
	int rank, o_idx,i_idx, g_idx;
 | 
			
		||||
      int rank, o_idx,i_idx, g_idx;
 | 
			
		||||
      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
 | 
			
		||||
      grid->GlobalCoorToGlobalIndex(gsite,g_idx);
 | 
			
		||||
      
 | 
			
		||||
@@ -472,6 +490,7 @@ class BinaryIO {
 | 
			
		||||
	
 | 
			
		||||
	fin.seekg(offset+g_idx*sizeof(fileObj));
 | 
			
		||||
	fin.read((char *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	bytes+=sizeof(fileObj);
 | 
			
		||||
	
 | 
			
		||||
	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
			
		||||
@@ -480,22 +499,28 @@ class BinaryIO {
 | 
			
		||||
	
 | 
			
		||||
	munge(fileObj,siteObj,csum);
 | 
			
		||||
 | 
			
		||||
	if ( rank != myrank ) {
 | 
			
		||||
	  grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
 | 
			
		||||
	} else { 
 | 
			
		||||
	  pokeLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
	}
 | 
			
		||||
      }	
 | 
			
		||||
 | 
			
		||||
      } else { 
 | 
			
		||||
	if ( myrank == rank ) {
 | 
			
		||||
	  grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
 | 
			
		||||
	  pokeLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
      // Possibly do transport through pt2pt 
 | 
			
		||||
      if ( rank != iorank ) { 
 | 
			
		||||
	if ( (myrank == rank) || (myrank==iorank) ) {
 | 
			
		||||
	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      // Poke at destination
 | 
			
		||||
      if ( myrank == rank ) {
 | 
			
		||||
	  pokeLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
      }
 | 
			
		||||
      grid->Barrier(); // necessary?
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    grid->GlobalSum(csum);
 | 
			
		||||
    grid->GlobalSum(bytes);
 | 
			
		||||
    grid->Barrier();
 | 
			
		||||
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
@@ -530,7 +555,7 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
    for(int d=0;d<grid->_ndimension;d++) {
 | 
			
		||||
 | 
			
		||||
      if ( d==0 ) parallel[d] = 0;
 | 
			
		||||
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
 | 
			
		||||
 | 
			
		||||
      if (parallel[d]) {
 | 
			
		||||
	range[d] = grid->_ldimensions[d];
 | 
			
		||||
@@ -559,6 +584,9 @@ class BinaryIO {
 | 
			
		||||
      std::cout << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    GridStopWatch timer; timer.Start();
 | 
			
		||||
    uint64_t bytes=0;
 | 
			
		||||
 | 
			
		||||
    int myrank = grid->ThisRank();
 | 
			
		||||
    int iorank = grid->RankFromProcessorCoor(ioproc);
 | 
			
		||||
 | 
			
		||||
@@ -577,10 +605,10 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
    uint32_t csum=0;
 | 
			
		||||
    fobj fileObj;
 | 
			
		||||
    sobj siteObj;
 | 
			
		||||
    static sobj siteObj; // static for SHMEM target; otherwise dynamic allocate with AlignedAllocator
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      // need to implement these loops in Nd independent way with a lexico conversion
 | 
			
		||||
    // should aggregate a whole chunk and then write.
 | 
			
		||||
    // need to implement these loops in Nd independent way with a lexico conversion
 | 
			
		||||
    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
			
		||||
	
 | 
			
		||||
      std::vector<int> tsite(nd); // temporary mixed up site
 | 
			
		||||
@@ -588,7 +616,7 @@ class BinaryIO {
 | 
			
		||||
      std::vector<int> lsite(nd);
 | 
			
		||||
      std::vector<int> iosite(nd);
 | 
			
		||||
 | 
			
		||||
      grid->CoorFromIndex(tsite,tlex,range);
 | 
			
		||||
      Lexicographic::CoorFromIndex(tsite,tlex,range);
 | 
			
		||||
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
			
		||||
@@ -606,13 +634,21 @@ class BinaryIO {
 | 
			
		||||
      ////////////////////////////////
 | 
			
		||||
      // iorank writes from the seek
 | 
			
		||||
      ////////////////////////////////
 | 
			
		||||
      if (myrank == iorank) {
 | 
			
		||||
      
 | 
			
		||||
	if ( rank != myrank ) {
 | 
			
		||||
	  grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
 | 
			
		||||
	} else { 
 | 
			
		||||
	  peekLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
      // Owner of data peeks it
 | 
			
		||||
      peekLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
 | 
			
		||||
      // Pair of nodes may need to do pt2pt send
 | 
			
		||||
      if ( rank != iorank ) { // comms is necessary
 | 
			
		||||
	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
 | 
			
		||||
	  // Send to IOrank 
 | 
			
		||||
	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      grid->Barrier(); // necessary?
 | 
			
		||||
 | 
			
		||||
      if (myrank == iorank) {
 | 
			
		||||
	
 | 
			
		||||
	munge(siteObj,fileObj,csum);
 | 
			
		||||
 | 
			
		||||
@@ -623,17 +659,16 @@ class BinaryIO {
 | 
			
		||||
	
 | 
			
		||||
	fout.seekp(offset+g_idx*sizeof(fileObj));
 | 
			
		||||
	fout.write((char *)&fileObj,sizeof(fileObj));
 | 
			
		||||
 | 
			
		||||
      } else { 
 | 
			
		||||
	if ( myrank == rank ) {
 | 
			
		||||
	  peekLocalSite(siteObj,Umu,lsite);
 | 
			
		||||
	  grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
 | 
			
		||||
	} 
 | 
			
		||||
	bytes+=sizeof(fileObj);
 | 
			
		||||
      }
 | 
			
		||||
      grid->Barrier(); // necessary// or every 16 packets to rate throttle??
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    grid->GlobalSum(csum);
 | 
			
		||||
    grid->GlobalSum(bytes);
 | 
			
		||||
 | 
			
		||||
    timer.Stop();
 | 
			
		||||
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
			
		||||
	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
			
		||||
 | 
			
		||||
    return csum;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -213,37 +213,38 @@ class NerscIO : public BinaryIO {
 | 
			
		||||
  static inline void truncate(std::string file){
 | 
			
		||||
    std::ofstream fout(file,std::ios::out);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  #define dump_nersc_header(field, s)\
 | 
			
		||||
  s << "BEGIN_HEADER"      << std::endl;\
 | 
			
		||||
  s << "HDR_VERSION = "    << field.hdr_version    << std::endl;\
 | 
			
		||||
  s << "DATATYPE = "       << field.data_type      << std::endl;\
 | 
			
		||||
  s << "STORAGE_FORMAT = " << field.storage_format << std::endl;\
 | 
			
		||||
  for(int i=0;i<4;i++){\
 | 
			
		||||
    s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;\
 | 
			
		||||
  }\
 | 
			
		||||
  s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;\
 | 
			
		||||
  s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl;\
 | 
			
		||||
  for(int i=0;i<4;i++){\
 | 
			
		||||
    s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;\
 | 
			
		||||
  }\
 | 
			
		||||
  \
 | 
			
		||||
  s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;\
 | 
			
		||||
  s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;\
 | 
			
		||||
  s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;\
 | 
			
		||||
  s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;\
 | 
			
		||||
  s << "CREATOR = "         << field.creator          << std::endl;\
 | 
			
		||||
  s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;\
 | 
			
		||||
  s << "CREATION_DATE = "   << field.creation_date    << std::endl;\
 | 
			
		||||
  s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;\
 | 
			
		||||
  s << "FLOATING_POINT = "  << field.floating_point   << std::endl;\
 | 
			
		||||
  s << "END_HEADER"         << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  static inline unsigned int writeHeader(NerscField &field,std::string file)
 | 
			
		||||
  {
 | 
			
		||||
    std::ofstream fout(file,std::ios::out|std::ios::in);
 | 
			
		||||
  
 | 
			
		||||
    fout.seekp(0,std::ios::beg);
 | 
			
		||||
    fout << "BEGIN_HEADER"      << std::endl;
 | 
			
		||||
    fout << "HDR_VERSION = "    << field.hdr_version    << std::endl;
 | 
			
		||||
    fout << "DATATYPE = "       << field.data_type      << std::endl;
 | 
			
		||||
    fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
 | 
			
		||||
 | 
			
		||||
    for(int i=0;i<4;i++){
 | 
			
		||||
      fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
 | 
			
		||||
    }
 | 
			
		||||
    // just to keep the space and write it later
 | 
			
		||||
    fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
 | 
			
		||||
    fout << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl;
 | 
			
		||||
    for(int i=0;i<4;i++){
 | 
			
		||||
      fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;
 | 
			
		||||
 | 
			
		||||
    fout << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;
 | 
			
		||||
    fout << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;
 | 
			
		||||
    fout << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;
 | 
			
		||||
    fout << "CREATOR = "         << field.creator          << std::endl;
 | 
			
		||||
    fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
 | 
			
		||||
    fout << "CREATION_DATE = "   << field.creation_date    << std::endl;
 | 
			
		||||
    fout << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;
 | 
			
		||||
    fout << "FLOATING_POINT = "  << field.floating_point   << std::endl;
 | 
			
		||||
    fout << "END_HEADER"         << std::endl;
 | 
			
		||||
    dump_nersc_header(field, fout);
 | 
			
		||||
    field.data_start = fout.tellp();
 | 
			
		||||
    return field.data_start;
 | 
			
		||||
}
 | 
			
		||||
@@ -345,17 +346,17 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 | 
			
		||||
  if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
 | 
			
		||||
    if ( ieee32 || ieee32big ) {
 | 
			
		||||
      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
			
		||||
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
			
		||||
	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
			
		||||
	(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
 | 
			
		||||
    }
 | 
			
		||||
    if ( ieee64 || ieee64big ) {
 | 
			
		||||
      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
			
		||||
      //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
			
		||||
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
			
		||||
	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 | 
			
		||||
      	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 | 
			
		||||
    }
 | 
			
		||||
  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
 | 
			
		||||
  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
 | 
			
		||||
    if ( ieee32 || ieee32big ) {
 | 
			
		||||
      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
			
		||||
      //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
			
		||||
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
			
		||||
	(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
 | 
			
		||||
    }
 | 
			
		||||
@@ -372,6 +373,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 | 
			
		||||
 | 
			
		||||
  assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
 | 
			
		||||
  assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
 | 
			
		||||
 | 
			
		||||
  assert(csum == header.checksum );
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
 | 
			
		||||
@@ -419,6 +421,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
 | 
			
		||||
    std::string file1 = file+"para";
 | 
			
		||||
    int offset1 = writeHeader(header,file1);
 | 
			
		||||
    int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
 | 
			
		||||
    //int csum1=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
 | 
			
		||||
@@ -429,11 +432,12 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
    header.floating_point = std::string("IEEE64BIG");
 | 
			
		||||
    header.data_type      = std::string("4D_SU3_GAUGE_3X3");
 | 
			
		||||
    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
 | 
			
		||||
    NerscSimpleUnmunger<fobj3D,sobj> munge;
 | 
			
		||||
    BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
 | 
			
		||||
    offset = writeHeader(header,file);
 | 
			
		||||
    csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 | 
			
		||||
    //    csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 | 
			
		||||
    csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
 | 
			
		||||
@@ -507,6 +511,8 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel
 | 
			
		||||
  // munger is a function of <floating point, Real, data_type>
 | 
			
		||||
  uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
 | 
			
		||||
 | 
			
		||||
  std::cerr<<" Csum "<< csum << " "<< header.checksum <<std::endl;
 | 
			
		||||
 | 
			
		||||
  assert(csum == header.checksum );
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -90,7 +90,7 @@ namespace QCD {
 | 
			
		||||
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
 | 
			
		||||
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
 | 
			
		||||
 | 
			
		||||
    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 | 
			
		||||
    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 | 
			
		||||
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 | 
			
		||||
 | 
			
		||||
    // Spin matrix
 | 
			
		||||
@@ -383,7 +383,6 @@ namespace QCD {
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
    // Poke scalars
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
 | 
			
		||||
    {
 | 
			
		||||
      pokeIndex<SpinIndex>(lhs,rhs,i);
 | 
			
		||||
@@ -407,6 +406,40 @@ namespace QCD {
 | 
			
		||||
      pokeIndex<LorentzIndex>(lhs,rhs,i);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
    // Fermion <-> propagator assignements
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
    template <class Prop, class Ferm>
 | 
			
		||||
    void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
 | 
			
		||||
    {
 | 
			
		||||
        for(int j = 0; j < Ns; ++j)
 | 
			
		||||
        {
 | 
			
		||||
            auto pjs = peekSpin(p, j, s);
 | 
			
		||||
            auto fj  = peekSpin(f, j);
 | 
			
		||||
            
 | 
			
		||||
            for(int i = 0; i < Nc; ++i)
 | 
			
		||||
            {
 | 
			
		||||
                pokeColour(pjs, peekColour(fj, i), i, c);
 | 
			
		||||
            }
 | 
			
		||||
            pokeSpin(p, pjs, j, s);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    template <class Prop, class Ferm>
 | 
			
		||||
    void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
 | 
			
		||||
    {
 | 
			
		||||
        for(int j = 0; j < Ns; ++j)
 | 
			
		||||
        {
 | 
			
		||||
            auto pjs = peekSpin(p, j, s);
 | 
			
		||||
            auto fj  = peekSpin(f, j);
 | 
			
		||||
            
 | 
			
		||||
            for(int i = 0; i < Nc; ++i)
 | 
			
		||||
            {
 | 
			
		||||
                pokeColour(fj, peekColour(pjs, i, c), i);
 | 
			
		||||
            }
 | 
			
		||||
            pokeSpin(f, fj, j);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
    // transpose array and scalar
 | 
			
		||||
 
 | 
			
		||||
@@ -109,10 +109,12 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 | 
			
		||||
 | 
			
		||||
#define FermOpTemplateInstantiate(A) \
 | 
			
		||||
  template class A<WilsonImplF>;		\
 | 
			
		||||
  template class A<WilsonImplD>;  \
 | 
			
		||||
  template class A<WilsonImplD>;		\
 | 
			
		||||
  template class A<GparityWilsonImplF>;		\
 | 
			
		||||
  template class A<GparityWilsonImplD>;		
 | 
			
		||||
 | 
			
		||||
#define GparityFermOpTemplateInstantiate(A) 
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
// Fermion operators / actions
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
@@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 | 
			
		||||
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
			
		||||
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
			
		||||
 | 
			
		||||
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 | 
			
		||||
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 | 
			
		||||
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  }}
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 | 
			
		||||
 
 | 
			
		||||
@@ -527,6 +527,7 @@ namespace QCD {
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  FermOpTemplateInstantiate(CayleyFermion5D);
 | 
			
		||||
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -130,7 +130,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
			
		||||
      typedef WilsonImplParams ImplParams;
 | 
			
		||||
      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 | 
			
		||||
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
			
		||||
 | 
			
		||||
      ImplParams Params;
 | 
			
		||||
 | 
			
		||||
@@ -142,6 +142,10 @@ namespace Grid {
 | 
			
		||||
        mult(&phi(),&U(mu),&chi());
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      template<class ref>
 | 
			
		||||
      inline void loadLinkElement(Simd & reg,ref &memory){
 | 
			
		||||
	reg = memory;
 | 
			
		||||
      }
 | 
			
		||||
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
			
		||||
      {
 | 
			
		||||
        conformable(Uds._grid,GaugeGrid);
 | 
			
		||||
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ///////
 | 
			
		||||
    // Single flavour four spinors with colour index, 5d redblack
 | 
			
		||||
    ///////
 | 
			
		||||
    template<class S,int Nrepresentation=Nc>
 | 
			
		||||
    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 | 
			
		||||
 | 
			
		||||
      INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
      
 | 
			
		||||
      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
			
		||||
      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
 | 
			
		||||
      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
 | 
			
		||||
      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
 | 
			
		||||
      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
 | 
			
		||||
    
 | 
			
		||||
      typedef iImplSpinor    <Simd>           SiteSpinor;
 | 
			
		||||
      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
 | 
			
		||||
      typedef Lattice<SiteSpinor>             FermionField;
 | 
			
		||||
 | 
			
		||||
      // Make the doubled gauge field a *scalar*
 | 
			
		||||
      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
 | 
			
		||||
      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
 | 
			
		||||
      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
 | 
			
		||||
 | 
			
		||||
      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
 | 
			
		||||
 | 
			
		||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
			
		||||
      typedef WilsonImplParams ImplParams;
 | 
			
		||||
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
			
		||||
 | 
			
		||||
      ImplParams Params;
 | 
			
		||||
 | 
			
		||||
      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
			
		||||
 | 
			
		||||
      bool overlapCommsCompute(void) { return false; };
 | 
			
		||||
    
 | 
			
		||||
      template<class ref>
 | 
			
		||||
      inline void loadLinkElement(Simd & reg,ref &memory){
 | 
			
		||||
	vsplat(reg,memory);
 | 
			
		||||
      }
 | 
			
		||||
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
 | 
			
		||||
      {
 | 
			
		||||
	SiteGaugeLink UU;
 | 
			
		||||
	for(int i=0;i<Nrepresentation;i++){
 | 
			
		||||
	  for(int j=0;j<Nrepresentation;j++){
 | 
			
		||||
	    vsplat(UU()()(i,j),U(mu)()(i,j));
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
        mult(&phi(),&UU(),&chi());
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
			
		||||
      {
 | 
			
		||||
	SiteScalarGaugeField  ScalarUmu;
 | 
			
		||||
	SiteDoubledGaugeField ScalarUds;
 | 
			
		||||
 | 
			
		||||
        GaugeLinkField U   (Umu._grid);
 | 
			
		||||
	GaugeField     Uadj(Umu._grid);
 | 
			
		||||
        for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
  	  U = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
	  U = adj(Cshift(U,mu,-1));
 | 
			
		||||
	  PokeIndex<LorentzIndex>(Uadj,U,mu);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
 | 
			
		||||
	  std::vector<int> lcoor;
 | 
			
		||||
	  GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
 | 
			
		||||
 | 
			
		||||
	  peekLocalSite(ScalarUmu,Umu,lcoor);
 | 
			
		||||
	  for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
 | 
			
		||||
 | 
			
		||||
	  peekLocalSite(ScalarUmu,Uadj,lcoor);
 | 
			
		||||
	  for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
 | 
			
		||||
 | 
			
		||||
	  pokeLocalSite(ScalarUds,Uds,lcoor);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
	
 | 
			
		||||
      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }   
 | 
			
		||||
 | 
			
		||||
      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -205,7 +303,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
			
		||||
 | 
			
		||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
			
		||||
      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 | 
			
		||||
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
			
		||||
 | 
			
		||||
      typedef GparityWilsonImplParams ImplParams;
 | 
			
		||||
 | 
			
		||||
@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
	conformable(Uds._grid,GaugeGrid);
 | 
			
		||||
	conformable(Umu._grid,GaugeGrid);
 | 
			
		||||
	
 | 
			
		||||
	GaugeLinkField Utmp(GaugeGrid);
 | 
			
		||||
	GaugeLinkField U(GaugeGrid);
 | 
			
		||||
	GaugeLinkField Utmp (GaugeGrid);
 | 
			
		||||
	GaugeLinkField U    (GaugeGrid);
 | 
			
		||||
	GaugeLinkField Uconj(GaugeGrid);
 | 
			
		||||
	
 | 
			
		||||
	Lattice<iScalar<vInteger> > coor(GaugeGrid);
 | 
			
		||||
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 | 
			
		||||
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
 | 
			
		||||
    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
 | 
			
		||||
    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
 | 
			
		||||
 | 
			
		||||
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
 | 
			
		||||
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
 | 
			
		||||
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
@@ -48,14 +48,16 @@ namespace Grid {
 | 
			
		||||
			GridCartesian         &FourDimGrid,
 | 
			
		||||
			GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
			RealD _mass,RealD _M5,
 | 
			
		||||
			RealD scale) :
 | 
			
		||||
//			RealD scale):
 | 
			
		||||
			RealD scale,const ImplParams &p= ImplParams()) :
 | 
			
		||||
      
 | 
			
		||||
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
 | 
			
		||||
      MobiusFermion<Impl>(_Umu,
 | 
			
		||||
		    FiveDimGrid,
 | 
			
		||||
		    FiveDimRedBlackGrid,
 | 
			
		||||
		    FourDimGrid,
 | 
			
		||||
		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
 | 
			
		||||
	FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
 | 
			
		||||
//		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
 | 
			
		||||
      {
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -48,12 +48,7 @@ namespace QCD {
 | 
			
		||||
      mu=p;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
 | 
			
		||||
      return spinproject(in);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    SiteHalfSpinor spinproject(const SiteSpinor &in)
 | 
			
		||||
    {
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      int mudag=mu;
 | 
			
		||||
      if (!dag) {
 | 
			
		||||
@@ -92,6 +87,173 @@ namespace QCD {
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  /////////////////////////
 | 
			
		||||
  // optimised versions
 | 
			
		||||
  /////////////////////////
 | 
			
		||||
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonXpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjXp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonYpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjYp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonZpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjZp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonTpCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjTp(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonXmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjXm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonYmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjYm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonZmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjZm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class SiteHalfSpinor,class SiteSpinor>
 | 
			
		||||
  class WilsonTmCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
			
		||||
      SiteHalfSpinor ret;
 | 
			
		||||
      spProjTm(ret,in);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
    // Fast comms buffer manipulation which should inline right through (avoid direction
 | 
			
		||||
    // dependent logic that prevents inlining
 | 
			
		||||
  template<class vobj,class cobj>
 | 
			
		||||
  class WilsonStencil : public CartesianStencil<vobj,cobj> {
 | 
			
  public:

    WilsonStencil(GridBase *grid,
		int npoints,
		int checkerboard,
		const std::vector<int> &directions,
		const std::vector<int> &distances)  : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
      {    };

    template < class compressor>
    std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
      this->Mergers.resize(0); 
      this->Packets.resize(0);
      this->HaloGatherOpt(source,compress);
      return std::thread([&] { this->Communicate(); });
    }

    template < class compressor>
    void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
    {
      auto thr = this->HaloExchangeOptBegin(source,compress);
      this->HaloExchangeOptComplete(thr);
    }

    void HaloExchangeOptComplete(std::thread &thr) 
    {
      this->CommsMerge(); // spins
      this->jointime-=usecond();
      thr.join();
      this->jointime+=usecond();
    }

    template < class compressor>
    void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
    {
      // conformable(source._grid,_grid);
      assert(source._grid==this->_grid);
      this->halogtime-=usecond();

      assert (this->comm_buf.size() == this->_unified_buffer_size );
      this->u_comm_offset=0;

      int dag = compress.dag;
      static std::vector<int> dirs(Nd*2);
      for(int mu=0;mu<Nd;mu++){
	if ( dag ) {
	  dirs[mu]  =mu;
	  dirs[mu+4]=mu+Nd;
	} else { 
	  dirs[mu]  =mu+Nd;
	  dirs[mu+Nd]=mu;
	}
      }

      WilsonXpCompressor<cobj,vobj> XpCompress;
      this->HaloGatherDir(source,XpCompress,dirs[0]);

      WilsonYpCompressor<cobj,vobj> YpCompress;
      this->HaloGatherDir(source,YpCompress,dirs[1]);

      WilsonZpCompressor<cobj,vobj> ZpCompress;
      this->HaloGatherDir(source,ZpCompress,dirs[2]);

      WilsonTpCompressor<cobj,vobj> TpCompress;
      this->HaloGatherDir(source,TpCompress,dirs[3]);

      WilsonXmCompressor<cobj,vobj> XmCompress;
      this->HaloGatherDir(source,XmCompress,dirs[4]);

      WilsonYmCompressor<cobj,vobj> YmCompress;
      this->HaloGatherDir(source,YmCompress,dirs[5]);

      WilsonZmCompressor<cobj,vobj> ZmCompress;
      this->HaloGatherDir(source,ZmCompress,dirs[6]);

      WilsonTmCompressor<cobj,vobj> TmCompress;
      this->HaloGatherDir(source,TmCompress,dirs[7]);

      assert(this->u_comm_offset==this->_unified_buffer_size);
      this->halogtime+=usecond();
    }

  };

}} // namespace close
#endif

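A minimal usage sketch of the split interface above, where st, src, comp and interiorCompute() are illustrative placeholders rather than names from this commit: HaloExchangeOptBegin gathers the faces and spawns a communication thread, interior work can proceed while the messages are in flight, and HaloExchangeOptComplete merges the received halo and joins the thread.

    std::thread thr = st.HaloExchangeOptBegin(src, comp); // gather faces, start comms on a worker thread
    interiorCompute();                                    // hypothetical: sites that need no halo data
    st.HaloExchangeOptComplete(thr);                      // CommsMerge(), then join the comms thread
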
@@ -64,7 +64,9 @@ namespace QCD {
  template<class Impl>
  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  {
    Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
    GaugeField HUmu(_Umu._grid);
    HUmu = _Umu*(-0.5);
    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    pickCheckerboard(Even,UmuEven,Umu);
    pickCheckerboard(Odd ,UmuOdd,Umu);
  }
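A note on the change above (a sketch, not part of the patch): because the doubled gauge field is now built from HUmu = _Umu*(-0.5), the -1/2 hopping normalisation travels with the links, which is what allows the site kernels later in this commit to stream the reconstructed spinor directly.

    // Kernel epilogue consistent with links scaled by -0.5 at import (sketch):
    vstream(out._odata[sF], result);   // replaces vstream(out._odata[sF], result*(-0.5))
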
@@ -286,121 +288,27 @@ PARALLEL_FOR_LOOP
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
					 const FermionField &in, FermionField &out,int dag) 
  {
    if ( Impl::overlapCommsCompute () ) { 
      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
    } else { 
      DhopInternalCommsThenCompute(st,U,in,out,dag);
    }
  }
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
							 const FermionField &in, FermionField &out,int dag) {

    assert((dag==DaggerNo) ||(dag==DaggerYes));

    Compressor compressor(dag);
    st.HaloExchange(in,compressor);

    if ( dag == DaggerYes ) {
      if( HandOptDslash ) {
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
	}
      } else { 
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
	}
      for(int sss=0;sss<in._grid->oSites();sss++){
	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
      if( HandOptDslash ) {
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
	}
      } else { 
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
	}
      for(int sss=0;sss<in._grid->oSites();sss++){
	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  };

 
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
						     const FermionField &in, FermionField &out,int dag) {

    assert((dag==DaggerNo) ||(dag==DaggerYes));

    Compressor compressor(dag);

    auto handle = st.HaloExchangeBegin(in,compressor);

    bool local    = true;
    bool nonlocal = false;
    if ( dag == DaggerYes ) {
      if( HandOptDslash ) {
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      } else { 
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      }
    } else {
      if( HandOptDslash ) {
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      } else { 
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      }
    }

    st.HaloExchangeComplete(handle);

    local    = false;
    nonlocal = true;
    if ( dag == DaggerYes ) {
      if( HandOptDslash ) {
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      } else { 
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      }
    } else {
      if( HandOptDslash ) {
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      } else { 
PARALLEL_FOR_LOOP
        for(int sss=0;sss<in._grid->oSites();sss++){
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
	}
      }
    }

  };

 
  FermOpTemplateInstantiate(WilsonFermion);
  GparityFermOpTemplateInstantiate(WilsonFermion);

}}
 
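Condensed control flow of the overlap path above (a sketch only; runDhopKernels is a hypothetical wrapper around the PARALLEL_FOR_LOOP blocks shown):

    auto handle = st.HaloExchangeBegin(in,compressor);   // post the halo exchange
    runDhopKernels(/*local*/true , /*nonlocal*/false);   // interior contributions while comms are in flight
    st.HaloExchangeComplete(handle);                     // wait for the halo to arrive
    runDhopKernels(/*local*/false, /*nonlocal*/true );   // accumulate the boundary contributions
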
@@ -114,12 +114,6 @@ namespace Grid {
      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
			const FermionField &in, FermionField &out,int dag) ;

      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
				    const FermionField &in, FermionField &out,int dag) ;
      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
				    const FermionField &in, FermionField &out,int dag) ;

      // Constructor
      WilsonFermion(GaugeField &_Umu,
		    GridCartesian         &Fgrid,
 
@@ -1,4 +1,4 @@
    /*************************************************************************************
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -38,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;

  // 5d lattice for DWF.
template<class Impl>
@@ -67,10 +65,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);

  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);

  assert(FiveDimRedBlackGrid._checker_dim==1);

  // Dimension zero of the five-d is the Ls direction
@@ -99,16 +95,74 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

  // Allocate the required comms buffer
  ImportGauge(_Umu);
  alltime=0;
  commtime=0;
  jointime=0;
  dslashtime=0;
  dslash1time=0;
}  

template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
				       GridCartesian         &FiveDimGrid,
				       GridRedBlackCartesian &FiveDimRedBlackGrid,
				       GridCartesian         &FourDimGrid,
				       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimGrid),
  UmuOdd (_FourDimGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimGrid)
{
  int nsimd = Simd::Nsimd();

  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
  assert(FourDimGrid._ndimension==4);

  // Dimension zero of the five-d is the Ls direction
  Ls=FiveDimGrid._fdimensions[0];
  assert(FiveDimGrid._processors[0]         ==1);
  assert(FiveDimGrid._simd_layout[0]        ==nsimd);

  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
  assert(FiveDimRedBlackGrid._processors[0] ==1);
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

  // Other dimensions must match the decomposition of the four-D fields 
  for(int d=0;d<4;d++){
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);

    assert(FourDimGrid._simd_layout[d]==1);
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
  }

  {
    GaugeField HUmu(_Umu._grid);
    HUmu = _Umu*(-0.5);
    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    UmuEven=Umu;// Really want a reference.
    UmuOdd =Umu;
  }
}  


template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
  GaugeField HUmu(_Umu._grid);
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
}
@@ -232,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
}


template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
  std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
  std::cout<<GridLogMessage << "Wilson5d      time "<<alltime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "HaloBegin     time "<<commtime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash        time "<<dslashtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash1       time "<<dslash1time<<" us"<<std::endl;
  std::cout<<GridLogMessage << "HaloComplete  time "<<jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
  std::cout<<GridLogMessage << "Stencil all gather      time "<<Stencil.halogtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil splice   gather time "<<Stencil.splicetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil spin   simd   "<<Stencil.spintime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil MB/s          "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
				  const FermionField &A,
@@ -277,280 +307,32 @@ template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
					 DoubledGaugeField & U,
 | 
			
		||||
					 const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  if ( Impl::overlapCommsCompute () ) { 
 | 
			
		||||
    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
 | 
			
		||||
  } else { 
 | 
			
		||||
    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
					 DoubledGaugeField & U,
 | 
			
		||||
					 const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
  int LLs = in._grid->_rdimensions[0];
 | 
			
		||||
  
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  st.HaloExchange(in,compressor);
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp parallel for schedule(static)
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	int sF=LLs*sU;
 | 
			
		||||
	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for 
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=ss+ ssoff;
 | 
			
		||||
 | 
			
		||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
      /*
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for schedule(static)
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  sU=ss+ ssoff;
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      */
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for schedule(static)
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
      int sU=ss;
 | 
			
		||||
      int sF=LLs*sU;
 | 
			
		||||
      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						     DoubledGaugeField & U,
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
 | 
			
		||||
  int calls;
 | 
			
		||||
  int updates;
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
  bool local    = true;
 | 
			
		||||
  bool nonlocal = false;
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
 | 
			
		||||
  local    = false;
 | 
			
		||||
  nonlocal = true;
 | 
			
		||||
  dslash1time -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  dslash1time +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
@@ -593,6 +375,9 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
}

FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
  
}}

 
@@ -1,3 +1,4 @@

    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -48,8 +49,6 @@ namespace Grid {
    class WilsonFermion5DStatic { 
    public:
      // S-direction is INNERMOST and takes no part in the parity.
      static int AsmOptDslash; // these are a temporary hack
      static int HandOptDslash; // these are a temporary hack
      static const std::vector<int> directions;
      static const std::vector<int> displacements;
      const int npoint = 8;
@@ -61,11 +60,7 @@ namespace Grid {
    public:
     INHERIT_IMPL_TYPES(Impl);
     typedef WilsonKernels<Impl> Kernels;
     double alltime;
     double jointime;
     double commtime;
     double dslashtime;
     double dslash1time;

      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
@@ -86,6 +81,7 @@ namespace Grid {
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

      // These can be overridden by fancy 5d chiral action
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -120,19 +116,6 @@ namespace Grid {
			FermionField &out,
			int dag);

      void DhopInternalCommsThenCompute(StencilImpl & st,
			LebesgueOrder &lo,
			DoubledGaugeField &U,
			const FermionField &in, 
			FermionField &out,
			int dag);
      void DhopInternalCommsOverlapCompute(StencilImpl & st,
			LebesgueOrder &lo,
			DoubledGaugeField &U,
			const FermionField &in, 
			FermionField &out,
			int dag);

      // Constructors
      WilsonFermion5D(GaugeField &_Umu,
		      GridCartesian         &FiveDimGrid,
@@ -141,14 +124,21 @@ namespace Grid {
		      GridRedBlackCartesian &FourDimRedBlackGrid,
		      double _M5,const ImplParams &p= ImplParams());

      // Constructors
      WilsonFermion5D(int simd, 
		      GaugeField &_Umu,
		      GridCartesian         &FiveDimGrid,
		      GridRedBlackCartesian &FiveDimRedBlackGrid,
		      GridCartesian         &FourDimGrid,
		      double _M5,const ImplParams &p= ImplParams());

      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);

      void Report(void);
      ///////////////////////////////////////////////////////////////
      // Data members required to support the functionality
      ///////////////////////////////////////////////////////////////
    protected:
    public:

      // Add these to the support from Wilson
      GridBase *_FourDimGrid;
 
@@ -31,440 +31,410 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  int WilsonKernelsStatic::HandOpt;
 | 
			
		||||
  int WilsonKernelsStatic::AsmOpt;
 | 
			
		||||
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 | 
			
		||||
 | 
			
		||||
  // Need controls to do interior, exterior, or both
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  if ( AsmOpt ) {
 | 
			
		||||
 | 
			
		||||
    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
 | 
			
		||||
 | 
			
		||||
  } else {
 | 
			
		||||
 | 
			
		||||
    for(int site=0;site<Ns;site++) {
 | 
			
		||||
      for(int s=0;s<Ls;s++) {
 | 
			
		||||
	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
 | 
			
		||||
	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
 | 
			
		||||
	sF++;
 | 
			
		||||
      }
 | 
			
		||||
      sU++;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
			
		||||
					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  // No asm implementation yet.
 | 
			
		||||
  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
			
		||||
  //  else
 | 
			
		||||
  for(int site=0;site<Ns;site++) {
 | 
			
		||||
    for(int s=0;s<Ls;s++) {
 | 
			
		||||
      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
			
		||||
      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
			
		||||
      sF++;
 | 
			
		||||
    }
 | 
			
		||||
    sU++;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////
 | 
			
		||||
  // Generic implementation; move to different file?
 | 
			
		||||
  ////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor  tmp;    
 | 
			
		||||
  SiteHalfSpinor  chi;    
 | 
			
		||||
  SiteHalfSpinor *chi_p;
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  result=zero;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if (SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
 | 
			
		||||
    accumReconXp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
 | 
			
		||||
  spReconXp(result,Uchi);
 | 
			
		||||
    
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
 | 
			
		||||
  accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
 | 
			
		||||
  accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
 | 
			
		||||
  accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
 | 
			
		||||
  accumReconXm(result,Uchi);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
 | 
			
		||||
  accumReconYm(result,Uchi);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
 | 
			
		||||
  accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else { 
 | 
			
		||||
      spProjTm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
 | 
			
		||||
  accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    vstream(out._odata[sF],result*(-0.5));
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
 | 
			
		||||
  }
 | 
			
		||||
  vstream(out._odata[sF],result);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Need controls to do interior, exterior, or both
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
			
		||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						  int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  SiteHalfSpinor  tmp;    
 | 
			
		||||
  SiteHalfSpinor  chi;    
 | 
			
		||||
  SiteHalfSpinor *chi_p;    
 | 
			
		||||
  SiteHalfSpinor Uchi;
 | 
			
		||||
  SiteSpinor result;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  result=zero;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
			
		||||
    accumReconXp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
 | 
			
		||||
  spReconXp(result,Uchi);
 | 
			
		||||
    
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
 | 
			
		||||
  accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) { 
 | 
			
		||||
  if ( SE->_is_local ) { 
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
 | 
			
		||||
  accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjTp(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
 | 
			
		||||
  accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjXm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjXm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
 | 
			
		||||
  accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjYm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
 | 
			
		||||
  accumReconYm(result,Uchi);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else {
 | 
			
		||||
      spProjZm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
 | 
			
		||||
  accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ///////////////////////////
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,sF);
 | 
			
		||||
 | 
			
		||||
  if (local && SE->_is_local ) {
 | 
			
		||||
  if ( SE->_is_local ) {
 | 
			
		||||
    chi_p = χ
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[SE->_offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else { 
 | 
			
		||||
      spProjTm(chi,in._odata[SE->_offset]);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    chi_p=&buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
			
		||||
    chi=buf[SE->_offset];
 | 
			
		||||
  }
 | 
			
		||||
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
 | 
			
		||||
  accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
    num++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    vstream(out._odata[sF],result*(-0.5));
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
 | 
			
		||||
  }
 | 
			
		||||
  vstream(out._odata[sF],result);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class Impl> 
 | 
			
		||||
@@ -593,19 +563,13 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
    spReconTm(result,Uchi);
  }

  vstream(out._odata[sF],result*(-0.5));
  vstream(out._odata[sF],result);
}

#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
template<class Impl> 
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
					      int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
{
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif

  FermOpTemplateInstantiate(WilsonKernels);

template class WilsonKernels<DomainWallRedBlack5dImplF>;		
template class WilsonKernels<DomainWallRedBlack5dImplD>;

}}
 
@@ -38,37 +38,56 @@ namespace Grid {
 | 
			
		||||
    // Helper routines that implement Wilson stencil for a single site.
 | 
			
		||||
    // Common to both the WilsonFermion and WilsonFermion5D
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    class WilsonKernelsStatic { 
 | 
			
		||||
    public:
 | 
			
		||||
      // S-direction is INNERMOST and takes no part in the parity.
 | 
			
		||||
      static int AsmOpt;  // these are a temporary hack
 | 
			
		||||
      static int HandOpt; // these are a temporary hack
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<class Impl> class WilsonKernels : public FermionOperator<Impl> { 
 | 
			
		||||
    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
     INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
     typedef FermionOperator<Impl> Base;
 | 
			
		||||
     
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			   int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
			
		||||
 | 
			
		||||
     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 | 
			
		||||
 | 
			
		||||
    private:
 | 
			
		||||
     // Specialised variants
 | 
			
		||||
     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			   int sF,int sU, const FermionField &in, FermionField &out);
 | 
			
		||||
      
 | 
			
		||||
     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in,FermionField &out);
 | 
			
		||||
 | 
			
		||||
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
 | 
			
		||||
     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
			      int sF,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
     
 | 
			
		||||
     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
				 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
			
		||||
				 int sF,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
     WilsonKernels(const ImplParams &p= ImplParams());
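The WilsonKernelsStatic::AsmOpt / HandOpt flags declared above are labelled in the header as a temporary hack, apparently intended to choose a kernel variant at run time. A plausible dispatch sketch, assuming a class with the same three kinds of entry point (this is not Grid's actual dispatch code), is:

#include <cstdio>

// Hypothetical mock-up of the selection the static flags enable; the real
// WilsonKernels dispatch lives elsewhere in Grid and is not reproduced here.
struct KernelsStatic { static int AsmOpt; static int HandOpt; };
int KernelsStatic::AsmOpt  = 0;
int KernelsStatic::HandOpt = 0;

struct Kernels : KernelsStatic {
  void DhopSiteAsm    (int sF) { std::printf("asm  kernel, site %d\n", sF); }
  void DhopSiteHand   (int sF) { std::printf("hand kernel, site %d\n", sF); }
  void DhopSiteGeneric(int sF) { std::printf("generic kernel, site %d\n", sF); }

  // Pick the fastest available variant; fall back to the generic template code.
  void DhopSite(int sF) {
    if      (AsmOpt)  DhopSiteAsm(sF);
    else if (HandOpt) DhopSiteHand(sF);
    else              DhopSiteGeneric(sF);
  }
};

int main() {
  Kernels k;
  Kernels::HandOpt = 1;   // flip the "temporary hack" flag at run time
  k.DhopSite(42);         // -> hand-unrolled path
  return 0;
}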
 | 
			
		||||
     
 | 
			
		||||
 
 | 
			
		||||
@@ -2,6 +2,8 @@
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
@@ -26,320 +28,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#if defined(AVX512) || defined (IMCI)
 | 
			
		||||
 | 
			
		||||
#include <simd/Avx512Asm.h>
 | 
			
		||||
 | 
			
		||||
#undef VLOAD
 | 
			
		||||
#undef VSTORE
 | 
			
		||||
#undef VMUL
 | 
			
		||||
#undef VMADD
 | 
			
		||||
#undef ZEND
 | 
			
		||||
#undef ZLOAD
 | 
			
		||||
#undef ZMUL
 | 
			
		||||
#undef ZMADD
 | 
			
		||||
#undef VZERO
 | 
			
		||||
#undef VTIMESI
 | 
			
		||||
#undef VTIMESMINUSI
 | 
			
		||||
 | 
			
		||||
#define VZERO(A)                  VZEROf(A)
 | 
			
		||||
#define VMOV(A,B)                 VMOVf(A,B)
 | 
			
		||||
#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
			
		||||
#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
			
		||||
 | 
			
		||||
#define VADD(A,B,C)               VADDf(A,B,C)
 | 
			
		||||
#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
			
		||||
#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
 | 
			
		||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
 | 
			
		||||
#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
 | 
			
		||||
#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#define VPERM0(A,B)               VPERM0f(A,B)
 | 
			
		||||
#define VPERM1(A,B)               VPERM1f(A,B)
 | 
			
		||||
#define VPERM2(A,B)               VPERM2f(A,B)
 | 
			
		||||
#define VPERM3(A,B)               VPERM3f(A,B)
 | 
			
		||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
			
		||||
 | 
			
		||||
#define ZEND1(A,B,C)               ZEND1f(A,B,C)
 | 
			
		||||
#define ZEND2(A,B,C)               ZEND2f(A,B,C)
 | 
			
		||||
#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 | 
			
		||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
			
		||||
 | 
			
		||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
			
		||||
 | 
			
		||||
#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
 | 
			
		||||
#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
 | 
			
		||||
 | 
			
		||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
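The block above rebinds the generic assembler macro names (VLOAD, VMUL, ZEND, ...) onto their single-precision "...f" variants before the kernel body is compiled; changing precision is then only a different set of #defines in front of the same body. A tiny, self-contained illustration of that undef/define retargeting pattern (the VADDf/VADDd names below are invented for the example, not Grid's):

#include <cstdio>

/* Precision-specific "implementations" (stand-ins for the real asm macros). */
#define VADDf(a,b) ((float)((a)+(b)))
#define VADDd(a,b) ((double)((a)+(b)))

/* Retarget the generic name to the single-precision variant ... */
#undef  VADD
#define VADD(a,b) VADDf(a,b)
static float  add_sp(float a, float b)  { return VADD(a,b); }

/* ... then rebind the same generic name for the double-precision build of the body. */
#undef  VADD
#define VADD(a,b) VADDd(a,b)
static double add_dp(double a, double b){ return VADD(a,b); }

int main() {
  std::printf("%f %f\n", (double)add_sp(1.f,2.f), add_dp(1.0,2.0));
  return 0;
}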
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // Default to no assembler implementation
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
 | 
			
		||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t  now;
 | 
			
		||||
  uint64_t first ;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  const SiteHalfSpinor *pbuf = & buf[0];
 | 
			
		||||
  const SiteSpinor   *plocal = & in._odata[0];
 | 
			
		||||
  void *pf;
 | 
			
		||||
  int osites = in._grid->oSites();
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
  //#define STAMP(i) timers[i] = __rdtsc() ; 
 | 
			
		||||
#define STAMP(i) //timers[i] = __rdtsc() ; 
 | 
			
		||||
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
 | 
			
		||||
  first = __rdtsc();
 | 
			
		||||
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
 | 
			
		||||
  LOAD64(%r9,pf);
 | 
			
		||||
  __asm__( 
 | 
			
		||||
	  VPREFETCH(0,%r9)
 | 
			
		||||
	  VPREFETCH(1,%r9)
 | 
			
		||||
	  VPREFETCH(2,%r9)
 | 
			
		||||
	  VPREFETCH(3,%r9)
 | 
			
		||||
	  VPREFETCH(4,%r9)
 | 
			
		||||
	  VPREFETCH(5,%r9)
 | 
			
		||||
	  VPREFETCH(6,%r9)
 | 
			
		||||
	  VPREFETCH(7,%r9)
 | 
			
		||||
	  VPREFETCH(8,%r9)
 | 
			
		||||
	  VPREFETCH(9,%r9)
 | 
			
		||||
	  VPREFETCH(10,%r9)
 | 
			
		||||
	  VPREFETCH(11,%r9) );
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  // Xm
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    XM_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFXM(Xm,pf);
 | 
			
		||||
  }
 | 
			
		||||
  XM_RECON;
 | 
			
		||||
 | 
			
		||||
  // Ym
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    YM_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFYM(Ym,pf);
 | 
			
		||||
  }
 | 
			
		||||
  YM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    ZM_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFZM(Zm,pf);
 | 
			
		||||
  }
 | 
			
		||||
  ZM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    TM_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFTM(Tm,pf);
 | 
			
		||||
  }
 | 
			
		||||
  TM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    TP_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFTP(Tp,pf);
 | 
			
		||||
  }
 | 
			
		||||
  TP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    ZP_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFZP(Zp,pf);
 | 
			
		||||
  }
 | 
			
		||||
  ZP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
  
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    YP_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFYP(Yp,pf);
 | 
			
		||||
  }
 | 
			
		||||
  YP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  perm   = SE->_permute;
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
    
 | 
			
		||||
  //  PREFETCH_R(A);
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
 | 
			
		||||
  if ( local ) {
 | 
			
		||||
    XP_PROJMEM(&plocal[offset]);
 | 
			
		||||
    if ( perm) {
 | 
			
		||||
      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(&pbuf[offset]);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFXP(Xp,pf);
 | 
			
		||||
  }
 | 
			
		||||
  XP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
 debug:
 | 
			
		||||
  SAVE_RESULT(&out._odata[ss]);
 | 
			
		||||
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
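Throughout the routine above, while direction mu is being projected and multiplied, the stencil entry for the next direction has already been looked up and its address is handed to the MULT_2SPIN_DIR_PF* macro as a prefetch target; at the last direction the lookup wraps to the first direction of site (ss+1)%osites. A generic sketch of that look-ahead idea, using __builtin_prefetch as a stand-in for the hand-rolled VPREFETCH assembly:

#include <cstdio>
#include <vector>

// Minimal look-ahead loop: while element i is processed, the operand for i+1
// is prefetched, hiding some memory latency (a sketch of what the kernel does
// with its VPREFETCH / MULT_2SPIN_DIR_PF* macros).
double sum_with_lookahead(const std::vector<double> &v)
{
  double acc = 0.0;
  const size_t n = v.size();
  for (size_t i = 0; i < n; ++i) {
    const double *next = &v[(i + 1) % n];   // wrap to the start, like (ss+1)%osites
#if defined(__GNUC__) || defined(__clang__)
    __builtin_prefetch(next, /*rw=*/0, /*locality=*/3);
#endif
    acc += v[i];                            // "compute" on the current operand
    (void)next;
  }
  return acc;
}

int main() {
  std::vector<double> v(1024, 0.5);
  std::printf("sum = %f\n", sum_with_lookahead(v));
  return 0;
}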
 | 
			
		||||
 | 
			
		||||
  template class WilsonKernels<WilsonImplF>;		
 | 
			
		||||
  template class WilsonKernels<WilsonImplD>; 
 | 
			
		||||
#if defined(AVX512) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // If we are AVX512 specialise the single precision routine
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512wilson.h>
 | 
			
		||||
#include <simd/Intel512single.h>
 | 
			
		||||
 | 
			
		||||
static Vector<vComplexF> signs;

int setupSigns(void ){
  Vector<vComplexF> bother(2);
  signs = bother;
  vrsign(signs[0]);
  visign(signs[1]);
  return 1;
}
static int signInit = setupSigns();
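setupSigns() runs through the initialiser of the namespace-scope signInit, so the signs vector holding the +/-i sign patterns is populated once before the specialised kernels use it. The same idiom in miniature (the vrsign/visign calls are replaced by plain assignments; nothing below is Grid's real API):

#include <cstdio>
#include <vector>

// File-scope table filled in by a one-off initialiser, mimicking the
// "static int signInit = setupSigns();" trick used for the sign vectors.
static std::vector<int> signs;

static int setupSigns() {
  signs = {+1, -1};        // stand-in for vrsign()/visign() filling SIMD registers
  return 1;                // the return value only exists to have something to assign
}
static int signInit = setupSigns();   // runs before main() is entered

int main() {
  std::printf("signInit=%d signs=[%d,%d]\n", signInit, signs[0], signs[1]);
  return 0;
}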
 | 
			
		||||
 | 
			
		||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
 | 
			
		||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
			
		||||
 | 
			
		||||
#undef VMOVIDUP
 | 
			
		||||
#undef VMOVRDUP
 | 
			
		||||
#undef MAYBEPERM
 | 
			
		||||
#undef MULT_2SPIN
 | 
			
		||||
#define MAYBEPERM(A,B) 
 | 
			
		||||
#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 | 
			
		||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 | 
			
		||||
template<>
 | 
			
		||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
			
		||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
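Both specialisations above are generated by #include-ing the same WilsonKernelsAsmBody.h after redefining MAYBEPERM, MULT_2SPIN and the VMOV*DUP loads, so one body file yields both the Wilson and the 5d red-black kernels. The pattern, reduced to something compilable in a single file (the body lives in a macro here instead of a header; KERNEL_BODY and TRANSFORM are invented for the example):

#include <cstdio>

// One shared "body", expanded twice under different macro bindings -- the same
// trick WilsonKernelsAsmBody.h uses, except the body is a macro here so the
// example stays self-contained.
#define KERNEL_BODY(NAME)                          \
  int NAME(int x) { return TRANSFORM(x); }

#define TRANSFORM(x) ((x) * 2)        // "Wilson-like" binding
KERNEL_BODY(kernel_wilson)

#undef  TRANSFORM
#define TRANSFORM(x) ((x) + 100)      // "5d red-black-like" binding
KERNEL_BODY(kernel_dwf5d)

int main() {
  std::printf("%d %d\n", kernel_wilson(21), kernel_dwf5d(21));  // 42 121
  return 0;
}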
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
 | 
			
		||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
164 lib/qcd/action/fermion/WilsonKernelsAsmBody.h Normal file
@@ -0,0 +1,164 @@
 | 
			
		||||
{
 | 
			
		||||
  int locala,perma, ptypea;
 | 
			
		||||
  int localb,permb, ptypeb;
 | 
			
		||||
  uint64_t basea, baseb;
 | 
			
		||||
  uint64_t basex;
 | 
			
		||||
  const uint64_t plocal =(uint64_t) & in._odata[0];
 | 
			
		||||
 | 
			
		||||
  //  vComplexF isigns[2] = { signs[0], signs[1] };
 | 
			
		||||
  vComplexF *isigns = &signs[0];
 | 
			
		||||
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
 | 
			
		||||
  for(int site=0;site<Ns;site++) {
 | 
			
		||||
  for(int s=0;s<Ls;s++) {
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Xp
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  int ent=ss*8;// 2*Ndim
 | 
			
		||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
			
		||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
			
		||||
  basex = basea;
 | 
			
		||||
 | 
			
		||||
  if ( locala ) {
 | 
			
		||||
    LOAD64(%r10,isigns);
 | 
			
		||||
    XM_PROJMEM(basea);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(basea);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFXP(Xp,baseb);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);
 | 
			
		||||
  XM_RECON;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Yp
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
			
		||||
  if ( localb ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    YM_PROJMEM(baseb);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(baseb);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFYP(Yp,basea);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  YM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Zp
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
			
		||||
  if ( locala ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    ZM_PROJMEM(basea);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(basea);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFZP(Zp,baseb);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  ZM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Tp
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
			
		||||
  if ( localb ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    TM_PROJMEM(baseb);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(baseb);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFTP(Tp,basea);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  TM_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Xm
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
			
		||||
  if ( locala ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    XP_PROJMEM(basea);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(basea);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFXM(Xm,baseb);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  XP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Ym
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
			
		||||
  if ( localb ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    YP_PROJMEM(baseb);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(baseb);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFYM(Ym,basea);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  YP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Zm
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
			
		||||
  if ( locala ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    ZP_PROJMEM(basea);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(basea);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFZM(Zm,baseb);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  ZP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  // Tm
 | 
			
		||||
  ////////////////////////////////
 | 
			
		||||
  basea = (uint64_t)&out._odata[ss];
 | 
			
		||||
  if ( localb ) {
 | 
			
		||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
    TP_PROJMEM(baseb);
 | 
			
		||||
    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
			
		||||
  } else { 
 | 
			
		||||
    LOAD_CHI(baseb);
 | 
			
		||||
  }
 | 
			
		||||
  {
 | 
			
		||||
    MULT_2SPIN_DIR_PFTM(Tm,basea);
 | 
			
		||||
  }
 | 
			
		||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
			
		||||
  TP_RECON_ACCUM;
 | 
			
		||||
 | 
			
		||||
  PREFETCH_CHIMU(basex);
 | 
			
		||||
  SAVE_RESULT(&out._odata[ss]);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  ss++;
 | 
			
		||||
  } 
 | 
			
		||||
  sU++;
 | 
			
		||||
  }
 | 
			
		||||
}
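Inside the site/s loops the body above ping-pongs between two stencil addresses: while the spinor at basea is being projected and multiplied, baseb for the following direction has already been resolved by GetInfo, and the roles then swap, so address generation runs one direction ahead of the arithmetic. A scalar sketch of that two-slot software pipeline (the array lookup below stands in for GetInfo; it is not Grid code):

#include <cstdio>
#include <vector>

// Two-slot software pipeline over 8 "directions": slot A is consumed while
// slot B's operand address is resolved, then the roles swap -- a scalar
// analogue of the basea/baseb ping-pong in the assembler body.
int main() {
  std::vector<double> field = {1,2,3,4,5,6,7,8};
  int order[8] = {0,1,2,3,4,5,6,7};           // neighbour indices per direction

  const double *basea = &field[order[0]];     // resolve the first address up front
  const double *baseb = &field[order[1]];     // ... and the one after it
  double acc = 0.0;

  for (int dir = 0; dir < 8; ++dir) {
    acc += *basea;                            // "compute" with the current slot
    // advance: the slot just consumed is re-pointed two directions ahead
    int next = dir + 2;
    basea = baseb;
    baseb = (next < 8) ? &field[order[next]] : &field[order[0]];
  }
  std::printf("acc = %f\n", acc);             // 36
  return 0;
}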
 | 
			
		||||
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Chi_11 = ref()(1)(1);\
 | 
			
		||||
    Chi_12 = ref()(1)(2);
 | 
			
		||||
 | 
			
		||||
// To splat or not to splat depends on the implementation
 | 
			
		||||
#define MULT_2SPIN(A)\
 | 
			
		||||
   auto & ref(U._odata[sU](A));	\
 | 
			
		||||
    U_00 = ref()(0,0);\
 | 
			
		||||
    U_10 = ref()(1,0);\
 | 
			
		||||
    U_20 = ref()(2,0);\
 | 
			
		||||
    U_01 = ref()(0,1);\
 | 
			
		||||
    U_11 = ref()(1,1);				\
 | 
			
		||||
    U_21 = ref()(2,1);\
 | 
			
		||||
   Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
    UChi_00 = U_00*Chi_00;\
 | 
			
		||||
    UChi_10 = U_00*Chi_10;\
 | 
			
		||||
    UChi_01 = U_10*Chi_00;\
 | 
			
		||||
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    UChi_11+= U_11*Chi_11;\
 | 
			
		||||
    UChi_02+= U_21*Chi_01;\
 | 
			
		||||
    UChi_12+= U_21*Chi_11;\
 | 
			
		||||
    U_00 = ref()(0,2);\
 | 
			
		||||
    U_10 = ref()(1,2);\
 | 
			
		||||
    U_20 = ref()(2,2);\
 | 
			
		||||
    Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
    UChi_00+= U_00*Chi_02;\
 | 
			
		||||
    UChi_10+= U_00*Chi_12;\
 | 
			
		||||
    UChi_01+= U_10*Chi_02;\
 | 
			
		||||
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    UChi_02+= U_20*Chi_02;\
 | 
			
		||||
    UChi_12+= U_20*Chi_12;
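The MULT_2SPIN change above routes every gauge-link element load through Impl::loadLinkElement, so an implementation can splat or convert the link as it is read instead of the macro assuming a plain copy; the multiply itself is an Nc=3 matrix acting column by column on the half-spinor, unrolled two columns first and the third column folded in afterwards. A plain (non-SIMD) version of that column-wise complex 3x3 multiply, shown for a single colour vector rather than both spin components, just to make the unrolled index pattern explicit:

#include <complex>
#include <cstdio>

using Cplx = std::complex<double>;

// Column-by-column 3x3 multiply, UChi = U * Chi, written the way the macro
// unrolls it. "load" stands in for Impl::loadLinkElement (identity here; a
// real Impl might splat or convert the element instead).
static Cplx load(const Cplx &x) { return x; }

void mult_su3(const Cplx U[3][3], const Cplx Chi[3], Cplx UChi[3])
{
  Cplx U0, U1, U2;
  U0 = load(U[0][0]); U1 = load(U[1][0]); U2 = load(U[2][0]);   // column 0
  UChi[0] = U0*Chi[0]; UChi[1] = U1*Chi[0]; UChi[2] = U2*Chi[0];
  U0 = load(U[0][1]); U1 = load(U[1][1]); U2 = load(U[2][1]);   // column 1
  UChi[0]+= U0*Chi[1]; UChi[1]+= U1*Chi[1]; UChi[2]+= U2*Chi[1];
  U0 = load(U[0][2]); U1 = load(U[1][2]); U2 = load(U[2][2]);   // column 2
  UChi[0]+= U0*Chi[2]; UChi[1]+= U1*Chi[2]; UChi[2]+= U2*Chi[2];
}

int main() {
  Cplx U[3][3] = {{1,0,0},{0,1,0},{0,0,1}};   // identity link
  Cplx Chi[3]  = {{1,2},{3,4},{5,6}};
  Cplx UChi[3];
  mult_su3(U, Chi, UChi);
  std::printf("(%g,%g)\n", UChi[0].real(), UChi[0].imag());  // (1,2)
  return 0;
}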
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR(dir)			\
 | 
			
		||||
      permute##dir(Chi_00,Chi_00);\
 | 
			
		||||
      permute##dir(Chi_01,Chi_01);\
 | 
			
		||||
@@ -309,546 +311,10 @@ namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
			
		||||
  
 | 
			
		||||
  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd Chi_01;
 | 
			
		||||
  REGISTER Simd Chi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_10;
 | 
			
		||||
  REGISTER Simd Chi_11;
 | 
			
		||||
  REGISTER Simd Chi_12;   // 14 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd UChi_01;
 | 
			
		||||
  REGISTER Simd UChi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_10;
 | 
			
		||||
  REGISTER Simd UChi_11;
 | 
			
		||||
  REGISTER Simd UChi_12;  // 8 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
			
		||||
  REGISTER Simd U_10;
 | 
			
		||||
  REGISTER Simd U_20;  
 | 
			
		||||
  REGISTER Simd U_01;
 | 
			
		||||
  REGISTER Simd U_11;
 | 
			
		||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
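The #define block above is a register-pressure trick: the full spinor Chimu only needs to exist while it is being projected, so its lower components share storage with the Chi_* registers and its upper components reuse the UChi_* registers, instead of taking twelve extra SIMD registers. A small scalar sketch of the same aliasing idea (names invented for the example; the arithmetic is a placeholder, not a real spin projection):

#include <cstdio>

// The "wide" temporary only lives until it has been projected, so its storage
// is aliased onto registers that are reused later -- the role the
// #define Chimu_2x UChi_0x lines play in the hand-unrolled kernel.
int main() {
  double Chi_0, Chi_1;    // half spinor, kept live
  double UChi_0, UChi_1;  // link*Chi result, reused as scratch first

  // Aliases for the four-component object during projection only.
  #define Chimu_0 Chi_0
  #define Chimu_1 Chi_1
  #define Chimu_2 UChi_0
  #define Chimu_3 UChi_1

  Chimu_0 = 1; Chimu_1 = 2; Chimu_2 = 3; Chimu_3 = 4;  // "LOAD_CHIMU"
  Chi_0 = Chimu_0 + Chimu_2;                           // "spin projection": 1+3
  Chi_1 = Chimu_1 + Chimu_3;                           //                    2+4
  // From here on Chimu_* is dead; UChi_* is free to hold the multiply result.
  UChi_0 = 2.0*Chi_0; UChi_1 = 2.0*Chi_1;

  std::printf("%g %g\n", UChi_0, UChi_1);              // 8 12
  return 0;
}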
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset, ptype;
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xp);
 | 
			
		||||
    XP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Yp
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Yp);
 | 
			
		||||
    YP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zp);
 | 
			
		||||
    ZP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tp);
 | 
			
		||||
    TP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Xm
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xm);
 | 
			
		||||
    XM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Ym
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Ym);
 | 
			
		||||
    YM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zm);
 | 
			
		||||
    ZM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tm);
 | 
			
		||||
    TM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
  if ( Local ) {
 | 
			
		||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(0),result_10*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(1),result_11*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(2),result_12*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(0),result_20*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(1),result_21*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(2),result_22*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
			
		||||
    return 1;
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
			
		||||
    return 1;
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
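The write-back at the end of this routine shows the two-pass contract in full: a local pass streams result*(-0.5) straight over the site, a halo pass accumulates onto it but only if it actually received contributions, and the return value 1/0 tells the caller whether the site was written. A compact model of that contract (vstream replaced by plain assignment; nothing below is Grid-specific):

#include <cstdio>

// Model of the write-back contract of the hand-unrolled kernels:
//  - a "local" pass overwrites the site with -0.5*result and reports 1,
//  - a halo pass accumulates, but only if it actually received contributions,
//  - otherwise the site is untouched and 0 is reported.
int writeback(double &out, double result, bool local_pass, int num)
{
  if (local_pass)  { out  = result * (-0.5); return 1; }
  else if (num)    { out += result * (-0.5); return 1; }
  return 0;
}

int main() {
  double site = 10.0;
  int touched  = writeback(site, 4.0, /*local_pass=*/true, 0);   // site = -2
  touched     += writeback(site, 4.0, /*local_pass=*/false, 3);  // site = -4
  touched     += writeback(site, 4.0, /*local_pass=*/false, 0);  // untouched
  std::printf("site=%g touched=%d\n", site, touched);            // site=-4 touched=2
  return 0;
}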
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
			
		||||
  
 | 
			
		||||
  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
			
		||||
  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd Chi_01;
 | 
			
		||||
  REGISTER Simd Chi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd Chi_10;
 | 
			
		||||
  REGISTER Simd Chi_11;
 | 
			
		||||
  REGISTER Simd Chi_12;   // 14 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
			
		||||
  REGISTER Simd UChi_01;
 | 
			
		||||
  REGISTER Simd UChi_02;
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd UChi_10;
 | 
			
		||||
  REGISTER Simd UChi_11;
 | 
			
		||||
  REGISTER Simd UChi_12;  // 8 left
 | 
			
		||||
 | 
			
		||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
			
		||||
  REGISTER Simd U_10;
 | 
			
		||||
  REGISTER Simd U_20;  
 | 
			
		||||
  REGISTER Simd U_01;
 | 
			
		||||
  REGISTER Simd U_11;
 | 
			
		||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset, ptype;
 | 
			
		||||
  int num = 0;
 | 
			
		||||
 | 
			
		||||
  // Xp
 | 
			
		||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xp);
 | 
			
		||||
    XM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Yp
 | 
			
		||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Yp);
 | 
			
		||||
    YM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Zp
 | 
			
		||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zp);
 | 
			
		||||
    ZM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tp
 | 
			
		||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TM_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tp);
 | 
			
		||||
    TM_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Xm
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    XP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Xm);
 | 
			
		||||
    XP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Ym
 | 
			
		||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    YP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Ym);
 | 
			
		||||
    YP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Zm
 | 
			
		||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    ZP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Zm);
 | 
			
		||||
    ZP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Tm
 | 
			
		||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
 | 
			
		||||
  if (Local && SE->_is_local ) { 
 | 
			
		||||
    LOAD_CHIMU;
 | 
			
		||||
    TP_PROJ;
 | 
			
		||||
    if ( SE->_permute ) {
 | 
			
		||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
			
		||||
    LOAD_CHI;
 | 
			
		||||
  }
 | 
			
		||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
			
		||||
    MULT_2SPIN(Tm);
 | 
			
		||||
    TP_RECON_ACCUM;
 | 
			
		||||
    num++;  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
  if ( Local ) {
 | 
			
		||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(0),result_10*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(1),result_11*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(2),result_12*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(0),result_20*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(1),result_21*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(2),result_22*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
			
		||||
    return 1;
 | 
			
		||||
  } else if ( num ) { 
 | 
			
		||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
			
		||||
    return 1;
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
			
		||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
@@ -1073,89 +539,346 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);
 | 
			
		||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(0),result_10*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(1),result_11*(-0.5));
 | 
			
		||||
    vstream(ref()(1)(2),result_12*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(0),result_20*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(1),result_21*(-0.5));
 | 
			
		||||
    vstream(ref()(2)(2),result_22*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
			
		||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
			
		||||
    vstream(ref()(0)(0),result_00);
 | 
			
		||||
    vstream(ref()(0)(1),result_01);
 | 
			
		||||
    vstream(ref()(0)(2),result_02);
 | 
			
		||||
    vstream(ref()(1)(0),result_10);
 | 
			
		||||
    vstream(ref()(1)(1),result_11);
 | 
			
		||||
    vstream(ref()(1)(2),result_12);
 | 
			
		||||
    vstream(ref()(2)(0),result_20);
 | 
			
		||||
    vstream(ref()(2)(1),result_21);
 | 
			
		||||
    vstream(ref()(2)(2),result_22);
 | 
			
		||||
    vstream(ref()(3)(0),result_30);
 | 
			
		||||
    vstream(ref()(3)(1),result_31);
 | 
			
		||||
    vstream(ref()(3)(2),result_32);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
					       int ss,int sU,const FermionField &in, FermionField &out)
{
  //  std::cout << "Hand op Dhop "<<std::endl;
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

  REGISTER Simd result_00; // 12 regs on knc
  REGISTER Simd result_01;
  REGISTER Simd result_02;

  REGISTER Simd result_10;
  REGISTER Simd result_11;
  REGISTER Simd result_12;

  REGISTER Simd result_20;
  REGISTER Simd result_21;
  REGISTER Simd result_22;

  REGISTER Simd result_30;
  REGISTER Simd result_31;
  REGISTER Simd result_32; // 20 left

  REGISTER Simd Chi_00;    // two spinor; 6 regs
  REGISTER Simd Chi_01;
  REGISTER Simd Chi_02;

  REGISTER Simd Chi_10;
  REGISTER Simd Chi_11;
  REGISTER Simd Chi_12;   // 14 left

  REGISTER Simd UChi_00;  // two spinor; 6 regs
  REGISTER Simd UChi_01;
  REGISTER Simd UChi_02;

  REGISTER Simd UChi_10;
  REGISTER Simd UChi_11;
  REGISTER Simd UChi_12;  // 8 left

  REGISTER Simd U_00;  // two rows of U matrix
  REGISTER Simd U_10;
  REGISTER Simd U_20;
  REGISTER Simd U_01;
  REGISTER Simd U_11;
  REGISTER Simd U_21;  // 2 reg left.

#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12

  StencilEntry *SE;
  int offset,local,perm, ptype;

  // Xp
  SE=st.GetEntry(ptype,Xp,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    XP_PROJ;
    if ( perm) {
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }

  {
    MULT_2SPIN(Xp);
  }
  XP_RECON;

  // Yp
  SE=st.GetEntry(ptype,Yp,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    YP_PROJ;
    if ( perm) {
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Yp);
  }
  YP_RECON_ACCUM;

  // Zp
  SE=st.GetEntry(ptype,Zp,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    ZP_PROJ;
    if ( perm) {
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Zp);
  }
  ZP_RECON_ACCUM;

  // Tp
  SE=st.GetEntry(ptype,Tp,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    TP_PROJ;
    if ( perm) {
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Tp);
  }
  TP_RECON_ACCUM;

  // Xm
  SE=st.GetEntry(ptype,Xm,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    XM_PROJ;
    if ( perm) {
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Xm);
  }
  XM_RECON_ACCUM;

  // Ym
  SE=st.GetEntry(ptype,Ym,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    YM_PROJ;
    if ( perm) {
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Ym);
  }
  YM_RECON_ACCUM;

  // Zm
  SE=st.GetEntry(ptype,Zm,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    ZM_PROJ;
    if ( perm) {
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Zm);
  }
  ZM_RECON_ACCUM;

  // Tm
  SE=st.GetEntry(ptype,Tm,ss);
  offset = SE->_offset;
  local  = SE->_is_local;
  perm   = SE->_permute;

  if ( local ) {
    LOAD_CHIMU;
    TM_PROJ;
    if ( perm) {
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
    }
  } else {
    LOAD_CHI;
  }
  {
    MULT_2SPIN(Tm);
  }
  TM_RECON_ACCUM;

  {
    SiteSpinor & ref (out._odata[ss]);
    vstream(ref()(0)(0),result_00);
    vstream(ref()(0)(1),result_01);
    vstream(ref()(0)(2),result_02);
    vstream(ref()(1)(0),result_10);
    vstream(ref()(1)(1),result_11);
    vstream(ref()(1)(2),result_12);
    vstream(ref()(2)(0),result_20);
    vstream(ref()(2)(1),result_21);
    vstream(ref()(2)(2),result_22);
    vstream(ref()(3)(0),result_30);
    vstream(ref()(3)(1),result_31);
    vstream(ref()(3)(2),result_32);
  }
}

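// ------------------------------------------------------------------------
// A scalar sketch of what the hand-unrolled macros above compute for one
// direction (illustration only; the exact signs are those encoded in the
// XP_PROJ / XP_RECON conventions and are assumed here for the Xp leg):
//
//   Chi_0c  = Chimu_0c + timesI(Chimu_3c);       // XP_PROJ : project the four
//   Chi_1c  = Chimu_1c + timesI(Chimu_2c);       // spinor onto a half spinor
//   UChi_sc = sum_c' U(x,mu)_{c c'} * Chi_sc';   // MULT_2SPIN : colour multiply
//   result_0c += UChi_0c;                        // XP_RECON(_ACCUM) : rebuild
//   result_1c += UChi_1c;                        // the four spinor; the lower
//   result_2c -= timesI(UChi_1c);                // components are fixed by the
//   result_3c -= timesI(UChi_0c);                // projector
//
// with s the half-spinor index and c the colour index.  The eight direction
// blocks differ only in the projector, the permute distance and the link.
// ------------------------------------------------------------------------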
  ////////////////////////////////////////////////
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
							     int sF,int sU,const FermionField &in, FermionField &out)
{
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
  return 0;

  assert(0);
}

template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
								int sF,int sU,const FermionField &in, FermionField &out)
{
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
  assert(0);
}

template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
							     int sF,int sU,const FermionField &in, FermionField &out)
{
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
  assert(0);
}

template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
								int sF,int sU,const FermionField &in, FermionField &out)
{
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
  assert(0);
}


template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
////////////// Wilson ; uses this implementation /////////////////////
// Need Nc=3 though //

template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							       int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
							       int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							       int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
							       int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
								  int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
								  int ss,int sU,const FermionField &in, FermionField &out);


template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
								      int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
								      int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
									 int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
									 int ss,int sU,const FermionField &in, FermionField &out);


template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								      int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								      int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
									 int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
									 int ss,int sU,const FermionField &in, FermionField &out);


}}

@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 | 
			
		||||
#define INHERIT_GIMPL_TYPES(GImpl) \
 | 
			
		||||
    typedef typename GImpl::Simd                           Simd;\
 | 
			
		||||
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
 | 
			
		||||
    typedef typename GImpl::GaugeField               GaugeField;	
 | 
			
		||||
    typedef typename GImpl::GaugeField               GaugeField;\
 | 
			
		||||
    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
 | 
			
		||||
    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;
 | 
			
		||||
 | 
			
		||||
    // 
 | 
			
		||||
    template<class S,int Nrepresentation=Nc>
 | 
			
		||||
@@ -62,9 +64,9 @@ template<class Gimpl> class WilsonLoops;
 | 
			
		||||
 | 
			
		||||
    // Move this elsewhere?
 | 
			
		||||
    static inline void AddGaugeLink(GaugeField& U, GaugeLinkField& W, int mu){  // U[mu] += W 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
      for(auto ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
 | 
			
		||||
	         U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
 | 
			
		||||
        }  
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
 
 | 
			
		||||
@@ -92,13 +92,13 @@ public:
 | 
			
		||||
    
 | 
			
		||||
    // Create integrator, including the smearing policy
 | 
			
		||||
    // Smearing policy
 | 
			
		||||
    std::cout << GridLogMessage << " Creating the Stout class\n";
 | 
			
		||||
    double rho = 0.1; // smearing parameter
 | 
			
		||||
    std::cout << GridLogDebug << " Creating the Stout class\n";
 | 
			
		||||
    double rho = 0.1; // smearing parameter, now hardcoded
 | 
			
		||||
    int Nsmear = 1;   // number of smearing levels
 | 
			
		||||
    Smear_Stout<Gimpl> Stout(rho);
 | 
			
		||||
    std::cout << GridLogMessage << " Creating the SmearedConfiguration class\n";
 | 
			
		||||
    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
 | 
			
		||||
    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
 | 
			
		||||
    std::cout << GridLogMessage << " done\n";
 | 
			
		||||
    std::cout << GridLogDebug << " done\n";
 | 
			
		||||
    //////////////
 | 
			
		||||
    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
 | 
			
		||||
    IntegratorParameters MDpar(20);
 | 
			
		||||
@@ -116,27 +116,27 @@ public:
 | 
			
		||||
 | 
			
		||||
    if ( StartType == HotStart ) {
 | 
			
		||||
      // Hot start
 | 
			
		||||
      HMCpar.NoMetropolisUntil =0;
 | 
			
		||||
      HMCpar.NoMetropolisUntil =10;
 | 
			
		||||
      HMCpar.MetropolisTest = true;
 | 
			
		||||
      sRNG.SeedFixedIntegers(SerSeed);
 | 
			
		||||
      pRNG.SeedFixedIntegers(ParSeed);
 | 
			
		||||
      SU3::HotConfiguration(pRNG, U);
 | 
			
		||||
    } else if ( StartType == ColdStart ) { 
 | 
			
		||||
      // Cold start
 | 
			
		||||
      HMCpar.NoMetropolisUntil =0;
 | 
			
		||||
      HMCpar.NoMetropolisUntil =10;
 | 
			
		||||
      HMCpar.MetropolisTest = true;
 | 
			
		||||
      sRNG.SeedFixedIntegers(SerSeed);
 | 
			
		||||
      pRNG.SeedFixedIntegers(ParSeed);
 | 
			
		||||
      SU3::ColdConfiguration(pRNG, U);
 | 
			
		||||
    } else if ( StartType == TepidStart ) {       
 | 
			
		||||
      // Tepid start
 | 
			
		||||
      HMCpar.NoMetropolisUntil =0;
 | 
			
		||||
      HMCpar.NoMetropolisUntil =10;
 | 
			
		||||
      HMCpar.MetropolisTest = true;
 | 
			
		||||
      sRNG.SeedFixedIntegers(SerSeed);
 | 
			
		||||
      pRNG.SeedFixedIntegers(ParSeed);
 | 
			
		||||
      SU3::TepidConfiguration(pRNG, U);
 | 
			
		||||
    } else if ( StartType == CheckpointStart ) { 
 | 
			
		||||
      HMCpar.NoMetropolisUntil =0;
 | 
			
		||||
      HMCpar.NoMetropolisUntil =10;
 | 
			
		||||
      HMCpar.MetropolisTest = true;
 | 
			
		||||
      // CheckpointRestart
 | 
			
		||||
      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
 | 
			
		||||
 
 | 
			
		||||
@@ -61,6 +61,31 @@ namespace Grid {
      "         "
    };
    
    SpinMatrix makeGammaProd(const unsigned int i)
    {
      SpinMatrix g;
      
      g = 1.;
      if (i & 0x1)
      {
        g = g*Gamma(Gamma::GammaMatrix::GammaX);
      }
      if (i & 0x2)
      {
        g = g*Gamma(Gamma::GammaMatrix::GammaY);
      }
      if (i & 0x4)
      {
        g = g*Gamma(Gamma::GammaMatrix::GammaZ);
      }
      if (i & 0x8)
      {
        g = g*Gamma(Gamma::GammaMatrix::GammaT);
      }
      
      return g;
    }
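    // Minimal usage sketch (bit i of the argument switches on the i-th gamma,
    // in the order GammaX=0x1, GammaY=0x2, GammaZ=0x4, GammaT=0x8):
    //
    //   SpinMatrix gx   = makeGammaProd(0x1); // GammaX
    //   SpinMatrix gxgy = makeGammaProd(0x3); // GammaX * GammaY
    //   SpinMatrix g15  = makeGammaProd(0xF); // GammaX*GammaY*GammaZ*GammaT,
    //                                         // i.e. Gamma5 up to a
    //                                         // convention-dependent phase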
 | 
			
		||||
    //    void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
 | 
			
		||||
    //      vHalfSpinColourVector hspin;
 | 
			
		||||
    //      spProjXp(hspin,in);
 | 
			
		||||
 
 | 
			
		||||
@@ -83,6 +83,9 @@ namespace QCD {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
    // Make gamma products (Chroma convention)
 | 
			
		||||
    SpinMatrix makeGammaProd(const unsigned int i);
 | 
			
		||||
    
 | 
			
		||||
    /* Gx
 | 
			
		||||
     *  0 0  0  i    
 | 
			
		||||
     *  0 0  i  0    
 | 
			
		||||
 
 | 
			
		||||
@@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
 | 
			
		||||
    LatticeMatrix Umu(out._grid);
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
      LieRandomize(pRNG,Umu,0.01);
 | 
			
		||||
      pokeLorentz(out,Umu,mu);
 | 
			
		||||
      PokeIndex<LorentzIndex>(out,Umu,mu);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
 | 
			
		||||
    LatticeMatrix Umu(out._grid);
 | 
			
		||||
    Umu=1.0;
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
      pokeLorentz(out,Umu,mu);
 | 
			
		||||
      PokeIndex<LorentzIndex>(out,Umu,mu);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 | 
			
		||||
{
 | 
			
		||||
  return new GridRedBlackCartesian(FourDimGrid); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<int> simd(4,1);
 | 
			
		||||
  return makeFourDimGrid(latt,simd,mpi);
 | 
			
		||||
}
 | 
			
		||||
GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
			
		||||
{
 | 
			
		||||
  int N4=FourDimGrid->_ndimension;
 | 
			
		||||
@@ -58,6 +62,7 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
 | 
			
		||||
  return new GridCartesian(latt5,simd5,mpi5); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
			
		||||
{
 | 
			
		||||
  int N4=FourDimGrid->_ndimension;
 | 
			
		||||
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
 | 
			
		||||
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
{
  int N4=FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();

  std::vector<int> latt5(1,Ls);
  std::vector<int> simd5(1,nsimd);
  std::vector<int>  mpi5(1,1);

  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
    simd5.push_back(1);
     mpi5.push_back(FourDimGrid->_processors[d]);
  }
  return new GridCartesian(latt5,simd5,mpi5); 
}

GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
{
  int N4=FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();
  int cbd=0;
  std::vector<int> latt5(1,Ls);
  std::vector<int> simd5(1,nsimd);
  std::vector<int>  mpi5(1,1);
  std::vector<int>   cb5(1,1);

  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
    simd5.push_back(1);
     mpi5.push_back(FourDimGrid->_processors[d]);
      cb5.push_back(1);
  }
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
}
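
// Minimal usage sketch for the DWF grid factories above (the UGrid/FGrid
// names and the GridDefaultLatt/GridDefaultMpi helpers are assumed for
// illustration):
//
//   const int Ls = 16;
//   GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
//   GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
//   GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
//
// Unlike makeFiveDimGrid, the DWF variants place all SIMD lanes in the fifth
// (s) direction, as the simd5 vectors constructed above show.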
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -35,9 +35,14 @@ class SpaceTimeGrid {
 | 
			
		||||
 | 
			
		||||
  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
 | 
			
		||||
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
 | 
			
		||||
 | 
			
		||||
  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
 | 
			
		||||
  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
			
		||||
  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -52,9 +52,9 @@ namespace Grid {
 | 
			
		||||
	// or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
 | 
			
		||||
	// resolution throughout the usage in this file, and rather defeats the purpose of deriving
 | 
			
		||||
	// from Gimpl.
 | 
			
		||||
	plaq= Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
				      Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
							      Gimpl::CovShiftForward (U[mu],mu,U[nu])));
 | 
			
		||||
	plaq = Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
		   Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
		   Gimpl::CovShiftForward (U[mu],mu,U[nu])));
 | 
			
		||||
      }
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // trace of directed plaquette oriented in mu,nu plane
 | 
			
		||||
@@ -100,16 +100,16 @@ namespace Grid {
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // average over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      static RealD avgPlaquette(const GaugeLorentz &Umu){
 | 
			
		||||
	static RealD avgPlaquette(const GaugeLorentz &Umu){
 | 
			
		||||
		RealD sumplaq = sumPlaquette(Umu);
 | 
			
		||||
		double vol = Umu._grid->gSites();
 | 
			
		||||
		double faces = (1.0*Nd*(Nd-1))/2.0;
 | 
			
		||||
		return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
			
		||||
   	}
 | 
			
		||||
 | 
			
		||||
	RealD sumplaq = sumPlaquette(Umu);
 | 
			
		||||
	
 | 
			
		||||
	double vol = Umu._grid->gSites();
 | 
			
		||||
	
 | 
			
		||||
	double faces = (1.0*Nd*(Nd-1))/2.0;
 | 
			
		||||
	
 | 
			
		||||
	return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
			
		||||
      }
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // average over traced single links
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      static RealD linkTrace(const GaugeLorentz &Umu){
 | 
			
		||||
	std::vector<GaugeMat> U(4,Umu._grid);
 | 
			
		||||
	
 | 
			
		||||
@@ -126,47 +126,6 @@ namespace Grid {
 | 
			
		||||
	
 | 
			
		||||
	return p.real()/vol/4.0/3.0;
 | 
			
		||||
      };
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // the sum over all staples on each site
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
 | 
			
		||||
	
 | 
			
		||||
	GridBase *grid = Umu._grid;
 | 
			
		||||
	
 | 
			
		||||
	std::vector<GaugeMat> U(4,grid);
 | 
			
		||||
	for(int d=0;d<Nd;d++){
 | 
			
		||||
	  U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
			
		||||
	}
 | 
			
		||||
	staple = zero;
 | 
			
		||||
		
 | 
			
		||||
	
 | 
			
		||||
	for(int nu=0;nu<Nd;nu++){
 | 
			
		||||
	  
 | 
			
		||||
	  if(nu != mu) {
 | 
			
		||||
	    
 | 
			
		||||
	    // mu
 | 
			
		||||
	    // ^
 | 
			
		||||
	    // |__>  nu
 | 
			
		||||
	    
 | 
			
		||||
	    //    __ 
 | 
			
		||||
	    //      |
 | 
			
		||||
	    //    __|
 | 
			
		||||
	    //
 | 
			
		||||
	    
 | 
			
		||||
	    staple+=Gimpl::ShiftStaple(Gimpl::CovShiftForward (U[nu],nu, 
 | 
			
		||||
							       Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
										       Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
 | 
			
		||||
	    
 | 
			
		||||
	    //  __ 
 | 
			
		||||
	    // |   
 | 
			
		||||
	    // |__ 
 | 
			
		||||
	    //
 | 
			
		||||
	    //
 | 
			
		||||
	    staple+=Gimpl::ShiftStaple(Gimpl::CovShiftBackward(U[nu],nu,		  		  
 | 
			
		||||
							       Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // the sum over all staples on each site in direction mu,nu
 | 
			
		||||
@@ -210,6 +169,51 @@ namespace Grid {
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////
 | 
			
		||||
// the sum over all staples on each site
 | 
			
		||||
//////////////////////////////////////////////////
 | 
			
		||||
  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
 | 
			
		||||
 | 
			
		||||
    GridBase *grid = Umu._grid;
 | 
			
		||||
 | 
			
		||||
    std::vector<GaugeMat> U(Nd,grid);
 | 
			
		||||
    for(int d=0;d<Nd;d++){
 | 
			
		||||
      U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
			
		||||
    }
 | 
			
		||||
    staple = zero;
 | 
			
		||||
    GaugeMat tmp(grid);
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    for(int nu=0;nu<Nd;nu++){
 | 
			
		||||
 | 
			
		||||
      if(nu != mu) {
 | 
			
		||||
 | 
			
		||||
      // mu
 | 
			
		||||
      // ^
 | 
			
		||||
      // |__>  nu
 | 
			
		||||
 | 
			
		||||
      //    __ 
 | 
			
		||||
      //      |
 | 
			
		||||
      //    __|
 | 
			
		||||
      //
 | 
			
		||||
 | 
			
		||||
	staple+=Gimpl::ShiftStaple(
 | 
			
		||||
	        Gimpl::CovShiftForward (U[nu],nu, 
 | 
			
		||||
		Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
		Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
 | 
			
		||||
 | 
			
		||||
      //  __ 
 | 
			
		||||
      // |   
 | 
			
		||||
      // |__ 
 | 
			
		||||
      //
 | 
			
		||||
      //
 | 
			
		||||
	staple+=Gimpl::ShiftStaple(  
 | 
			
		||||
                Gimpl::CovShiftBackward(U[nu],nu,		  		  
 | 
			
		||||
		Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // the sum over all staples on each site in direction mu,nu, upper part
 | 
			
		||||
@@ -247,246 +251,246 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      //////////////////////////////////////////////////////
 | 
			
		||||
      // Similar to above for rectangle is required
 | 
			
		||||
      //////////////////////////////////////////////////////
 | 
			
		||||
      static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
			
		||||
      {
 | 
			
		||||
	rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
 | 
			
		||||
	  adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
 | 
			
		||||
	rect = rect + 
 | 
			
		||||
  //////////////////////////////////////////////////////
 | 
			
		||||
  // Similar to above for rectangle is required
 | 
			
		||||
  //////////////////////////////////////////////////////
 | 
			
		||||
  static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
			
		||||
  {
 | 
			
		||||
    rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
 | 
			
		||||
	adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
 | 
			
		||||
    rect = rect + 
 | 
			
		||||
          Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
 | 
			
		||||
	  adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
 | 
			
		||||
      adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
 | 
			
		||||
  }
 | 
			
		||||
  static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
			
		||||
  {
 | 
			
		||||
    GaugeMat sp(U[0]._grid);
 | 
			
		||||
    dirRectangle(sp,U,mu,nu);
 | 
			
		||||
    rect=trace(sp);
 | 
			
		||||
  }
 | 
			
		||||
  static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
 | 
			
		||||
  {
 | 
			
		||||
    LatticeComplex siteRect(U[0]._grid);
 | 
			
		||||
    Rect=zero;
 | 
			
		||||
    for(int mu=1;mu<Nd;mu++){
 | 
			
		||||
      for(int nu=0;nu<mu;nu++){
 | 
			
		||||
	traceDirRectangle(siteRect,U,mu,nu);
 | 
			
		||||
	Rect = Rect + siteRect;
 | 
			
		||||
      }
 | 
			
		||||
      static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
			
		||||
      {
 | 
			
		||||
	GaugeMat sp(U[0]._grid);
 | 
			
		||||
	dirRectangle(sp,U,mu,nu);
 | 
			
		||||
	rect=trace(sp);
 | 
			
		||||
      }
 | 
			
		||||
      static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
 | 
			
		||||
      {
 | 
			
		||||
	LatticeComplex siteRect(U[0]._grid);
 | 
			
		||||
	Rect=zero;
 | 
			
		||||
	for(int mu=1;mu<Nd;mu++){
 | 
			
		||||
	  for(int nu=0;nu<mu;nu++){
 | 
			
		||||
	    traceDirRectangle(siteRect,U,mu,nu);
 | 
			
		||||
	    Rect = Rect + siteRect;
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // sum over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      static RealD sumRectangle(const GaugeLorentz &Umu){
 | 
			
		||||
	std::vector<GaugeMat> U(4,Umu._grid);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
	for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
	  U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
	}
 | 
			
		||||
 //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static RealD sumRectangle(const GaugeLorentz &Umu){
 | 
			
		||||
    std::vector<GaugeMat> U(Nd,Umu._grid);
 | 
			
		||||
 | 
			
		||||
	LatticeComplex Rect(Umu._grid);
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
	siteRectangle(Rect,U);
 | 
			
		||||
    LatticeComplex Rect(Umu._grid);
 | 
			
		||||
    
 | 
			
		||||
	TComplex Tp = sum(Rect);
 | 
			
		||||
	Complex p  = TensorRemove(Tp);
 | 
			
		||||
	return p.real();
 | 
			
		||||
      }
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // average over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      static RealD avgRectangle(const GaugeLorentz &Umu){
 | 
			
		||||
    siteRectangle(Rect,U);
 | 
			
		||||
    
 | 
			
		||||
	RealD sumrect = sumRectangle(Umu);
 | 
			
		||||
    TComplex Tp = sum(Rect);
 | 
			
		||||
    Complex p  = TensorRemove(Tp);
 | 
			
		||||
    return p.real();
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // average over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static RealD avgRectangle(const GaugeLorentz &Umu){
 | 
			
		||||
 | 
			
		||||
	double vol = Umu._grid->gSites();
 | 
			
		||||
    RealD sumrect = sumRectangle(Umu);
 | 
			
		||||
    
 | 
			
		||||
	double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
 | 
			
		||||
    double vol = Umu._grid->gSites();
 | 
			
		||||
    
 | 
			
		||||
	return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
			
		||||
      }
 | 
			
		||||
    double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
 | 
			
		||||
    
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      // the sum over all staples on each site
 | 
			
		||||
      //////////////////////////////////////////////////
 | 
			
		||||
      static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
 | 
			
		||||
	U2 = U * Cshift(U,mu,1);
 | 
			
		||||
      }
 | 
			
		||||
    return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
      ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
      // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
 | 
			
		||||
      // but need to track two deep where cross boundary and apply a conjugation).
 | 
			
		||||
      // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
 | 
			
		||||
      ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
      static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // the sum over all staples on each site
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
 | 
			
		||||
    U2 = U * Cshift(U,mu,1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
	Stap = zero;
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
 | 
			
		||||
  // but need to track two deep where cross boundary and apply a conjugation).
 | 
			
		||||
  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
 | 
			
		||||
 | 
			
		||||
	GridBase *grid = U[0]._grid;
 | 
			
		||||
    Stap = zero;
 | 
			
		||||
 | 
			
		||||
	GaugeMat Staple2x1 (grid);
 | 
			
		||||
	GaugeMat tmp (grid);
 | 
			
		||||
    GridBase *grid = U[0]._grid;
 | 
			
		||||
 | 
			
		||||
	for(int nu=0;nu<Nd;nu++){
 | 
			
		||||
	  if ( nu!=mu) {
 | 
			
		||||
    GaugeMat Staple2x1 (grid);
 | 
			
		||||
    GaugeMat tmp (grid);
 | 
			
		||||
 | 
			
		||||
	    // Up staple    ___ ___ 
 | 
			
		||||
	    //             |       |
 | 
			
		||||
	    tmp = Cshift(adj(U[nu]),nu,-1); 
 | 
			
		||||
	    tmp = adj(U2[mu])*tmp;
 | 
			
		||||
	    tmp = Cshift(tmp,mu,-2);
 | 
			
		||||
    for(int nu=0;nu<Nd;nu++){
 | 
			
		||||
      if ( nu!=mu) {
 | 
			
		||||
 | 
			
		||||
	    Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
 | 
			
		||||
	// Up staple    ___ ___ 
 | 
			
		||||
	//             |       |
 | 
			
		||||
	tmp = Cshift(adj(U[nu]),nu,-1); 
 | 
			
		||||
	tmp = adj(U2[mu])*tmp;
 | 
			
		||||
	tmp = Cshift(tmp,mu,-2);
 | 
			
		||||
 | 
			
		||||
	Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	    // Down staple
 | 
			
		||||
	    //             |___ ___|
 | 
			
		||||
	    //
 | 
			
		||||
	    tmp = adj(U2[mu])*U[nu];
 | 
			
		||||
	    Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
 | 
			
		||||
	// Down staple
 | 
			
		||||
	//             |___ ___|
 | 
			
		||||
	//
 | 
			
		||||
	tmp = adj(U2[mu])*U[nu];
 | 
			
		||||
	Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	    //              ___ ___
 | 
			
		||||
	    //             |    ___|
 | 
			
		||||
	    //             |___ ___|
 | 
			
		||||
	    //
 | 
			
		||||
	//              ___ ___
 | 
			
		||||
	//             |    ___|
 | 
			
		||||
	//             |___ ___|
 | 
			
		||||
	//
 | 
			
		||||
 | 
			
		||||
	    Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
 | 
			
		||||
	Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
 | 
			
		||||
 | 
			
		||||
	    //              ___ ___
 | 
			
		||||
	    //             |___    |
 | 
			
		||||
	    //             |___ ___|
 | 
			
		||||
	    //
 | 
			
		||||
	//              ___ ___
 | 
			
		||||
	//             |___    |
 | 
			
		||||
	//             |___ ___|
 | 
			
		||||
	//
 | 
			
		||||
 | 
			
		||||
	    //	tmp= Staple2x1* Cshift(U[mu],mu,-2);
 | 
			
		||||
	    //	Stap+= Cshift(tmp,mu,1) ;
 | 
			
		||||
	    Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
 | 
			
		||||
	//	tmp= Staple2x1* Cshift(U[mu],mu,-2);
 | 
			
		||||
	//	Stap+= Cshift(tmp,mu,1) ;
 | 
			
		||||
	Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
 | 
			
		||||
 | 
			
		||||
	    //       --    
 | 
			
		||||
	    //      |  |              
 | 
			
		||||
	    //          
 | 
			
		||||
	    //      |  | 
 | 
			
		||||
	//       --    
 | 
			
		||||
	//      |  |              
 | 
			
		||||
	//          
 | 
			
		||||
	//      |  | 
 | 
			
		||||
	
 | 
			
		||||
	    tmp = Cshift(adj(U2[nu]),nu,-2);
 | 
			
		||||
	    tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
 | 
			
		||||
	    tmp = U2[nu]*Cshift(tmp,nu,2);
 | 
			
		||||
	    Stap+= Cshift(tmp, mu, 1);
 | 
			
		||||
	tmp = Cshift(adj(U2[nu]),nu,-2);
 | 
			
		||||
	tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
 | 
			
		||||
	tmp = U2[nu]*Cshift(tmp,nu,2);
 | 
			
		||||
	Stap+= Cshift(tmp, mu, 1);
 | 
			
		||||
 | 
			
		||||
	    //      |  |              
 | 
			
		||||
	    //          
 | 
			
		||||
	    //      |  | 
 | 
			
		||||
	    //       -- 
 | 
			
		||||
	//      |  |              
 | 
			
		||||
	//          
 | 
			
		||||
	//      |  | 
 | 
			
		||||
	//       -- 
 | 
			
		||||
	
 | 
			
		||||
	    tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
 | 
			
		||||
	    tmp = adj(U2[nu])*tmp;
 | 
			
		||||
	    tmp = Cshift(tmp,nu,-2);
 | 
			
		||||
	    Stap+=Cshift(tmp, mu, 1);
 | 
			
		||||
	  }}
 | 
			
		||||
	tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
 | 
			
		||||
	tmp = adj(U2[nu])*tmp;
 | 
			
		||||
	tmp = Cshift(tmp,nu,-2);
 | 
			
		||||
	Stap+=Cshift(tmp, mu, 1);
 | 
			
		||||
    }}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
      static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
 | 
			
		||||
      {
 | 
			
		||||
	RectStapleUnoptimised(Stap,Umu,mu);
 | 
			
		||||
      }
 | 
			
		||||
      static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
 | 
			
		||||
			     std::vector<GaugeMat> &U2,
 | 
			
		||||
			     std::vector<GaugeMat> &U, int mu)
 | 
			
		||||
      {
 | 
			
		||||
	if ( Gimpl::isPeriodicGaugeField() ){ 
 | 
			
		||||
	  RectStapleOptimised(Stap,U2,U,mu);
 | 
			
		||||
	} else {
 | 
			
		||||
	  RectStapleUnoptimised(Stap,Umu,mu);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
  static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
 | 
			
		||||
  {
 | 
			
		||||
    RectStapleUnoptimised(Stap,Umu,mu);
 | 
			
		||||
  }
 | 
			
		||||
  static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
 | 
			
		||||
			 std::vector<GaugeMat> &U2,
 | 
			
		||||
			 std::vector<GaugeMat> &U, int mu)
 | 
			
		||||
  {
 | 
			
		||||
    if ( Gimpl::isPeriodicGaugeField() ){ 
 | 
			
		||||
      RectStapleOptimised(Stap,U2,U,mu);
 | 
			
		||||
    } else {
 | 
			
		||||
      RectStapleUnoptimised(Stap,Umu,mu);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
      static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
 | 
			
		||||
	GridBase *grid = Umu._grid;
 | 
			
		||||
  static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
 | 
			
		||||
    GridBase *grid = Umu._grid;
 | 
			
		||||
 | 
			
		||||
	std::vector<GaugeMat> U(4,grid);
 | 
			
		||||
	for(int d=0;d<Nd;d++){
 | 
			
		||||
	  U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
			
		||||
	}
 | 
			
		||||
    std::vector<GaugeMat> U(Nd,grid);
 | 
			
		||||
    for(int d=0;d<Nd;d++){
 | 
			
		||||
      U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
	Stap=zero;
 | 
			
		||||
    Stap=zero;
 | 
			
		||||
 | 
			
		||||
	for(int nu=0;nu<Nd;nu++){
 | 
			
		||||
	  if ( nu!=mu) {
 | 
			
		||||
	    //           __ ___ 
 | 
			
		||||
	    //          |    __ |
 | 
			
		||||
	    //
 | 
			
		||||
	    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
				      Gimpl::CovShiftForward (U[mu],mu,
 | 
			
		||||
							      Gimpl::CovShiftForward (U[nu],nu,
 | 
			
		||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
													      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
																      Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
			
		||||
    for(int nu=0;nu<Nd;nu++){
 | 
			
		||||
      if ( nu!=mu) {
 | 
			
		||||
    //           __ ___ 
 | 
			
		||||
    //          |    __ |
 | 
			
		||||
    //
 | 
			
		||||
    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
		  Gimpl::CovShiftForward (U[mu],mu,
 | 
			
		||||
		  Gimpl::CovShiftForward (U[nu],nu,
 | 
			
		||||
		  Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
                  Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
		  Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
			
		||||
 | 
			
		||||
	    //              __ 
 | 
			
		||||
	    //          |__ __ |
 | 
			
		||||
    //              __ 
 | 
			
		||||
    //          |__ __ |
 | 
			
		||||
 | 
			
		||||
	    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
				      Gimpl::CovShiftForward (U[mu],mu,
 | 
			
		||||
							      Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
													      Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
 | 
			
		||||
    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
                  Gimpl::CovShiftForward (U[mu],mu,
 | 
			
		||||
		  Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
		  Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
                  Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
 | 
			
		||||
 | 
			
		||||
	    //           __ 
 | 
			
		||||
	    //          |__ __ |
 | 
			
		||||
    //           __ 
 | 
			
		||||
    //          |__ __ |
 | 
			
		||||
 | 
			
		||||
	    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
				      Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
							      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
													      Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
 | 
			
		||||
    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
		  Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
		  Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
		  Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
		  Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
 | 
			
		||||
 | 
			
		||||
	    //           __ ___ 
 | 
			
		||||
	    //          |__    |
 | 
			
		||||
    //           __ ___ 
 | 
			
		||||
    //          |__    |
 | 
			
		||||
 | 
			
		||||
	    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
				      Gimpl::CovShiftForward (U[nu],nu,
 | 
			
		||||
							      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
													      Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
 | 
			
		||||
    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
		   Gimpl::CovShiftForward (U[nu],nu,
 | 
			
		||||
	           Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
                   Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
                   Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
 | 
			
		||||
 | 
			
		||||
	    //       --    
 | 
			
		||||
	    //      |  |              
 | 
			
		||||
	    //          
 | 
			
		||||
	    //      |  | 
 | 
			
		||||
     //       --    
 | 
			
		||||
     //      |  |              
 | 
			
		||||
     //          
 | 
			
		||||
     //      |  | 
 | 
			
		||||
     
 | 
			
		||||
	    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
				      Gimpl::CovShiftForward(U[nu],nu,
 | 
			
		||||
							     Gimpl::CovShiftForward(U[nu],nu,
 | 
			
		||||
										    Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
													    Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
																    Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
			
		||||
    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
		   Gimpl::CovShiftForward(U[nu],nu,
 | 
			
		||||
		   Gimpl::CovShiftForward(U[nu],nu,
 | 
			
		||||
                   Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
                   Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
		   Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	    //      |  |              
 | 
			
		||||
	    //          
 | 
			
		||||
	    //      |  | 
 | 
			
		||||
	    //       -- 
 | 
			
		||||
     //      |  |              
 | 
			
		||||
     //          
 | 
			
		||||
     //      |  | 
 | 
			
		||||
     //       -- 
 | 
			
		||||
     
 | 
			
		||||
	    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
				      Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
							      Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
													      Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
 | 
			
		||||
	  }}
 | 
			
		||||
      }
 | 
			
		||||
    Stap+= Gimpl::ShiftStaple(
 | 
			
		||||
		   Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
		   Gimpl::CovShiftBackward(U[nu],nu,
 | 
			
		||||
                   Gimpl::CovShiftBackward(U[mu],mu,
 | 
			
		||||
                   Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 | 
			
		||||
    typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 | 
			
		||||
    typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 | 
			
		||||
    typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
 | 
			
		||||
 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 | 
			
		||||
 typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 | 
			
		||||
 typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 | 
			
		||||
 typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
 | 
			
		||||
 | 
			
		||||
  }}
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -32,6 +32,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <type_traits>

namespace Grid {
  // helper function to read space-separated values
  template <typename T>
  std::vector<T> strToVec(const std::string s)
  {
    std::istringstream sstr(s);
    T                  buf;
    std::vector<T>     v;
    
    while(!sstr.eof())
    {
      sstr >> buf;
      v.push_back(buf);
    }
    
    return v;
  }
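  // Minimal usage sketch, e.g. for parsing a lattice size string:
  //
  //   std::vector<int> latt = strToVec<int>("8 8 8 16"); // -> {8,8,8,16}
  //
  // Note that the eof()-controlled loop above appends one spurious element if
  // the input ends in whitespace; testing the extraction itself,
  //   while (sstr >> buf) v.push_back(buf);
  // would be the more defensive form.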
  
 | 
			
		||||
  // output to streams for vectors
 | 
			
		||||
  template < class T >
 | 
			
		||||
  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
 | 
			
		||||
  {
 | 
			
		||||
    os << "[";
 | 
			
		||||
    for (auto &x: v)
 | 
			
		||||
    {
 | 
			
		||||
      os << x << " ";
 | 
			
		||||
    }
 | 
			
		||||
    if (v.size() > 0)
 | 
			
		||||
    {
 | 
			
		||||
      os << "\b";
 | 
			
		||||
    }
 | 
			
		||||
    os << "]";
 | 
			
		||||
    
 | 
			
		||||
    return os;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  class Serializable {};
 | 
			
		||||
  
 | 
			
		||||
@@ -138,23 +172,6 @@ namespace Grid {
 | 
			
		||||
    r.read(s, output);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template < class T >
 | 
			
		||||
  inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
 | 
			
		||||
  {
 | 
			
		||||
    os << "[";
 | 
			
		||||
    for (auto &x: v)
 | 
			
		||||
    {
 | 
			
		||||
      os << x << " ";
 | 
			
		||||
    }
 | 
			
		||||
    if (v.size() > 0)
 | 
			
		||||
    {
 | 
			
		||||
      os << "\b";
 | 
			
		||||
    }
 | 
			
		||||
    os << "]";
 | 
			
		||||
    
 | 
			
		||||
    return os;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Writer template implementation ////////////////////////////////////////////
 | 
			
		||||
  template <typename T>
 | 
			
		||||
  Writer<T>::Writer(void)
 | 
			
		||||
 
 | 
			
		||||
@@ -120,7 +120,7 @@ THE SOFTWARE.
  \
  \
  template <typename T>\
  static void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
  static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
    push(WR,s);\
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
    pop(WR);\
@@ -128,14 +128,14 @@ THE SOFTWARE.
  \
  \
  template <typename T>\
  static void read(Reader<T> &RD,const std::string &s, cname &obj){	\
  static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
    push(RD,s);\
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
    pop(RD);\
  } \
  \
  \
  friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
  friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
    os<<"class "<<#cname<<" {"<<std::endl;\
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
      os<<"}";								\
@@ -165,7 +165,7 @@ namespace Grid {
  class EnumIO<name> {\
    public:\
      template <typename T>\
      static void write(Writer<T> &WR,const std::string &s, const name &obj){ \
      static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
        switch (obj) {\
          GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
          default: Grid::write(WR,s,#undefname); break;\
@@ -173,7 +173,7 @@ namespace Grid {
      }\
      \
      template <typename T>\
      static void read(Reader<T> &RD,const std::string &s, name &obj){ \
      static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
        std::string buf;\
        Grid::read(RD, s, buf);\
        if (buf == #undefname) {obj = name::undefname;}\
@@ -182,7 +182,7 @@ namespace Grid {
      }\
  };\
  \
  std::ostream & operator << (std::ostream &os, const name &obj ) { \
  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
    switch (obj) {\
        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
        default: os << #undefname; break;\
 
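The only substantive change in these hunks is the added inline: the macro expansions land in a header that many translation units include, so the generated functions (in particular the namespace-scope enum operator<<) must be inline to satisfy the one-definition rule at link time. A rough, self-contained sketch of the shape of the generated enum streamer, with invented names (the real entry-point macro, Writer/Reader machinery and undefname handling are defined elsewhere in this header):

// Sketch of the expanded enum streamer; TestEnum and its values are made up.
#include <iostream>

enum class TestEnum { red, blue, undef };

inline std::ostream & operator << (std::ostream &os, const TestEnum &obj)
{
  switch (obj) {
  case TestEnum::red:  os << "red";   break;
  case TestEnum::blue: os << "blue";  break;
  default:             os << "undef"; break;
  }
  return os;
}

int main(void)
{
  std::cout << TestEnum::red << std::endl; // prints "red"
  return 0;
}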
@@ -80,6 +80,20 @@ void XmlReader::pop(void)
  node_ = node_.parent();
}

bool XmlReader::nextElement(const std::string &s)
{
  if (node_.next_sibling(s.c_str()))
  {
    node_ = node_.next_sibling(s.c_str());
    
    return true;
  }
  else
  {
    return false;
  }
}

template <>
void XmlReader::readDefault(const string &s, string &output)
{
 
@@ -68,6 +68,7 @@ namespace Grid
    virtual ~XmlReader(void) = default;
    void push(const std::string &s);
    void pop(void);
    bool nextElement(const std::string &s);
    template <typename U>
    void readDefault(const std::string &s, U &output);
    template <typename U>
 
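nextElement() simply tries to advance the pugixml cursor to the next sibling of the same name and reports whether one exists. In raw pugixml terms the iteration pattern it enables looks like the following sketch (the file name, the <grid> root and the <elem> children are invented for the example):

// Sketch only: walk repeated <elem> children the way nextElement() lets
// XmlReader do with its node_ cursor.
#include <pugixml.hpp>
#include <iostream>

int main(void)
{
  pugi::xml_document doc;
  if (!doc.load_file("input.xml")) return 1;

  for (pugi::xml_node n = doc.child("grid").child("elem");
       n; n = n.next_sibling("elem"))
  {
    std::cout << n.child_value() << std::endl; // text content of each <elem>
  }
  return 0;
}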
							
								
								
									
1139	lib/simd/Avx512Asm.h	(file diff suppressed because it is too large)
@@ -410,22 +410,22 @@ namespace Optimization {
  struct Permute{

    static inline __m256 Permute0(__m256 in){
      return _mm256_permute2f128_ps(in,in,0x01);
      return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
    };
    static inline __m256 Permute1(__m256 in){
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
    };
    static inline __m256 Permute2(__m256 in){
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
    };
    static inline __m256 Permute3(__m256 in){
      return in;
    };

    static inline __m256d Permute0(__m256d in){
      return _mm256_permute2f128_pd(in,in,0x01);
      return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
    };
    static inline __m256d Permute1(__m256d in){
    static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
      return _mm256_shuffle_pd(in,in,0x5);
    };
    static inline __m256d Permute2(__m256d in){
@@ -437,6 +437,111 @@ namespace Optimization {

  };

#if defined (AVX2) || defined (AVXFMA4) 
#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif

#if defined (AVX1) 

#define _mm256_alignr_epi32(ret,a,b,n) {	\
    __m128 aa, bb;				\
						\
    aa  = _mm256_extractf128_ps(a,1);		\
    bb  = _mm256_extractf128_ps(b,1);		\
    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);	\
    ret = _mm256_insertf128_ps(ret,aa,1);	\
						\
    aa  = _mm256_extractf128_ps(a,0);		\
    bb  = _mm256_extractf128_ps(b,0);		\
    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);	\
    ret = _mm256_insertf128_ps(ret,aa,0);	\
  }

#define _mm256_alignr_epi64(ret,a,b,n) {	\
    __m128d aa, bb;				\
						\
    aa  = _mm256_extractf128_pd(a,1);		\
    bb  = _mm256_extractf128_pd(b,1);		\
    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);	\
    ret = _mm256_insertf128_pd(ret,aa,1);	\
						\
    aa  = _mm256_extractf128_pd(a,0);		\
    bb  = _mm256_extractf128_pd(b,0);		\
    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);	\
    ret = _mm256_insertf128_pd(ret,aa,0);	\
  }

#endif

    inline std::ostream & operator << (std::ostream& stream, const __m256 a)
    {
      const float *p=(const float *)&a;
      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
      return stream;
    };
    inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
    {
      const double *p=(const double *)&a;
      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
      return stream;
    };

  struct Rotate{

    static inline __m256 rotate(__m256 in,int n){ 
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      case 4: return tRotate<4>(in);break;
      case 5: return tRotate<5>(in);break;
      case 6: return tRotate<6>(in);break;
      case 7: return tRotate<7>(in);break;
      default: assert(0);
      }
    }
    static inline __m256d rotate(__m256d in,int n){ 
      switch(n){
      case 0: return tRotate<0>(in);break;
      case 1: return tRotate<1>(in);break;
      case 2: return tRotate<2>(in);break;
      case 3: return tRotate<3>(in);break;
      default: assert(0);
      }
    }
  
    
    template<int n>
    static inline __m256 tRotate(__m256 in){ 
      __m256 tmp = Permute::Permute0(in);
      __m256 ret;
      if ( n > 3 ) { 
	_mm256_alignr_epi32(ret,in,tmp,n);  
      } else {
        _mm256_alignr_epi32(ret,tmp,in,n);          
      }
      //      std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
      return ret;
    };

    template<int n>
    static inline __m256d tRotate(__m256d in){ 
      __m256d tmp = Permute::Permute0(in);
      __m256d ret;
      if ( n > 1 ) {
	_mm256_alignr_epi64(ret,in,tmp,n);          
      } else {
        _mm256_alignr_epi64(ret,tmp,in,n);          
      }
      //      std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
      return ret;
    };

  };



  //Complex float Reduce
  template<>
 
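AVX has no single instruction that rotates all eight 32-bit lanes across the 128-bit boundary, so tRotate builds the rotation from a half-vector swap (Permute0) plus the per-lane alignr macros defined above, choosing the operand order according to whether the distance crosses the half-vector boundary. The intended result, expressed as a plain-array reference model (a checking sketch, not Grid code):

// Reference model of Rotate::rotate(in,n) for the 8-lane single-precision
// case: lane i of the output holds lane (i+n) mod 8 of the input.
#include <array>
#include <cassert>

static std::array<float,8> rotate_ref(const std::array<float,8> &in, int n)
{
  assert(n >= 0 && n < 8);
  std::array<float,8> out;
  for (int i = 0; i < 8; ++i) out[i] = in[(i + n) % 8];
  return out;
}

int main(void)
{
  std::array<float,8> v = {0,1,2,3,4,5,6,7};
  std::array<float,8> r = rotate_ref(v, 3);   // expect 3,4,5,6,7,0,1,2
  assert(r[0] == 3 && r[7] == 2);
  return 0;
}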
@@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <immintrin.h>



namespace Grid{
namespace Optimization {
  
  struct Vsplat{
@@ -246,26 +246,30 @@ namespace Optimization {
  struct TimesMinusI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));   // 0x4E??
      //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
      //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
      return _mm512_shuffle_pd(tmp,tmp,0x55);
      //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
      //return _mm512_shuffle_pd(tmp,tmp,0x55);
      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
    } 
  };

  struct TimesI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); 
      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); 
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); 
      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); 
    }


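The rewritten kernels swap first and negate second, which also fixes the TimesI paths that previously shuffled an uninitialised tmp. On interleaved (re,im) lane pairs the two operations amount to the scalar model below (illustration only): multiplying by i maps (re,im) to (-im,re), so after the swap the mask must hit the even, real slots (0x5555 / 0x55), while multiplying by -i maps (re,im) to (im,-re) and keeps the odd-slot mask (0xaaaa / 0xaa).

// Scalar model of TimesI / TimesMinusI on a flat array of re,im pairs.
#include <vector>
#include <cstddef>

static void timesI_ref(std::vector<float> &v)        // v = re0,im0,re1,im1,...
{
  for (std::size_t p = 0; p + 1 < v.size(); p += 2) {
    float re = v[p], im = v[p+1];
    v[p]   = -im;   // even (real) slots negated -> mask 0x5555 after the swap
    v[p+1] =  re;
  }
}

static void timesMinusI_ref(std::vector<float> &v)
{
  for (std::size_t p = 0; p + 1 < v.size(); p += 2) {
    float re = v[p], im = v[p+1];
    v[p]   =  im;
    v[p+1] = -re;   // odd (imaginary) slots negated -> mask 0xaaaa
  }
}

int main(void) { std::vector<float> v = {1,2}; timesI_ref(v); timesMinusI_ref(v); return 0; }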
@@ -305,6 +309,54 @@ namespace Optimization {
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m512 rotate(__m512 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
 | 
			
		||||
      case 8 : return tRotate<8>(in);break;
 | 
			
		||||
      case 9 : return tRotate<9>(in);break;
 | 
			
		||||
      case 10: return tRotate<10>(in);break;
 | 
			
		||||
      case 11: return tRotate<11>(in);break;
 | 
			
		||||
      case 12: return tRotate<12>(in);break;
 | 
			
		||||
      case 13: return tRotate<13>(in);break;
 | 
			
		||||
      case 14: return tRotate<14>(in);break;
 | 
			
		||||
      case 15: return tRotate<15>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m512d rotate(__m512d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512 tRotate(__m512 in){ 
 | 
			
		||||
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512d tRotate(__m512d in){ 
 | 
			
		||||
      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
  
 | 
			
		||||
@@ -345,7 +397,7 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef __m512d SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef __m512i SIMD_Itype; // Integer type
 | 
			
		||||
 
 | 
			
		||||
@@ -35,6 +35,7 @@ Author: neo <cossu@post.kek.jp>
 | 
			
		||||
// Time-stamp: <2015-06-09 14:28:02 neo>
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  template<class vtype>
 | 
			
		||||
@@ -54,51 +55,67 @@ namespace Optimization {
 | 
			
		||||
  
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
    //Complex float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(float a, float b){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = b;
 | 
			
		||||
      out.f[2] = a;
 | 
			
		||||
      out.f[3] = b;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float operator()(float a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(float a){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = a;
 | 
			
		||||
      out.f[2] = a;
 | 
			
		||||
      out.f[3] = a;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(double a, double b){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = b;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Real double
 | 
			
		||||
    inline double operator()(double a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(double a){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a;
 | 
			
		||||
      out.f[1] = a;
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(Integer a){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstore{
 | 
			
		||||
    //Float 
 | 
			
		||||
    inline void operator()(float a, float* F){
 | 
			
		||||
      
 | 
			
		||||
    inline void operator()(u128f a, float* F){
 | 
			
		||||
      memcpy(F,a.f,4*sizeof(float));
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double a, double* D){
 | 
			
		||||
     
 | 
			
		||||
    inline void operator()(u128d a, double* D){
 | 
			
		||||
      memcpy(D,a.f,2*sizeof(double));
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline void operator()(int a, Integer* I){
 | 
			
		||||
      
 | 
			
		||||
      I[0] = a;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstream{
 | 
			
		||||
    //Float
 | 
			
		||||
    inline void operator()(float * a, float b){
 | 
			
		||||
     
 | 
			
		||||
    inline void operator()(float * a, u128f b){
 | 
			
		||||
      memcpy(a,b.f,4*sizeof(float));
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double * a, double b){
 | 
			
		||||
     
 | 
			
		||||
    inline void operator()(double * a, u128d b){
 | 
			
		||||
      memcpy(a,b.f,2*sizeof(double));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -106,24 +123,40 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  struct Vset{
 | 
			
		||||
    // Complex float 
 | 
			
		||||
    inline float operator()(Grid::ComplexF *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(Grid::ComplexF *a){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a[0].real();
 | 
			
		||||
      out.f[1] = a[0].imag();
 | 
			
		||||
      out.f[2] = a[1].real();
 | 
			
		||||
      out.f[3] = a[1].imag();
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double 
 | 
			
		||||
    inline double operator()(Grid::ComplexD *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(Grid::ComplexD *a){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a[0].real();
 | 
			
		||||
      out.f[1] = a[0].imag();
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real float 
 | 
			
		||||
    inline float operator()(float *a){
 | 
			
		||||
      return  0;
 | 
			
		||||
    inline u128f operator()(float *a){
 | 
			
		||||
      u128f out; 
 | 
			
		||||
      out.f[0] = a[0];
 | 
			
		||||
      out.f[1] = a[1];
 | 
			
		||||
      out.f[2] = a[2];
 | 
			
		||||
      out.f[3] = a[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline double operator()(double *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(double *a){
 | 
			
		||||
      u128d out; 
 | 
			
		||||
      out.f[0] = a[0];
 | 
			
		||||
      out.f[1] = a[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline int operator()(Integer *a){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a[0];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -145,130 +178,279 @@ namespace Optimization {
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  struct Sum{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0] + b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] + b.f[1];
 | 
			
		||||
      out.f[2] = a.f[2] + b.f[2];
 | 
			
		||||
      out.f[3] = a.f[3] + b.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0] + b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] + b.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a + b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Sub{
 | 
			
		||||
    //Complex/Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0] - b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] - b.f[1];
 | 
			
		||||
      out.f[2] = a.f[2] - b.f[2];
 | 
			
		||||
      out.f[3] = a.f[3] - b.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex/Real double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0] - b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1] - b.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a-b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
 | 
			
		||||
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
 | 
			
		||||
      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
 | 
			
		||||
      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
 | 
			
		||||
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Mult{
 | 
			
		||||
    inline float  mac(float a, float b,double c){
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
    inline double mac(double a, double b,double c){
 | 
			
		||||
      return 0;
 | 
			
		||||
    }
 | 
			
		||||
    //CK: Appear unneeded
 | 
			
		||||
    // inline float  mac(float a, float b,double c){
 | 
			
		||||
    //   return 0;
 | 
			
		||||
    // }
 | 
			
		||||
    // inline double mac(double a, double b,double c){
 | 
			
		||||
    //   return 0;
 | 
			
		||||
    // }
 | 
			
		||||
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float operator()(float a, float b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f a, u128f b){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1]*b.f[1];
 | 
			
		||||
      out.f[2] = a.f[2]*b.f[2];
 | 
			
		||||
      out.f[3] = a.f[3]*b.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline double operator()(double a, double b){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d a, u128d b){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = a.f[0]*b.f[0];
 | 
			
		||||
      out.f[1] = a.f[1]*b.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    inline int operator()(int a, int b){
 | 
			
		||||
      return 0;
 | 
			
		||||
      return a*b;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Conj{
 | 
			
		||||
    // Complex single
 | 
			
		||||
    inline float operator()(float in){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f in){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[0];
 | 
			
		||||
      out.f[1] = -in.f[1];
 | 
			
		||||
      out.f[2] = in.f[2];
 | 
			
		||||
      out.f[3] = -in.f[3];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline double operator()(double in){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d in){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = in.f[0];
 | 
			
		||||
      out.f[1] = -in.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    // do not define for integer input
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesMinusI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float operator()(float in, float ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = -in.f[0];
 | 
			
		||||
      out.f[2] = in.f[3];
 | 
			
		||||
      out.f[3] = -in.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline double operator()(double in, double ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d in, u128d ret){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = -in.f[0];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float operator()(float in, float ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = -in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      out.f[2] = -in.f[3];
 | 
			
		||||
      out.f[3] = in.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline double operator()(double in, double ret){
 | 
			
		||||
      return 0;
 | 
			
		||||
    inline u128d operator()(u128d in, u128d ret){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = -in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
  struct Permute{
 | 
			
		||||
    //We just have to mirror the permutes of Grid_sse4.h
 | 
			
		||||
    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[2];
 | 
			
		||||
      out.f[1] = in.f[3];
 | 
			
		||||
      out.f[2] = in.f[0];
 | 
			
		||||
      out.f[3] = in.f[1];
 | 
			
		||||
      return out;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
 | 
			
		||||
      u128f out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      out.f[2] = in.f[3];
 | 
			
		||||
      out.f[3] = in.f[2];
 | 
			
		||||
      return out;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128f Permute2(u128f in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128f Permute3(u128f in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline u128d Permute0(u128d in){ //AB -> BA
 | 
			
		||||
      u128d out;
 | 
			
		||||
      out.f[0] = in.f[1];
 | 
			
		||||
      out.f[1] = in.f[0];
 | 
			
		||||
      return out;      
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128d Permute1(u128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128d Permute2(u128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline u128d Permute3(u128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  template < typename vtype > 
 | 
			
		||||
    void permute(vtype &a, vtype b, int perm) {
 | 
			
		||||
   };
 | 
			
		||||
    
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline u128f rotate(u128f in,int n){
 | 
			
		||||
      u128f out;
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0:
 | 
			
		||||
        out.f[0] = in.f[0];
 | 
			
		||||
        out.f[1] = in.f[1];
 | 
			
		||||
        out.f[2] = in.f[2];
 | 
			
		||||
        out.f[3] = in.f[3];
 | 
			
		||||
        break;
 | 
			
		||||
      case 1:
 | 
			
		||||
        out.f[0] = in.f[1];
 | 
			
		||||
        out.f[1] = in.f[2];
 | 
			
		||||
        out.f[2] = in.f[3];
 | 
			
		||||
        out.f[3] = in.f[0];
 | 
			
		||||
        break;
 | 
			
		||||
      case 2:
 | 
			
		||||
        out.f[0] = in.f[2];
 | 
			
		||||
        out.f[1] = in.f[3];
 | 
			
		||||
        out.f[2] = in.f[0];
 | 
			
		||||
        out.f[3] = in.f[1];
 | 
			
		||||
        break;
 | 
			
		||||
      case 3:
 | 
			
		||||
        out.f[0] = in.f[3];
 | 
			
		||||
        out.f[1] = in.f[0];
 | 
			
		||||
        out.f[2] = in.f[1];
 | 
			
		||||
        out.f[3] = in.f[2];
 | 
			
		||||
        break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
    static inline u128d rotate(u128d in,int n){
 | 
			
		||||
      u128d out;
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0:
 | 
			
		||||
        out.f[0] = in.f[0];
 | 
			
		||||
        out.f[1] = in.f[1];
 | 
			
		||||
        break;
 | 
			
		||||
      case 1:
 | 
			
		||||
        out.f[0] = in.f[1];
 | 
			
		||||
        out.f[1] = in.f[0];
 | 
			
		||||
        break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
      return out;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
 | 
			
		||||
    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
 | 
			
		||||
  }
 | 
			
		||||
  //Real float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
 | 
			
		||||
    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  //Complex double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
 | 
			
		||||
    return Grid::ComplexD(in.f[0],in.f[1]);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //Real double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
 | 
			
		||||
    return 0;
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
 | 
			
		||||
    return in.f[0] + in.f[1];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
@@ -282,10 +464,9 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef float SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef double SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef Optimization::u128d SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef int SIMD_Itype; // Integer type
 | 
			
		||||
 | 
			
		||||
  // prefetch utilities
 | 
			
		||||
 
 | 
			
		||||
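Throughout the generic implementation above the vector payloads are the u128f/u128d types, whose definitions sit outside these hunks; from the way the .f arrays are indexed they are presumably plain 128-bit wrappers along the lines of the sketch below (an assumption for readability, not the actual Grid definition):

// Presumed shape of the generic 128-bit vector types used above (sketch only;
// the real definitions live elsewhere in the generic SIMD header).
namespace Grid {
namespace Optimization {
  struct u128f { float  f[4]; };   // four single-precision lanes, .f[0..3]
  struct u128d { double f[2]; };   // two double-precision lanes, .f[0..1]
}}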
@@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
#include <zmmintrin.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid{
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
  
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
@@ -316,6 +318,54 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m512 rotate(__m512 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
 | 
			
		||||
      case 8 : return tRotate<8>(in);break;
 | 
			
		||||
      case 9 : return tRotate<9>(in);break;
 | 
			
		||||
      case 10: return tRotate<10>(in);break;
 | 
			
		||||
      case 11: return tRotate<11>(in);break;
 | 
			
		||||
      case 12: return tRotate<12>(in);break;
 | 
			
		||||
      case 13: return tRotate<13>(in);break;
 | 
			
		||||
      case 14: return tRotate<14>(in);break;
 | 
			
		||||
      case 15: return tRotate<15>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m512d rotate(__m512d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      case 4: return tRotate<4>(in);break;
 | 
			
		||||
      case 5: return tRotate<5>(in);break;
 | 
			
		||||
      case 6: return tRotate<6>(in);break;
 | 
			
		||||
      case 7: return tRotate<7>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512 tRotate(__m512 in){ 
 | 
			
		||||
      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<int n> static inline __m512d tRotate(__m512d in){ 
 | 
			
		||||
      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);          
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
@@ -358,7 +408,7 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef __m512d SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef __m512i SIMD_Itype; // Integer type
 | 
			
		||||
 
 | 
			
		||||
@@ -267,10 +267,10 @@ namespace Optimization {
 | 
			
		||||
  struct Permute{
 | 
			
		||||
 | 
			
		||||
    static inline __m128 Permute0(__m128 in){
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m128 Permute1(__m128 in){
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
			
		||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m128 Permute2(__m128 in){
 | 
			
		||||
      return in;
 | 
			
		||||
@@ -279,7 +279,7 @@ namespace Optimization {
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline __m128d Permute0(__m128d in){
 | 
			
		||||
    static inline __m128d Permute0(__m128d in){ //AB -> BA
 | 
			
		||||
      return _mm_shuffle_pd(in,in,0x1);
 | 
			
		||||
    };
 | 
			
		||||
    static inline __m128d Permute1(__m128d in){
 | 
			
		||||
@@ -294,6 +294,32 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline __m128 rotate(__m128 in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      case 2: return tRotate<2>(in);break;
 | 
			
		||||
      case 3: return tRotate<3>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline __m128d rotate(__m128d in,int n){ 
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: return tRotate<0>(in);break;
 | 
			
		||||
      case 1: return tRotate<1>(in);break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  
 | 
			
		||||
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 | 
			
		||||
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 | 
			
		||||
    
 | 
			
		||||
    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
 | 
			
		||||
    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -299,16 +299,44 @@ namespace Grid {
    }
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
    {
      if      (perm==3) permute3(y,b);
      else if (perm==2) permute2(y,b);
      else if (perm==1) permute1(y,b);
      else if (perm==0) permute0(y,b);
      if ( perm & RotateBit ) {
	int dist = perm&0xF;
        y=rotate(b,dist);
	return;
      }
      switch(perm){
      case 3: permute3(y,b); break;
      case 2: permute2(y,b); break;
      case 1: permute1(y,b); break;
      case 0: permute0(y,b); break;
      default: assert(0);
      }
    }
    

    
  };// end of Grid_simd class definition 

  ////////////////////////////////////////////////////////////////////
  // General rotate
  ////////////////////////////////////////////////////////////////////
  template <class S, class V, IfNotComplex<S> =0> 
  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
  {
    nrot = nrot % Grid_simd<S,V>::Nsimd();
    Grid_simd<S,V> ret;
    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
    ret.v = Optimization::Rotate::rotate(b.v,nrot);
    return ret;
  }
  template <class S, class V, IfComplex<S> =0> 
  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
  {
    nrot = nrot % Grid_simd<S,V>::Nsimd();
    Grid_simd<S,V> ret;
    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
    return ret;
  }

  ///////////////////////
  // Splat
  ///////////////////////
@@ -339,6 +367,9 @@ namespace Grid {
  template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)     { vsplat(ret,S(0.0,0.0)); }// use xor?
  template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} 

  template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));} 
  template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));} 

  // if not complex overload here 
  template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
  template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
 
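With this change permute() first checks a rotation flag in perm and dispatches to rotate() with the low bits as the distance, falling back to the original permuteN cases otherwise; the complex overload of the free rotate() doubles nrot so that whole (re,im) pairs move together. A scalar reference for what the complex rotation is expected to produce (sketch only; the RotateBit flag itself is defined elsewhere in Grid):

// Reference semantics of rotating Nsimd complex numbers by nrot: element i of
// the result is element (i+nrot) mod Nsimd of the input.  Doubling the lane
// distance for complex types keeps the (re,im) pairs intact.
#include <complex>
#include <vector>

static std::vector<std::complex<float>>
rotate_complex_ref(const std::vector<std::complex<float>> &in, int nrot)
{
  const int N = static_cast<int>(in.size());
  std::vector<std::complex<float>> out(N);
  for (int i = 0; i < N; ++i) out[i] = in[(i + nrot) % N];
  return out;
}

int main(void)
{
  std::vector<std::complex<float>> v = {{0,0},{1,1},{2,2},{3,3}};
  auto r = rotate_complex_ref(v, 1);  // expect {1,1},{2,2},{3,3},{0,0}
  return (r[0] == std::complex<float>(1,1)) ? 0 : 1;
}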
							
								
								
									
197	lib/simd/Intel512avx.h	(new file)
@@ -0,0 +1,197 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Avx512Asm.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_ASM_AV512_H
 | 
			
		||||
#define GRID_ASM_AV512_H
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////	  
 | 
			
		||||
// Knights Landing specials
 | 
			
		||||
////////////////////////////////////////////////////////////	  
 | 
			
		||||
 | 
			
		||||
#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
			
		||||
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
			
		||||
 | 
			
		||||
#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
 | 
			
		||||
#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
 | 
			
		||||
 | 
			
		||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
 | 
			
		||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
 | 
			
		||||
 | 
			
		||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
 | 
			
		||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
 | 
			
		||||
 | 
			
		||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
			
		||||
  VSHUFMEMf(O,P,tmp) \
 | 
			
		||||
  VMULMEMf(O,P,B,Biirr) \
 | 
			
		||||
  VMULMEMf(O,P,C,Ciirr) \
 | 
			
		||||
  VMULf(tmp,B,Briir) \
 | 
			
		||||
  VMULf(tmp,C,Criir)
 | 
			
		||||
 | 
			
		||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
			
		||||
  VSHUFMEMd(O,P,tmp)  \
 | 
			
		||||
  VMULMEMd(O,P,B,Biirr)  \ 
 | 
			
		||||
  VMULMEMd(O,P,C,Ciirr)  \
 | 
			
		||||
  VMULd(tmp,B,Briir)  \
 | 
			
		||||
  VMULd(tmp,C,Criir) 
 | 
			
		||||
 | 
			
		||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
			
		||||
  VSHUFMEMf(O,P,tmp) \
 | 
			
		||||
  VMADDMEMf(O,P,B,Biirr) \
 | 
			
		||||
  VMADDMEMf(O,P,C,Ciirr) \
 | 
			
		||||
  VMADDf(tmp,B,Briir) \
 | 
			
		||||
  VMADDf(tmp,C,Criir)
 | 
			
		||||
 | 
			
		||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
 | 
			
		||||
  VSHUFMEMd(O,P,tmp) \
 | 
			
		||||
  VMADDMEMd(O,P,B,Biirr) \
 | 
			
		||||
  VMADDMEMd(O,P,C,Ciirr) \
 | 
			
		||||
  VMADDd(tmp,B,Briir) \
 | 
			
		||||
  VMADDd(tmp,C,Criir)
 | 
			
		||||
 | 
			
		||||
// Merges accumulation for complex dot chain; less efficient under avx512
 | 
			
		||||
#define ZEND1f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Criir "," #Criir "," #tmp   ";\n"\
 | 
			
		||||
                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
 | 
			
		||||
                                  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\ 
 | 
			
		||||
                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\
 | 
			
		||||
                         	  "vsubpd  " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
 | 
			
		||||
 | 
			
		||||
#define VMOVRDUPd(OFF,A,DEST)       "vpshufd  $0x44," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
 | 
			
		||||
#define VMOVIDUPd(OFF,A,DEST)       "vpshufd  $0xee," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 3,2,3,2
 | 
			
		||||
#define VMOVRDUPf(OFF,PTR,DEST)         "vmovsldup " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VMOVIDUPf(OFF,PTR,DEST)         "vmovshdup " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VRDUPd(SRC,DEST)       "vpshufd  $0x44," #SRC"," #DEST  ";\n" // 32 bit level: 1,0,3,2
 | 
			
		||||
#define VRDUPf(SRC,DEST)         "vmovsldup " #SRC ", " #DEST  ";\n"
 | 
			
		||||
#define VIDUPd(SRC,DEST)       "vpshufd  $0xee," #SRC"," #DEST  ";\n" // 32 bit level: 3,2,3,2
 | 
			
		||||
#define VIDUPf(SRC,DEST)         "vmovshdup " #SRC ", " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VBCASTRDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST  ";\n" 
 | 
			
		||||
#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
 | 
			
		||||
#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd   " #A "," #B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps   " #O"*64("#P "),"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd   " #O"*64("#P "),"#B "," #accum  ";\n"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULRDUPf(O,P,B,accum) "vmulps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULIDUPf(O,P,B,accum) "vmulps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULRDUPd(O,P,B,accum) "vmulpd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
#define VMULIDUPd(O,P,B,accum) "vmulpd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 | 
			
		||||
  /*
 | 
			
		||||
   * TimesI is used only in the XP recon
 | 
			
		||||
   * Could zero the regs and use RECON_ACCUM
 | 
			
		||||
   */
 | 
			
		||||
 | 
			
		||||
#define VTIMESI0f(A,DEST, Z)   VSHUFf(A,DEST)	  
 | 
			
		||||
#define VTIMESI1f(A,DEST, Z)   "vaddps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
#define VTIMESI2f(A,DEST, Z)   "vsubps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESI0d(A,DEST, Z)   VSHUFd(A,DEST)	 
 | 
			
		||||
#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESMINUSI0f(A,DEST,Z)  VSHUFf(A,DEST)					
 | 
			
		||||
#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESMINUSI0d(A,DEST,Z)  VSHUFd(A,DEST)					
 | 
			
		||||
#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)					
 | 
			
		||||
#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)					
 | 
			
		||||
#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define  VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)	
 | 
			
		||||
#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define  VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)	
 | 
			
		||||
#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
// o_p must point to floating 1.0f/d
 | 
			
		||||
//
 | 
			
		||||
// Ai, Ar -> tmp (r i)
 | 
			
		||||
// tmp *1.0 
 | 
			
		||||
// ACC i - Ar ; ACC r + Ai
 | 
			
		||||
#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)					
 | 
			
		||||
#define VACCTIMESMINUSI1f(A,ACC,tmp)  VMADDMEMf(1,%r10,tmp,ACC)
 | 
			
		||||
#define VACCTIMESMINUSI2f(A,ACC,tmp)  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)					
 | 
			
		||||
#define VACCTIMESMINUSI1d(A,ACC,tmp)  VMADDMEMd(1,%r10,tmp,ACC)  
 | 
			
		||||
#define VACCTIMESMINUSI2d(A,ACC,tmp)
 | 
			
		||||
 | 
			
		||||
// Ai, Ar -> tmp (r i)
 | 
			
		||||
// tmp *1.0 
 | 
			
		||||
// ACC i + Ar ; ACC r - Ai
 | 
			
		||||
#define  VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)	
 | 
			
		||||
#define  VACCTIMESI1f(A,ACC,tmp)  VMADDMEMf(0,%r10,tmp,ACC)  
 | 
			
		||||
#define  VACCTIMESI2f(A,ACC,tmp)
 | 
			
		||||
 | 
			
		||||
#define  VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)	
 | 
			
		||||
#define  VACCTIMESI1d(A,ACC,tmp)  VMADDMEMd(0,%r10,tmp,ACC)  
 | 
			
		||||
#define  VACCTIMESI2d(A,ACC,tmp)
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define VPERM0f(A,B) "vshuff32x4  $0x4e," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM1f(A,B) "vshuff32x4  $0xb1," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM2f(A,B) "vshufps     $0x4e," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM3f(A,B) "vshufps     $0xb1," #A "," #B "," #B ";\n"
 | 
			
		||||
 | 
			
		||||
#define VPERM0d(A,B) "vshuff64x2  $0x4e," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM1d(A,B) "vshuff64x2  $0xb1," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM2d(A,B) "vshufpd     $0x55," #A "," #B "," #B ";\n"
 | 
			
		||||
#define VPERM3d(A,B) VMOVd(A,B)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
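These Intel512avx.h additions are pure preprocessor string-pasting: each macro emits one line of AT&T-syntax assembly text that hand-written kernels concatenate into a single __asm__ block. A quick self-contained check of the idea (the register names are arbitrary examples; VMADDSUBf is copied from the header above):

// Prints the text a kernel would embed in its __asm__ block for one
// fused multiply-add/sub.
#include <cstdio>

#define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"

int main(void)
{
  std::puts(VMADDSUBf(%zmm0, %zmm1, %zmm2));
  // -> vfmaddsub231ps   %zmm0,%zmm1,%zmm2  ;
  return 0;
}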
							
								
								
									
141	lib/simd/Intel512common.h	(new file)
@@ -0,0 +1,141 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Avx512Asm.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_ASM_INTEL_COMMON_512_H
 | 
			
		||||
#define GRID_ASM_INTEL_COMMON_512_H
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Opcodes common 
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#define MASK_REGS \
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax \n"\ 
 | 
			
		||||
           "kmovw    %%eax, %%k6 \n"\
 | 
			
		||||
           "mov     $0x5555, %%eax \n"\
 | 
			
		||||
           "kmovw    %%eax, %%k7 \n" : : : "%eax");
 | 
			
		||||
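A hedged note on the mask constants (my reading of the code, not a comment from the commit): 0xAAAA and 0x5555 are complementary alternating-lane masks, so the {%k6} and {%k7} write-masks used by the macros below each update only one member of every interleaved re/im pair.

// Hypothetical compile-time illustration, not part of the header:
constexpr unsigned kOddLanes  = 0xAAAA;  // loaded into %k6 above
constexpr unsigned kEvenLanes = 0x5555;  // loaded into %k7 above
static_assert((kOddLanes ^ kEvenLanes) == 0xFFFF, "masks partition the 16 lanes");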
 | 
			
		||||
#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
			
		||||
#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESIf(A,DEST, Z) \
 | 
			
		||||
  VTIMESI0f(A,DEST, Z) \
 | 
			
		||||
  VTIMESI1f(A,DEST, Z) \
 | 
			
		||||
  VTIMESI2f(A,DEST, Z) 
 | 
			
		||||
 | 
			
		||||
#define VTIMESId(A,DEST, Z) \
 | 
			
		||||
  VTIMESI0d(A,DEST, Z) \
 | 
			
		||||
  VTIMESI1d(A,DEST, Z) \
 | 
			
		||||
  VTIMESI2d(A,DEST, Z) 
 | 
			
		||||
 | 
			
		||||
#define VTIMESMINUSIf(A,DEST, Z) \
 | 
			
		||||
        VTIMESMINUSI0f(A,DEST, Z) \
 | 
			
		||||
        VTIMESMINUSI1f(A,DEST, Z) \
 | 
			
		||||
        VTIMESMINUSI2f(A,DEST, Z) 
 | 
			
		||||
 | 
			
		||||
#define VTIMESMINUSId(A,DEST, Z) \
 | 
			
		||||
        VTIMESMINUSI0d(A,DEST, Z) \
 | 
			
		||||
        VTIMESMINUSI1d(A,DEST, Z) \
 | 
			
		||||
        VTIMESMINUSI2d(A,DEST, Z) 
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESIf(A,ACC,tmp)			\
 | 
			
		||||
 VACCTIMESI0f(A,ACC,tmp)			\
 | 
			
		||||
 VACCTIMESI1f(A,ACC,tmp)			\
 | 
			
		||||
 VACCTIMESI2f(A,ACC,tmp)			
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESId(A,ACC,tmp)			\
 | 
			
		||||
 VACCTIMESI0d(A,ACC,tmp)			\
 | 
			
		||||
 VACCTIMESI1d(A,ACC,tmp)			\
 | 
			
		||||
 VACCTIMESI2d(A,ACC,tmp)			
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESMINUSIf(A,ACC,tmp)			\
 | 
			
		||||
  VACCTIMESMINUSI0f(A,ACC,tmp)				\
 | 
			
		||||
  VACCTIMESMINUSI1f(A,ACC,tmp)				\
 | 
			
		||||
  VACCTIMESMINUSI2f(A,ACC,tmp)			
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESMINUSId(A,ACC,tmp)			\
 | 
			
		||||
  VACCTIMESMINUSI0d(A,ACC,tmp)				\
 | 
			
		||||
  VACCTIMESMINUSI1d(A,ACC,tmp)				\
 | 
			
		||||
  VACCTIMESMINUSI2d(A,ACC,tmp)			
 | 
			
		||||
 | 
			
		||||
#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
 | 
			
		||||
#define LOAD64(A,ptr)  LOAD64i(A,ptr)
 | 
			
		||||
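LOAD64 simply moves a C++ pointer into a fixed general-purpose register so the extended-asm blocks that follow can address memory relative to it. A hedged usage sketch in the style of the LOAD_*/SAVE_* macros later in this commit (the pointer name p is a placeholder):

// double *p = ...;               // hypothetical base pointer, 64-byte aligned
// LOAD64(%r8, p);                // %r8 <- p
// __asm__ ( VLOAD(0,%r8,Chi_00)  // element 0 from   0(%r8)
//           VLOAD(1,%r8,Chi_01)  // element 1 from  64(%r8)
//         );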
 | 
			
		||||
#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 | 
			
		||||
#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
 | 
			
		||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
 | 
			
		||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 | 
			
		||||
#define VEVICT(O,A)   
 | 
			
		||||
 | 
			
		||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 | 
			
		||||
//  "clevict0 "#O"*64("#A");\n" 
 | 
			
		||||
 | 
			
		||||
#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
#define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VSUBf(A,B,DEST)        "vsubps   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
#define VSUBd(A,B,DEST)        "vsubpd   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VADDMEMf(O,A,B,DEST)        "vaddps   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
			
		||||
#define VADDMEMd(O,A,B,DEST)        "vaddpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VSUBMEMf(O,A,B,DEST)        "vsubps   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
			
		||||
#define VSUBMEMd(O,A,B,DEST)        "vsubpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMULf(A,B,DEST)        "vmulps   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
#define VMULd(A,B,DEST)        "vmulpd   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMADDf(A,B,DEST)       "vfmadd231ps   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
#define VMADDd(A,B,DEST)       "vfmadd231pd   " #A "," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMULMEMf(O,A,B,DEST)   "vmulps   " #O"*64("#A ")," #B "," #DEST  ";\n"
 | 
			
		||||
#define VMULMEMd(O,A,B,DEST)   "vmulpd   " #O"*64("#A ")," #B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VMADDMEMf(O,A,B,DEST)       "vfmadd231ps   " #O"*64("#A "),"#B "," #DEST  ";\n"
 | 
			
		||||
#define VMADDMEMd(O,A,B,DEST)       "vfmadd231pd   " #O"*64("#A "),"#B "," #DEST  ";\n"
 | 
			
		||||
 | 
			
		||||
#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
			
		||||
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
			
		||||
 | 
			
		||||
#define VPREFETCHNTA(O,A) 
 | 
			
		||||
#define VPREFETCH(O,A)    
 | 
			
		||||
 | 
			
		||||
#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
			
		||||
 | 
			
		||||
// Swaps Re/Im ; could unify this with IMCI
 | 
			
		||||
#define VSHUFd(A,DEST)         "vpshufd  $0x4e," #A "," #DEST  ";\n"    
 | 
			
		||||
#define VSHUFf(A,DEST)         "vpshufd  $0xb1," #A "," #DEST  ";\n"    
 | 
			
		||||
#define VSHUFMEMd(OFF,A,DEST)  "vpshufd  $0x4e, " #OFF"*64("#A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
 | 
			
		||||
#define VSHUFMEMf(OFF,A,DEST)  "vpshufd  $0xb1, " #OFF"*64("#A ")," #DEST  ";\n" // 32 bit level: 2,3,0,1
 | 
			
		||||
 | 
			
		||||
#define TRAP " int3 ;\n"
 | 
			
		||||
 | 
			
		||||
#endif

154  lib/simd/Intel512double.h  Normal file
@@ -0,0 +1,154 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Avx512Asm.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
// No include guard: this header may be included multiple times; the macros are cleared with #undef before being redefined
 | 
			
		||||
#undef VZERO
 | 
			
		||||
#undef VMOV
 | 
			
		||||
#undef VLOAD
 | 
			
		||||
#undef VSTORE
 | 
			
		||||
#define VZERO(A)                  VZEROd(A)
 | 
			
		||||
#define VMOV(A,B)                 VMOVd(A,B)
 | 
			
		||||
#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 | 
			
		||||
#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 | 
			
		||||
 | 
			
		||||
#undef VADD
 | 
			
		||||
#undef VSUB
 | 
			
		||||
#undef VMUL
 | 
			
		||||
#undef VMADD
 | 
			
		||||
#define VADD(A,B,C)               VADDd(A,B,C)
 | 
			
		||||
#define VSUB(A,B,C)               VSUBd(A,B,C)
 | 
			
		||||
#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
 | 
			
		||||
#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef VTIMESI
 | 
			
		||||
#undef VTIMESI0 
 | 
			
		||||
#undef VTIMESI1
 | 
			
		||||
#undef VTIMESI2 
 | 
			
		||||
#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
 | 
			
		||||
#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
 | 
			
		||||
#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
 | 
			
		||||
#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VTIMESMINUSI
 | 
			
		||||
#undef VTIMESMINUSI0
 | 
			
		||||
#undef VTIMESMINUSI1
 | 
			
		||||
#undef VTIMESMINUSI2
 | 
			
		||||
#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI
 | 
			
		||||
#undef VACCTIMESI0
 | 
			
		||||
#undef VACCTIMESI1
 | 
			
		||||
#undef VACCTIMESI2
 | 
			
		||||
#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
 | 
			
		||||
#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
 | 
			
		||||
#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
 | 
			
		||||
#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI
 | 
			
		||||
#undef VACCTIMESMINUSI0
 | 
			
		||||
#undef VACCTIMESMINUSI1
 | 
			
		||||
#undef VACCTIMESMINUSI2
 | 
			
		||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI1MEM
 | 
			
		||||
#undef VACCTIMESI2MEM
 | 
			
		||||
#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI1MEM
 | 
			
		||||
#undef VACCTIMESMINUSI2MEM
 | 
			
		||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VPERM0
 | 
			
		||||
#undef VPERM1
 | 
			
		||||
#undef VPERM2
 | 
			
		||||
#undef VPERM3
 | 
			
		||||
#define VPERM0(A,B)               VPERM0d(A,B)
 | 
			
		||||
#define VPERM1(A,B)               VPERM1d(A,B)
 | 
			
		||||
#define VPERM2(A,B)               VPERM2d(A,B)
 | 
			
		||||
#define VPERM3(A,B)               VPERM3d(A,B)
 | 
			
		||||
 | 
			
		||||
#undef VSHUFMEM
 | 
			
		||||
#undef VADDMEM
 | 
			
		||||
#undef VSUBMEM
 | 
			
		||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
 | 
			
		||||
#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
 | 
			
		||||
#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VMOVIDUP
 | 
			
		||||
#undef VMOVRDUP
 | 
			
		||||
#undef VMADDSUB
 | 
			
		||||
#undef VSHUF
 | 
			
		||||
#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
 | 
			
		||||
#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
 | 
			
		||||
#define VSHUF(A,B)                                       VSHUFd(A,B)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef ZEND1
 | 
			
		||||
#undef ZEND2
 | 
			
		||||
#undef ZLOAD
 | 
			
		||||
#undef ZMUL
 | 
			
		||||
#undef ZMADD
 | 
			
		||||
#undef ZMULMEM2SP
 | 
			
		||||
#undef ZMADDMEM2SP
 | 
			
		||||
 | 
			
		||||
#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
 | 
			
		||||
#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
 | 
			
		||||
#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
 | 
			
		||||
#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
 | 
			
		||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef VRDUP
 | 
			
		||||
#undef VIDUP
 | 
			
		||||
#undef VMADDSUBMEM
 | 
			
		||||
#undef VMADDMEM
 | 
			
		||||
#undef VMULMEM
 | 
			
		||||
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST) 
 | 
			
		||||
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST) 
 | 
			
		||||
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
 | 
			
		||||
#define VMADDMEM(O,P,B,accum)    VMADDMEMd(O,P,B,accum)
 | 
			
		||||
#define VMULMEM(O,P,B,accum)     VMULMEMd(O,P,B,accum)
 | 
			
		||||
#undef VMADDSUBRDUP   
 | 
			
		||||
#undef VMADDSUBIDUP   
 | 
			
		||||
#undef VMULRDUP   
 | 
			
		||||
#undef VMULIDUP   
 | 
			
		||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) 
 | 
			
		||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) 
 | 
			
		||||
#define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)      
 | 
			
		||||
#define VMULIDUP(O,P,B,accum)     VMULIDUPd(O,P,B,accum)

127  lib/simd/Intel512imci.h  Normal file
@@ -0,0 +1,127 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Avx512Asm.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_ASM_AV512_H
 | 
			
		||||
#define GRID_ASM_AV512_H
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////	  
 | 
			
		||||
// Knights Corner specials
 | 
			
		||||
////////////////////////////////////////////////////////////	  
 | 
			
		||||
 | 
			
		||||
#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
			
		||||
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
			
		||||
 | 
			
		||||
#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
 | 
			
		||||
#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
 | 
			
		||||
 | 
			
		||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
 | 
			
		||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
 | 
			
		||||
 | 
			
		||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
 | 
			
		||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
 | 
			
		||||
 | 
			
		||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
			
		||||
  VSHUFMEMf(O,P,tmp) \
 | 
			
		||||
  VMULMEMf(O,P,B,Biirr) \
 | 
			
		||||
  VMULMEMf(O,P,C,Ciirr) \
 | 
			
		||||
  VMULf(tmp,B,Briir) \
 | 
			
		||||
  VMULf(tmp,C,Criir)
 | 
			
		||||
 | 
			
		||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
			
		||||
  VSHUFMEMd(O,P,tmp)  \
 | 
			
		||||
  VMULMEMd(O,P,B,Biirr)  \ 
 | 
			
		||||
  VMULMEMd(O,P,C,Ciirr)  \
 | 
			
		||||
  VMULd(tmp,B,Briir)  \
 | 
			
		||||
  VMULd(tmp,C,Criir) 
 | 
			
		||||
 | 
			
		||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
			
		||||
  VSHUFMEMf(O,P,tmp) \
 | 
			
		||||
  VMADDMEMf(O,P,B,Biirr) \
 | 
			
		||||
  VMADDMEMf(O,P,C,Ciirr) \
 | 
			
		||||
  VMADDf(tmp,B,Briir) \
 | 
			
		||||
  VMADDf(tmp,C,Criir)
 | 
			
		||||
 | 
			
		||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
 | 
			
		||||
  VSHUFMEMd(O,P,tmp) \
 | 
			
		||||
  VMADDMEMd(O,P,B,Biirr) \
 | 
			
		||||
  VMADDMEMd(O,P,C,Ciirr) \
 | 
			
		||||
  VMADDd(tmp,B,Briir) \
 | 
			
		||||
  VMADDd(tmp,C,Criir)
 | 
			
		||||
 | 
			
		||||
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd  " #Criir "{cdab} ," #Criir "," #Criir"{%k6}"  ";\n"
 | 
			
		||||
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd  " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define ZEND1f(Criir,Ciirr, tmp) "vaddps  " #Criir "{cdab} ," #Criir "," #Criir"{%k6}"  ";\n"
 | 
			
		||||
#define ZEND2f(Criir,Ciirr, tmp) "vsubps  " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESI0f(A,DEST, Z)   
 | 
			
		||||
#define VTIMESI1f(A,DEST, Z)   "vaddps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
#define VTIMESI2f(A,DEST, Z)   "vsubps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESI0d(A,DEST, Z)   
 | 
			
		||||
#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESMINUSI0f(A,DEST,Z)  
 | 
			
		||||
#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define VTIMESMINUSI0d(A,DEST,Z)  
 | 
			
		||||
#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
			
		||||
#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
			
		||||
 | 
			
		||||
#define  VACCTIMESI0f(A,ACC,tmp)
 | 
			
		||||
#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define  VACCTIMESI0d(A,ACC,tmp)
 | 
			
		||||
#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESMINUSI0f(A,ACC,tmp)  
 | 
			
		||||
#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
 | 
			
		||||
	   // Acc = Acc - i A
 | 
			
		||||
#define VACCTIMESMINUSI0d(A,ACC,tmp)  
 | 
			
		||||
#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
			
		||||
#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
			
		||||
 | 
			
		||||
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
 | 
			
		||||
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
 | 
			
		||||
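The two comments above give the encoding rule; as a hedged stand-alone illustration (the helper name is mine), each 2-bit field of the shuffle immediate selects the source element for one destination slot:

constexpr unsigned shuf_imm(unsigned s3, unsigned s2, unsigned s1, unsigned s0) {
  return (s3 << 6) | (s2 << 4) | (s1 << 2) | s0;   // slot3..slot0 source indices
}
static_assert(shuf_imm(1,0,3,2) == 0x4e, "elements 0123 -> 2301 (swap halves)");
static_assert(shuf_imm(2,3,0,1) == 0xb1, "elements 0123 -> 1032 (swap neighbours)");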
 | 
			
		||||
#define VPERM0f(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
 | 
			
		||||
#define VPERM1f(A,B) "vpermf32x4  $0xb1," #A "," #B ";\n"
 | 
			
		||||
#define VPERM2f(A,B) "vmovaps     " #A "{badc}," #B ";\n"
 | 
			
		||||
#define VPERM3f(A,B) "vmovaps     " #A "{cdab}," #B ";\n"
 | 
			
		||||
 | 
			
		||||
#define VPERM0d(A,B) "vpermf32x4  $0x4e," #A "," #B ";\n"
 | 
			
		||||
#define VPERM1d(A,B) "vmovapd     " #A "{badc}," #B ";\n"
 | 
			
		||||
#define VPERM2d(A,B) "vmovapd     " #A "{cdab}," #B ";\n"
 | 
			
		||||
#define VPERM3d(A,B) VMOVd(A,B)
 | 
			
		||||
 | 
			
		||||
#endif

155  lib/simd/Intel512single.h  Normal file
@@ -0,0 +1,155 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Avx512Asm.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
// No include guard: this header may be included multiple times; the macros are cleared with #undef before being redefined
 | 
			
		||||
#undef VZERO
 | 
			
		||||
#undef VMOV
 | 
			
		||||
#undef VLOAD
 | 
			
		||||
#undef VSTORE
 | 
			
		||||
#define VZERO(A)                  VZEROf(A)
 | 
			
		||||
#define VMOV(A,B)                 VMOVf(A,B)
 | 
			
		||||
#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
			
		||||
#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
			
		||||
 | 
			
		||||
#undef VADD
 | 
			
		||||
#undef VSUB
 | 
			
		||||
#undef VMUL
 | 
			
		||||
#undef VMADD
 | 
			
		||||
#define VADD(A,B,C)               VADDf(A,B,C)
 | 
			
		||||
#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
			
		||||
#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
 | 
			
		||||
#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef VTIMESI
 | 
			
		||||
#undef VTIMESI0 
 | 
			
		||||
#undef VTIMESI1
 | 
			
		||||
#undef VTIMESI2 
 | 
			
		||||
#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
 | 
			
		||||
#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
 | 
			
		||||
#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
 | 
			
		||||
#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VTIMESMINUSI
 | 
			
		||||
#undef VTIMESMINUSI0
 | 
			
		||||
#undef VTIMESMINUSI1
 | 
			
		||||
#undef VTIMESMINUSI2
 | 
			
		||||
#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI
 | 
			
		||||
#undef VACCTIMESI0
 | 
			
		||||
#undef VACCTIMESI1
 | 
			
		||||
#undef VACCTIMESI2
 | 
			
		||||
#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
			
		||||
#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI
 | 
			
		||||
#undef VACCTIMESMINUSI0
 | 
			
		||||
#undef VACCTIMESMINUSI1
 | 
			
		||||
#undef VACCTIMESMINUSI2
 | 
			
		||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI1MEM
 | 
			
		||||
#undef VACCTIMESI2MEM
 | 
			
		||||
#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI1MEM
 | 
			
		||||
#undef VACCTIMESMINUSI2MEM
 | 
			
		||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VPERM0
 | 
			
		||||
#undef VPERM1
 | 
			
		||||
#undef VPERM2
 | 
			
		||||
#undef VPERM3
 | 
			
		||||
#define VPERM0(A,B)               VPERM0f(A,B)
 | 
			
		||||
#define VPERM1(A,B)               VPERM1f(A,B)
 | 
			
		||||
#define VPERM2(A,B)               VPERM2f(A,B)
 | 
			
		||||
#define VPERM3(A,B)               VPERM3f(A,B)
 | 
			
		||||
 | 
			
		||||
#undef VSHUFMEM
 | 
			
		||||
#undef VADDMEM
 | 
			
		||||
#undef VSUBMEM
 | 
			
		||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
			
		||||
#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
 | 
			
		||||
#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VMOVIDUP
 | 
			
		||||
#undef VMOVRDUP
 | 
			
		||||
#undef VMADDSUB
 | 
			
		||||
#undef VSHUF
 | 
			
		||||
#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
 | 
			
		||||
#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
 | 
			
		||||
#define VSHUF(A,B)                                       VSHUFf(A,B)
 | 
			
		||||
 | 
			
		||||
#undef ZEND1
 | 
			
		||||
#undef ZEND2
 | 
			
		||||
#undef ZLOAD
 | 
			
		||||
#undef ZMUL
 | 
			
		||||
#undef ZMADD
 | 
			
		||||
#undef ZMULMEM2SP
 | 
			
		||||
#undef ZMADDMEM2SP
 | 
			
		||||
 | 
			
		||||
#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
 | 
			
		||||
#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
 | 
			
		||||
#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
 | 
			
		||||
#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
 | 
			
		||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
 | 
			
		||||
#undef VRDUP
 | 
			
		||||
#undef VIDUP
 | 
			
		||||
#undef VMADDSUBMEM
 | 
			
		||||
#undef VMADDMEM
 | 
			
		||||
#undef VMULMEM
 | 
			
		||||
 | 
			
		||||
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST) 
 | 
			
		||||
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST) 
 | 
			
		||||
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
 | 
			
		||||
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
 | 
			
		||||
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
 | 
			
		||||
 | 
			
		||||
#undef VMADDSUBRDUP   
 | 
			
		||||
#undef VMADDSUBIDUP   
 | 
			
		||||
#undef VMULRDUP   
 | 
			
		||||
#undef VMULIDUP   
 | 
			
		||||
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) 
 | 
			
		||||
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) 
 | 
			
		||||
#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)      
 | 
			
		||||
#define VMULIDUP(O,P,B,accum)     VMULIDUPf(O,P,B,accum) 
 | 
			
		||||
   
 | 
			
		||||
							
								
								
									
										849
									
								
								lib/simd/Intel512wilson.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										849
									
								
								lib/simd/Intel512wilson.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,849 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Avx512Asm.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_ASM_INTEL_512_QCD_H
 | 
			
		||||
#define GRID_ASM_INTEL_512_QCD_H
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Register allocations for the Wilson kernel are precision independent
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#define result_00 %zmm0 
 | 
			
		||||
#define result_01 %zmm1
 | 
			
		||||
#define result_02 %zmm2
 | 
			
		||||
  
 | 
			
		||||
#define result_10 %zmm3
 | 
			
		||||
#define result_11 %zmm4
 | 
			
		||||
#define result_12 %zmm5
 | 
			
		||||
 | 
			
		||||
#define result_20 %zmm6
 | 
			
		||||
#define result_21 %zmm7
 | 
			
		||||
#define result_22 %zmm8
 | 
			
		||||
 | 
			
		||||
#define result_30 %zmm9
 | 
			
		||||
#define result_31 %zmm10
 | 
			
		||||
#define result_32 %zmm11
 | 
			
		||||
 | 
			
		||||
#define Chi_00 %zmm12  
 | 
			
		||||
#define Chi_01 %zmm13
 | 
			
		||||
#define Chi_02 %zmm14
 | 
			
		||||
 | 
			
		||||
#define Chi_10 %zmm15
 | 
			
		||||
#define Chi_11 %zmm16
 | 
			
		||||
#define Chi_12 %zmm17  
 | 
			
		||||
 | 
			
		||||
#define UChi_00 %zmm18 
 | 
			
		||||
#define UChi_01 %zmm19
 | 
			
		||||
#define UChi_02 %zmm20
 | 
			
		||||
 | 
			
		||||
#define UChi_10 %zmm21
 | 
			
		||||
#define UChi_11 %zmm22
 | 
			
		||||
#define UChi_12 %zmm23 
 | 
			
		||||
 | 
			
		||||
#define Uir %zmm24 
 | 
			
		||||
#define Uri %zmm25  
 | 
			
		||||
#define T1 %zmm24
 | 
			
		||||
#define T2 %zmm25
 | 
			
		||||
 | 
			
		||||
#define Z0 %zmm26
 | 
			
		||||
#define Z1 %zmm27
 | 
			
		||||
#define Z2 %zmm28
 | 
			
		||||
#define Z3 %zmm29
 | 
			
		||||
#define Z4 %zmm30
 | 
			
		||||
#define Z5 %zmm31
 | 
			
		||||
 | 
			
		||||
#define TMP Chi_00
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512common.h>
 | 
			
		||||
#include <simd/Intel512avx.h>
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////
 | 
			
		||||
// Macros used to build the Wilson kernel -- these could be rationalised and
// simplified a little, as some duplication crept in while trying different
// variants during optimisation; they could be cut back to only those used.
 | 
			
		||||
//////////////////////////////////////////////////////////////////
 | 
			
		||||
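For orientation, a hedged sketch (not code from this commit; buf, offset, pf and the per-site loop are placeholders) of how one direction of the kernel is typically assembled from the macros defined below; the other directions follow the same project / multiply / reconstruct pattern:

// Hypothetical x+ direction step for one site:
//   XP_PROJMEM(&buf[offset]);     // spin-project the neighbour: Chi <- P_{x+} Chimu
//   MULT_2SPIN_DIR_PFXP(Xp,pf);   // multiply by the gauge link: UChi <- U Chi (plus prefetch)
//   XP_RECON_ACCUM;               // reconstruct the 4-spinor and accumulate into result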
 | 
			
		||||
//  const SiteSpinor * ptr = & in._odata[offset];	
 | 
			
		||||
#define LOAD_CHIMU(PTR)	 LOAD_CHIMUi(PTR)
 | 
			
		||||
#define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 | 
			
		||||
#define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 | 
			
		||||
#define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
 | 
			
		||||
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMUi \
 | 
			
		||||
	   LOAD_CHIMU01i	\
 | 
			
		||||
	   LOAD_CHIMU23i	);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU01i\
 | 
			
		||||
	   VLOAD(0,%r8,Chimu_00)		\
 | 
			
		||||
	   VLOAD(1,%r8,Chimu_01)		\
 | 
			
		||||
	   VLOAD(2,%r8,Chimu_02)		\
 | 
			
		||||
	   VLOAD(3,%r8,Chimu_10)		\
 | 
			
		||||
	   VLOAD(4,%r8,Chimu_11)		\
 | 
			
		||||
	   VLOAD(5,%r8,Chimu_12)		
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU23i\
 | 
			
		||||
	   VLOAD(6,%r8,Chimu_20)		\
 | 
			
		||||
	   VLOAD(7,%r8,Chimu_21)		\
 | 
			
		||||
	   VLOAD(8,%r8,Chimu_22)		\
 | 
			
		||||
	   VLOAD(9,%r8,Chimu_30)		\
 | 
			
		||||
	   VLOAD(10,%r8,Chimu_31)		\
 | 
			
		||||
	   VLOAD(11,%r8,Chimu_32)		
 | 
			
		||||
 | 
			
		||||
#define SHUF_CHIMU23i\
 | 
			
		||||
	   VSHUFMEM(6,%r8,Chimu_20)		\
 | 
			
		||||
	   VSHUFMEM(7,%r8,Chimu_21)		\
 | 
			
		||||
	   VSHUFMEM(8,%r8,Chimu_22)		\
 | 
			
		||||
	   VSHUFMEM(9,%r8,Chimu_30)		\
 | 
			
		||||
	   VSHUFMEM(10,%r8,Chimu_31)		\
 | 
			
		||||
	   VSHUFMEM(11,%r8,Chimu_32)		
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//  const SiteHalfSpinor *ptr = &buf[offset];	
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIi				\
 | 
			
		||||
  VLOAD(0,%r8,Chi_00)					\
 | 
			
		||||
  VLOAD(1,%r8,Chi_01)					\
 | 
			
		||||
  VLOAD(2,%r8,Chi_02)					\
 | 
			
		||||
  VLOAD(3,%r8,Chi_10)					\
 | 
			
		||||
  VLOAD(4,%r8,Chi_11)					\
 | 
			
		||||
  VLOAD(5,%r8,Chi_12)	
 | 
			
		||||
	
 | 
			
		||||
 | 
			
		||||
#define SAVE_UCHIi(PTR)				\
 | 
			
		||||
  LOAD64(%r8,PTR)				\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
  VSTORE(0,%r8,UChi_00)				\
 | 
			
		||||
  VSTORE(1,%r8,UChi_01)				\
 | 
			
		||||
  VSTORE(2,%r8,UChi_02)				\
 | 
			
		||||
  VSTORE(3,%r8,UChi_10)				\
 | 
			
		||||
  VSTORE(4,%r8,UChi_11)				\
 | 
			
		||||
  VSTORE(5,%r8,UChi_12)				\
 | 
			
		||||
						);
 | 
			
		||||
 | 
			
		||||
#define SAVE_CHIi(PTR)				\
 | 
			
		||||
  LOAD64(%r8,PTR)				\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
  VSTORE(0,%r8,Chi_00)				\
 | 
			
		||||
  VSTORE(1,%r8,Chi_01)				\
 | 
			
		||||
  VSTORE(2,%r8,Chi_02)				\
 | 
			
		||||
  VSTORE(3,%r8,Chi_10)				\
 | 
			
		||||
  VSTORE(4,%r8,Chi_11)				\
 | 
			
		||||
  VSTORE(5,%r8,Chi_12)				\
 | 
			
		||||
						);
 | 
			
		||||
 | 
			
		||||
#define SAVE_RESULTi(PTR)\
 | 
			
		||||
	   LOAD64(%r8,PTR)			\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
	   VSTORE(0,%r8,result_00)		\
 | 
			
		||||
	   VSTORE(1,%r8,result_01)		\
 | 
			
		||||
	   VSTORE(2,%r8,result_02)		\
 | 
			
		||||
	   VSTORE(3,%r8,result_10)		\
 | 
			
		||||
	   VSTORE(4,%r8,result_11)		\
 | 
			
		||||
	   VSTORE(5,%r8,result_12)		\
 | 
			
		||||
	   VSTORE(6,%r8,result_20)		\
 | 
			
		||||
	   VSTORE(7,%r8,result_21)		\
 | 
			
		||||
	   VSTORE(8,%r8,result_22)		\
 | 
			
		||||
	   VSTORE(9,%r8,result_30)		\
 | 
			
		||||
	   VSTORE(10,%r8,result_31)		\
 | 
			
		||||
	   VSTORE(11,%r8,result_32) 		\
 | 
			
		||||
						);
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////
 | 
			
		||||
// Dirac algebra
 | 
			
		||||
//////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
//      hspin(0)=fspin(0)+timesI(fspin(3));
 | 
			
		||||
//      hspin(1)=fspin(1)+timesI(fspin(2));
 | 
			
		||||
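The two comments above give the spin projection; a hedged scalar analogue (the function and types are mine, acting per colour component) of what XP_PROJMEM vectorises:

#include <complex>
inline void proj_xp_scalar(const std::complex<double> fspin[4],
                           std::complex<double> hspin[2]) {
  const std::complex<double> I(0.0, 1.0);
  hspin[0] = fspin[0] + I * fspin[3];   // hspin(0) = fspin(0) + timesI(fspin(3))
  hspin[1] = fspin[1] + I * fspin[2];   // hspin(1) = fspin(1) + timesI(fspin(2))
}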
#define XP_PROJMEM(PTR) \
 | 
			
		||||
  LOAD64(%r8,PTR)							\
 | 
			
		||||
  __asm__ (								\
 | 
			
		||||
	   LOAD_CHIi						\
 | 
			
		||||
	   SHUF_CHIMU23i						\
 | 
			
		||||
	   VACCTIMESI1(Chi_00,Chi_00,Chimu_30)		\
 | 
			
		||||
	   VACCTIMESI1(Chi_01,Chi_01,Chimu_31)		\
 | 
			
		||||
	   VACCTIMESI1(Chi_02,Chi_02,Chimu_32)		\
 | 
			
		||||
	   VACCTIMESI1(Chi_10,Chi_10,Chimu_20)		\
 | 
			
		||||
	   VACCTIMESI1(Chi_11,Chi_11,Chimu_21)		\
 | 
			
		||||
	   VACCTIMESI1(Chi_12,Chi_12,Chimu_22)		\
 | 
			
		||||
	   VACCTIMESI2(Chi_00,Chi_00,Chimu_30)		\
 | 
			
		||||
	   VACCTIMESI2(Chi_01,Chi_01,Chimu_31)		\
 | 
			
		||||
	   VACCTIMESI2(Chi_02,Chi_02,Chimu_32)		\
 | 
			
		||||
	   VACCTIMESI2(Chi_10,Chi_10,Chimu_20)		\
 | 
			
		||||
	   VACCTIMESI2(Chi_11,Chi_11,Chimu_21)		\
 | 
			
		||||
	   VACCTIMESI2(Chi_12,Chi_12,Chimu_22)		);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define YP_PROJMEM(ptr) \
 | 
			
		||||
  LOAD64(%r8,ptr)		\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
  LOAD_CHIMU01i					\
 | 
			
		||||
  VSUBMEM(9,%r8 ,Chimu_00,Chi_00)		\
 | 
			
		||||
  VSUBMEM(10,%r8,Chimu_01,Chi_01)		\
 | 
			
		||||
  VSUBMEM(11,%r8,Chimu_02,Chi_02)		\
 | 
			
		||||
  VADDMEM(6,%r8,Chimu_10,Chi_10)		\
 | 
			
		||||
  VADDMEM(7,%r8,Chimu_11,Chi_11)		\
 | 
			
		||||
  VADDMEM(8,%r8,Chimu_12,Chi_12)		);
 | 
			
		||||
 | 
			
		||||
#define ZP_PROJMEM(PTR) \
 | 
			
		||||
  LOAD64(%r8,PTR)							\
 | 
			
		||||
  __asm__ (								\
 | 
			
		||||
	   LOAD_CHIi						\
 | 
			
		||||
	   SHUF_CHIMU23i						\
 | 
			
		||||
	   VACCTIMESI1(Chi_00,Chi_00,Chimu_20)				\
 | 
			
		||||
	   VACCTIMESI1(Chi_01,Chi_01,Chimu_21)		   	        \
 | 
			
		||||
	   VACCTIMESI1(Chi_02,Chi_02,Chimu_22)				\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30)			\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31)			\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32)			\
 | 
			
		||||
	   VACCTIMESI2(Chi_00,Chi_00,Chimu_20)				\
 | 
			
		||||
	   VACCTIMESI2(Chi_01,Chi_01,Chimu_21)				\
 | 
			
		||||
	   VACCTIMESI2(Chi_02,Chi_02,Chimu_22)				\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30)		\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31)		\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32)	);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define TP_PROJMEM(ptr)				\
 | 
			
		||||
  LOAD64(%r8,ptr)				\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
	   LOAD_CHIMU01i			\
 | 
			
		||||
	   VADDMEM(6,%r8 ,Chimu_00,Chi_00)	\
 | 
			
		||||
	   VADDMEM(7,%r8,Chimu_01,Chi_01)	\
 | 
			
		||||
	   VADDMEM(8,%r8,Chimu_02,Chi_02)	\
 | 
			
		||||
	   VADDMEM(9,%r8,Chimu_10,Chi_10)	\
 | 
			
		||||
	   VADDMEM(10,%r8,Chimu_11,Chi_11)	\
 | 
			
		||||
	   VADDMEM(11,%r8,Chimu_12,Chi_12)	);
 | 
			
		||||
 | 
			
		||||
//      hspin(0)=fspin(0)-timesI(fspin(3))
 | 
			
		||||
//      hspin(1)=fspin(1)-timesI(fspin(2))
 | 
			
		||||
 | 
			
		||||
#define XM_PROJMEM(PTR) \
 | 
			
		||||
  LOAD64(%r8,PTR)\
 | 
			
		||||
  __asm__ (								\
 | 
			
		||||
	   SHUF_CHIMU23i						\
 | 
			
		||||
	   LOAD_CHIi \
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
 | 
			
		||||
 | 
			
		||||
#define YM_PROJMEM(ptr)				\
 | 
			
		||||
  LOAD64(%r8,ptr)				\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
  LOAD_CHIMU01i					\
 | 
			
		||||
  VADDMEM(9,%r8 ,Chimu_00,Chi_00)		\
 | 
			
		||||
  VADDMEM(10,%r8,Chimu_01,Chi_01)		\
 | 
			
		||||
  VADDMEM(11,%r8,Chimu_02,Chi_02)		\
 | 
			
		||||
  VSUBMEM(6,%r8,Chimu_10,Chi_10)		\
 | 
			
		||||
  VSUBMEM(7,%r8,Chimu_11,Chi_11)		\
 | 
			
		||||
  VSUBMEM(8,%r8,Chimu_12,Chi_12)			);
 | 
			
		||||
 | 
			
		||||
#define ZM_PROJMEM(PTR) \
 | 
			
		||||
  LOAD64(%r8,PTR)							\
 | 
			
		||||
  __asm__ (								\
 | 
			
		||||
	   SHUF_CHIMU23i						\
 | 
			
		||||
           LOAD_CHIi \
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
 | 
			
		||||
	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
 | 
			
		||||
	   VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
 | 
			
		||||
	   VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
 | 
			
		||||
	   VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
 | 
			
		||||
	   VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
 | 
			
		||||
	   VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
 | 
			
		||||
	   VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
 | 
			
		||||
	   VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
 | 
			
		||||
 | 
			
		||||
#define TM_PROJMEM(ptr)				\
 | 
			
		||||
  LOAD64(%r8,ptr)				\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
  LOAD_CHIMU01i					\
 | 
			
		||||
  VSUBMEM(6,%r8,Chimu_00,Chi_00)		\
 | 
			
		||||
  VSUBMEM(7,%r8,Chimu_01,Chi_01)		\
 | 
			
		||||
  VSUBMEM(8,%r8,Chimu_02,Chi_02)		\
 | 
			
		||||
  VSUBMEM(9,%r8,Chimu_10,Chi_10)		\
 | 
			
		||||
  VSUBMEM(10,%r8,Chimu_11,Chi_11)		\
 | 
			
		||||
  VSUBMEM(11,%r8,Chimu_12,Chi_12)		);
 | 
			
		||||
 | 
			
		||||
//      fspin(0)=hspin(0)
 | 
			
		||||
//      fspin(1)=hspin(1)
 | 
			
		||||
//      fspin(2)=timesMinusI(hspin(1))
 | 
			
		||||
//      fspin(3)=timesMinusI(hspin(0))
 | 
			
		||||
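Likewise, a hedged scalar analogue (names mine) of the reconstruction described in the comments above, as performed by XP_RECON / XP_RECON_ACCUM below:

#include <complex>
inline void recon_xp_scalar(const std::complex<double> hspin[2],
                            std::complex<double> fspin[4]) {
  const std::complex<double> I(0.0, 1.0);
  fspin[0] = hspin[0];
  fspin[1] = hspin[1];
  fspin[2] = -I * hspin[1];   // timesMinusI(hspin(1))
  fspin[3] = -I * hspin[0];   // timesMinusI(hspin(0))
}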
#define XP_RECON __asm__ (			\
 | 
			
		||||
			  VZERO(TMP)		\
 | 
			
		||||
			  VTIMESMINUSI0(UChi_00,result_30,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI0(UChi_10,result_20,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI0(UChi_01,result_31,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI0(UChi_11,result_21,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI0(UChi_02,result_32,TMP)   \
 | 
			
		||||
			  VTIMESMINUSI0(UChi_12,result_22,TMP)	\
 | 
			
		||||
			  VMOV(UChi_00,result_00)	\
 | 
			
		||||
			  VMOV(UChi_10,result_10)	\
 | 
			
		||||
			  VMOV(UChi_01,result_01)	\
 | 
			
		||||
			  VMOV(UChi_11,result_11)	\
 | 
			
		||||
			  VMOV(UChi_02,result_02)	\
 | 
			
		||||
			  VMOV(UChi_12,result_12)	\
 | 
			
		||||
			  VTIMESMINUSI1(UChi_10,result_20,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI1(UChi_11,result_21,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI1(UChi_12,result_22,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI1(UChi_00,result_30,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI1(UChi_01,result_31,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI1(UChi_02,result_32,TMP)   \
 | 
			
		||||
			  VTIMESMINUSI2(UChi_10,result_20,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI2(UChi_11,result_21,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI2(UChi_12,result_22,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI2(UChi_00,result_30,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI2(UChi_01,result_31,TMP)	\
 | 
			
		||||
			  VTIMESMINUSI2(UChi_02,result_32,TMP)   \
 | 
			
		||||
						);
 | 
			
		||||
  // NB could save 6 ops using addsub => 12 cycles
 | 
			
		||||
#define XP_RECON_ACCUM __asm__ ( \
 | 
			
		||||
  VZERO(TMP)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
 | 
			
		||||
  VADD(UChi_00,result_00,result_00)\
 | 
			
		||||
  VADD(UChi_10,result_10,result_10)\
 | 
			
		||||
  VADD(UChi_01,result_01,result_01)\
 | 
			
		||||
  VADD(UChi_11,result_11,result_11)\
 | 
			
		||||
  VADD(UChi_02,result_02,result_02)\
 | 
			
		||||
  VADD(UChi_12,result_12,result_12)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_00,result_30,Z3)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_01,result_31,Z4)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_02,result_32,Z5)\
 | 
			
		||||
				 );
 | 
			
		||||
 | 
			
		||||
#define XM_RECON __asm__ ( \
 | 
			
		||||
  VZERO(TMP)\
 | 
			
		||||
  VTIMESI0(UChi_00,result_30,TMP)\
 | 
			
		||||
  VTIMESI0(UChi_10,result_20,TMP)\
 | 
			
		||||
  VTIMESI0(UChi_01,result_31,TMP)\
 | 
			
		||||
  VTIMESI0(UChi_11,result_21,TMP)\
 | 
			
		||||
  VTIMESI0(UChi_02,result_32,TMP)\
 | 
			
		||||
  VTIMESI0(UChi_12,result_22,TMP)\
 | 
			
		||||
  VMOV(UChi_00,result_00)\
 | 
			
		||||
  VMOV(UChi_10,result_10)\
 | 
			
		||||
  VMOV(UChi_01,result_01)\
 | 
			
		||||
  VMOV(UChi_11,result_11)\
 | 
			
		||||
  VMOV(UChi_02,result_02)\
 | 
			
		||||
  VMOV(UChi_12,result_12)\
 | 
			
		||||
  VTIMESI1(UChi_00,result_30,TMP)\
 | 
			
		||||
  VTIMESI1(UChi_10,result_20,TMP)\
 | 
			
		||||
  VTIMESI1(UChi_01,result_31,TMP)\
 | 
			
		||||
  VTIMESI1(UChi_11,result_21,TMP)\
 | 
			
		||||
  VTIMESI1(UChi_02,result_32,TMP)\
 | 
			
		||||
  VTIMESI1(UChi_12,result_22,TMP)\
 | 
			
		||||
  VTIMESI2(UChi_10,result_20,TMP)\
 | 
			
		||||
  VTIMESI2(UChi_11,result_21,TMP)\
 | 
			
		||||
  VTIMESI2(UChi_12,result_22,TMP)\
 | 
			
		||||
  VTIMESI2(UChi_00,result_30,TMP)\
 | 
			
		||||
  VTIMESI2(UChi_01,result_31,TMP)\
 | 
			
		||||
  VTIMESI2(UChi_02,result_32,TMP)\
 | 
			
		||||
			   );
 | 
			
		||||
 | 
			
		||||
#define XM_RECON_ACCUM __asm__ ( \
 | 
			
		||||
  VACCTIMESI0(UChi_10,result_20,Z0)\
 | 
			
		||||
  VACCTIMESI0(UChi_00,result_30,Z3)\
 | 
			
		||||
  VACCTIMESI0(UChi_11,result_21,Z1)\
 | 
			
		||||
  VACCTIMESI0(UChi_01,result_31,Z4)\
 | 
			
		||||
  VACCTIMESI0(UChi_12,result_22,Z2)\
 | 
			
		||||
  VACCTIMESI0(UChi_02,result_32,Z5)\
 | 
			
		||||
  \
 | 
			
		||||
  VADD(UChi_10,result_10,result_10)\
 | 
			
		||||
  VADD(UChi_00,result_00,result_00)\
 | 
			
		||||
  VADD(UChi_11,result_11,result_11)\
 | 
			
		||||
  VADD(UChi_01,result_01,result_01)\
 | 
			
		||||
  VADD(UChi_12,result_12,result_12)\
 | 
			
		||||
  VADD(UChi_02,result_02,result_02)\
 | 
			
		||||
  \
 | 
			
		||||
  VACCTIMESI1(UChi_10,result_20,Z0)\
 | 
			
		||||
  VACCTIMESI1(UChi_00,result_30,Z3)\
 | 
			
		||||
  VACCTIMESI1(UChi_11,result_21,Z1)\
 | 
			
		||||
  VACCTIMESI1(UChi_01,result_31,Z4)\
 | 
			
		||||
  VACCTIMESI1(UChi_12,result_22,Z2)\
 | 
			
		||||
  VACCTIMESI1(UChi_02,result_32,Z5)\
 | 
			
		||||
  VACCTIMESI2(UChi_10,result_20,Z0)\
 | 
			
		||||
  VACCTIMESI2(UChi_11,result_21,Z1)\
 | 
			
		||||
  VACCTIMESI2(UChi_12,result_22,Z2)\
 | 
			
		||||
  VACCTIMESI2(UChi_00,result_30,Z3)\
 | 
			
		||||
  VACCTIMESI2(UChi_01,result_31,Z4)\
 | 
			
		||||
  VACCTIMESI2(UChi_02,result_32,Z5)\
 | 
			
		||||
				 );
 | 
			
		||||
 | 
			
		||||
#define YP_RECON_ACCUM __asm__ ( \
 | 
			
		||||
  VADD(UChi_00,result_00,result_00)\
 | 
			
		||||
  VADD(UChi_10,result_10,result_10)\
 | 
			
		||||
  VADD(UChi_01,result_01,result_01)\
 | 
			
		||||
  VADD(UChi_11,result_11,result_11)\
 | 
			
		||||
  VADD(UChi_02,result_02,result_02)\
 | 
			
		||||
  VADD(UChi_12,result_12,result_12)\
 | 
			
		||||
  VADD(UChi_10,result_20,result_20)\
 | 
			
		||||
  VADD(UChi_11,result_21,result_21)\
 | 
			
		||||
  VADD(UChi_12,result_22,result_22)\
 | 
			
		||||
  VSUB(UChi_00,result_30,result_30)\
 | 
			
		||||
  VSUB(UChi_01,result_31,result_31)\
 | 
			
		||||
  VSUB(UChi_02,result_32,result_32) );
 | 
			
		||||
 | 
			
		||||
#define YM_RECON_ACCUM __asm__ ( \
 | 
			
		||||
  VADD(UChi_00,result_00,result_00)\
 | 
			
		||||
  VADD(UChi_10,result_10,result_10)\
 | 
			
		||||
  VADD(UChi_01,result_01,result_01)\
 | 
			
		||||
  VADD(UChi_11,result_11,result_11)\
 | 
			
		||||
  VADD(UChi_02,result_02,result_02)\
 | 
			
		||||
  VADD(UChi_12,result_12,result_12)\
 | 
			
		||||
  VSUB(UChi_10,result_20,result_20)\
 | 
			
		||||
  VSUB(UChi_11,result_21,result_21)\
 | 
			
		||||
  VSUB(UChi_12,result_22,result_22)\
 | 
			
		||||
  VADD(UChi_00,result_30,result_30)\
 | 
			
		||||
  VADD(UChi_01,result_31,result_31)\
 | 
			
		||||
  VADD(UChi_02,result_32,result_32) );
 | 
			
		||||
 | 
			
		||||
#define ZP_RECON_ACCUM __asm__ ( \
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
 | 
			
		||||
  VACCTIMESI0(UChi_10,result_30,Z3)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
 | 
			
		||||
  VACCTIMESI0(UChi_11,result_31,Z4)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
 | 
			
		||||
  VACCTIMESI0(UChi_12,result_32,Z5)\
 | 
			
		||||
  VADD(UChi_00,result_00,result_00)\
 | 
			
		||||
  VADD(UChi_10,result_10,result_10)\
 | 
			
		||||
  VADD(UChi_01,result_01,result_01)\
 | 
			
		||||
  VADD(UChi_11,result_11,result_11)\
 | 
			
		||||
  VADD(UChi_02,result_02,result_02)\
 | 
			
		||||
  VADD(UChi_12,result_12,result_12)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
 | 
			
		||||
  VACCTIMESI1(UChi_10,result_30,Z3)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
 | 
			
		||||
  VACCTIMESI1(UChi_11,result_31,Z4)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
 | 
			
		||||
  VACCTIMESI1(UChi_12,result_32,Z5)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_02,result_22,Z2)\
 | 
			
		||||
  VACCTIMESI2(UChi_10,result_30,Z3)\
 | 
			
		||||
  VACCTIMESI2(UChi_11,result_31,Z4)\
 | 
			
		||||
  VACCTIMESI2(UChi_12,result_32,Z5)\
 | 
			
		||||
				 );
 | 
			
		||||
 | 
			
		||||
#define ZM_RECON_ACCUM __asm__ ( \
 | 
			
		||||
  VACCTIMESI0(UChi_00,result_20,Z0)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
 | 
			
		||||
  VACCTIMESI0(UChi_01,result_21,Z1)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
 | 
			
		||||
  VACCTIMESI0(UChi_02,result_22,Z2)\
 | 
			
		||||
  VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
 | 
			
		||||
  VADD(UChi_00,result_00,result_00)\
 | 
			
		||||
  VADD(UChi_10,result_10,result_10)\
 | 
			
		||||
  VADD(UChi_01,result_01,result_01)\
 | 
			
		||||
  VADD(UChi_11,result_11,result_11)\
 | 
			
		||||
  VADD(UChi_02,result_02,result_02)\
 | 
			
		||||
  VADD(UChi_12,result_12,result_12)\
 | 
			
		||||
  VACCTIMESI1(UChi_00,result_20,Z0)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
 | 
			
		||||
  VACCTIMESI1(UChi_01,result_21,Z1)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
 | 
			
		||||
  VACCTIMESI1(UChi_02,result_22,Z2)\
 | 
			
		||||
  VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
 | 
			
		||||
  VACCTIMESI2(UChi_00,result_20,Z0)\
 | 
			
		||||
  VACCTIMESI2(UChi_01,result_21,Z1)\
 | 
			
		||||
  VACCTIMESI2(UChi_02,result_22,Z2)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_10,result_30,Z3)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_11,result_31,Z4)\
 | 
			
		||||
  VACCTIMESMINUSI2(UChi_12,result_32,Z5)\
 | 
			
		||||
				 );
 | 
			
		||||
 | 
			
		||||
#define TP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VADD(UChi_00,result_20,result_20)\
  VADD(UChi_10,result_30,result_30)\
  VADD(UChi_01,result_21,result_21)\
  VADD(UChi_11,result_31,result_31)\
  VADD(UChi_02,result_22,result_22)\
  VADD(UChi_12,result_32,result_32) );

#define TM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00)\
  VADD(UChi_10,result_10,result_10)\
  VADD(UChi_01,result_01,result_01)\
  VADD(UChi_11,result_11,result_11)\
  VADD(UChi_02,result_02,result_02)\
  VADD(UChi_12,result_12,result_12)\
  VSUB(UChi_00,result_20,result_20)\
  VSUB(UChi_10,result_30,result_30)\
  VSUB(UChi_01,result_21,result_21)\
  VSUB(UChi_11,result_31,result_31)\
  VSUB(UChi_02,result_22,result_22)\
  VSUB(UChi_12,result_32,result_32) );

#define PREFETCH_CHIMU(A) \
  LOAD64(%r9,A)						\
	   __asm__ (						\
  VPREFETCHG(12,%r9)\
  VPREFETCHG(13,%r9)\
  VPREFETCHG(14,%r9)\
  VPREFETCHG(15,%r9)\
  VPREFETCHG(16,%r9)\
  VPREFETCHG(17,%r9)\
  VPREFETCHG(18,%r9)\
  VPREFETCHG(19,%r9)\
  VPREFETCHG(20,%r9)\
  VPREFETCHG(21,%r9)\
  VPREFETCHG(22,%r9)\
  VPREFETCHG(23,%r9));

#define PERMUTE_DIR0 __asm__ ( 	\
  VPERM0(Chi_00,Chi_00)	\
  VPERM0(Chi_01,Chi_01)	\
  VPERM0(Chi_02,Chi_02)	\
  VPERM0(Chi_10,Chi_10)	\
  VPERM0(Chi_11,Chi_11)	\
  VPERM0(Chi_12,Chi_12) );

#define PERMUTE_DIR1 __asm__ (	\
  VPERM1(Chi_00,Chi_00)	\
  VPERM1(Chi_01,Chi_01)	\
  VPERM1(Chi_02,Chi_02)	\
  VPERM1(Chi_10,Chi_10)	\
  VPERM1(Chi_11,Chi_11)	\
  VPERM1(Chi_12,Chi_12));

#define PERMUTE_DIR2 __asm__ (	\
  VPERM2(Chi_00,Chi_00)	\
  VPERM2(Chi_01,Chi_01)	\
  VPERM2(Chi_02,Chi_02)	\
  VPERM2(Chi_10,Chi_10)	\
  VPERM2(Chi_11,Chi_11)	\
  VPERM2(Chi_12,Chi_12) );

#define PERMUTE_DIR3 __asm__ (	\
  VPERM3(Chi_00,Chi_00)	\
  VPERM3(Chi_01,Chi_01)	\
  VPERM3(Chi_02,Chi_02)	\
  VPERM3(Chi_10,Chi_10)	\
  VPERM3(Chi_11,Chi_11)	\
  VPERM3(Chi_12,Chi_12) );

#define MULT_ADDSUB_2SPIN(ptr,pf)					\
 | 
			
		||||
  LOAD64(%r8,ptr)						\
 | 
			
		||||
  LOAD64(%r9,pf)						\
 | 
			
		||||
	   __asm__ (						\
 | 
			
		||||
	   VPREFETCH2(9,%r8)				   \
 | 
			
		||||
	   VPREFETCH2(10,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(11,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(12,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(13,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(14,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(15,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(16,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(17,%r8)					   \
 | 
			
		||||
	   VSHUF(Chi_00,T1)				\
 | 
			
		||||
	   VMOVIDUP(0,%r8,Z0 )					\
 | 
			
		||||
           VMOVIDUP(3,%r8,Z1 )					\
 | 
			
		||||
           VMOVIDUP(6,%r8,Z2 )	          VSHUF(Chi_10,T2)		\
 | 
			
		||||
	   /*6*/							\
 | 
			
		||||
           VMUL(Z0,T1,UChi_00)            VMOVRDUP(0,%r8,Z3 )	\
 | 
			
		||||
           VMUL(Z0,T2,UChi_10)            VMOVRDUP(3,%r8,Z4 )	\
 | 
			
		||||
           VMUL(Z1,T1,UChi_01)            VMOVRDUP(6,%r8,Z5 )	\
 | 
			
		||||
           VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 )	\
 | 
			
		||||
           VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 )	\
 | 
			
		||||
           VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 )	\
 | 
			
		||||
	   VPREFETCHG(0,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(1,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(2,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(3,%r9)					   \
 | 
			
		||||
	   /*18*/						\
 | 
			
		||||
           VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)	\
 | 
			
		||||
           VMADDSUB(Z3,Chi_10,UChi_10)				\
 | 
			
		||||
           VMADDSUB(Z4,Chi_00,UChi_01)    VMOVRDUP(1,%r8,Z3 )	\
 | 
			
		||||
           VMADDSUB(Z4,Chi_10,UChi_11)    VSHUF(Chi_11,T2)	\
 | 
			
		||||
           VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 )	\
 | 
			
		||||
           VMADDSUB(Z5,Chi_10,UChi_12)				\
 | 
			
		||||
	   VPREFETCHG(4,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(5,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(6,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(7,%r9)					   \
 | 
			
		||||
	   /*28*/						\
 | 
			
		||||
           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 )	\
 | 
			
		||||
           VMADDSUB(Z0,T2,UChi_10)				\
 | 
			
		||||
           VMADDSUB(Z1,T1,UChi_01)        VMOVIDUP(2,%r8,Z0 )	\
 | 
			
		||||
           VMADDSUB(Z1,T2,UChi_11)				\
 | 
			
		||||
           VMADDSUB(Z2,T1,UChi_02)        VMOVIDUP(5,%r8,Z1 )	\
 | 
			
		||||
           VMADDSUB(Z2,T2,UChi_12)        VMOVIDUP(8,%r8,Z2 )	\
 | 
			
		||||
	   VPREFETCH2(12,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(13,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(14,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(15,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(16,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(17,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(18,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(19,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(20,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(21,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(22,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(23,%r9)					   \
 | 
			
		||||
           /*38*/						\
 | 
			
		||||
           VMADDSUB(Z3,Chi_01,UChi_00)    VSHUF(Chi_02,T1)	\
 | 
			
		||||
           VMADDSUB(Z3,Chi_11,UChi_10)				\
 | 
			
		||||
           VMADDSUB(Z4,Chi_01,UChi_01)    VMOVRDUP(2,%r8,Z3 )	\
 | 
			
		||||
           VMADDSUB(Z4,Chi_11,UChi_11)    VSHUF(Chi_12,T2)	\
 | 
			
		||||
           VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 )	\
 | 
			
		||||
           VMADDSUB(Z5,Chi_11,UChi_12)				\
 | 
			
		||||
	   VPREFETCHG(9,%r8)				   \
 | 
			
		||||
	   VPREFETCHG(10,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(11,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(12,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(13,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(14,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(15,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(16,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(17,%r8)					   \
 | 
			
		||||
	   /*48*/						\
 | 
			
		||||
           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
 | 
			
		||||
           VMADDSUB(Z0,T2,UChi_10)			      \
 | 
			
		||||
           VMADDSUB(Z1,T1,UChi_01)			      \
 | 
			
		||||
           VMADDSUB(Z1,T2,UChi_11)			      \
 | 
			
		||||
           VMADDSUB(Z2,T1,UChi_02)			      \
 | 
			
		||||
           VMADDSUB(Z2,T2,UChi_12)			      \
 | 
			
		||||
	   VPREFETCHG(8,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(9,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(10,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(11,%r9)					   \
 | 
			
		||||
	   /*55*/					      \
 | 
			
		||||
           VMADDSUB(Z3,Chi_02,UChi_00)			      \
 | 
			
		||||
           VMADDSUB(Z3,Chi_12,UChi_10)			      \
 | 
			
		||||
           VMADDSUB(Z4,Chi_02,UChi_01)			      \
 | 
			
		||||
           VMADDSUB(Z4,Chi_12,UChi_11)			      \
 | 
			
		||||
           VMADDSUB(Z5,Chi_02,UChi_02)			      \
 | 
			
		||||
           VMADDSUB(Z5,Chi_12,UChi_12)			      \
 | 
			
		||||
	   /*61 insns*/							);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
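
// Editor's note (illustrative sketch, not part of the Grid sources): the MULT_ADDSUB_2SPIN
// macro above applies a 3x3 complex colour matrix, loaded from ptr via the RDUP/IDUP
// real/imaginary broadcasts, to the two spin components of a half spinor Chi, while
// interleaving prefetches through %r8/%r9.  Ignoring the SIMD lane layout and assuming a
// row-major matrix, the arithmetic it performs is:
#include <complex>
typedef std::complex<double> Cplx;
inline void mult_2spin_scalar(const Cplx U[3][3], const Cplx Chi[2][3], Cplx UChi[2][3]) {
  for (int s = 0; s < 2; s++)            // the two spin components of the half spinor
    for (int c = 0; c < 3; c++) {        // colour row
      UChi[s][c] = Cplx(0.0, 0.0);
      for (int j = 0; j < 3; j++)        // colour column
        UChi[s][c] += U[c][j] * Chi[s][j];
    }
}
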
#define MULT_ADDSUB_2SPIN_LS(ptr,pf)				   \
 | 
			
		||||
  LOAD64(%r8,ptr)						   \
 | 
			
		||||
  LOAD64(%r9,pf)						   \
 | 
			
		||||
  __asm__ (							   \
 | 
			
		||||
           VSHUF(Chi_00,T1)      VSHUF(Chi_10,T2)		   \
 | 
			
		||||
           VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
 | 
			
		||||
           VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
 | 
			
		||||
           VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
 | 
			
		||||
	   VPREFETCHG(0,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(1,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(2,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(3,%r9)					   \
 | 
			
		||||
	   /*8*/						   \
 | 
			
		||||
           VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
 | 
			
		||||
           VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
 | 
			
		||||
           VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
 | 
			
		||||
           VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
 | 
			
		||||
	   VPREFETCHG(4,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(5,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(6,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(7,%r9)					   \
 | 
			
		||||
	   /*16*/					  	   \
 | 
			
		||||
           VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
 | 
			
		||||
           VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
 | 
			
		||||
           VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
 | 
			
		||||
	   VPREFETCHG(8,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(9,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(10,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(11,%r9)					   \
 | 
			
		||||
           /*22*/						   \
 | 
			
		||||
           VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
 | 
			
		||||
           VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
 | 
			
		||||
           VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
 | 
			
		||||
           VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
 | 
			
		||||
	   VPREFETCH2(12,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(13,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(14,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(15,%r9)					   \
 | 
			
		||||
	   /*30*/						   \
 | 
			
		||||
           VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
 | 
			
		||||
           VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
 | 
			
		||||
	   VPREFETCH2(16,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(17,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(18,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(19,%r9)					   \
 | 
			
		||||
           VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 | 
			
		||||
	   /*36*/					           \
 | 
			
		||||
           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
			
		||||
           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
			
		||||
           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
			
		||||
	   VPREFETCH2(20,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(21,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(22,%r9)					   \
 | 
			
		||||
	   VPREFETCH2(23,%r9)					   \
 | 
			
		||||
	   VPREFETCHG(2,%r8)					   \
 | 
			
		||||
	   VPREFETCHG(3,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(4,%r8)					   \
 | 
			
		||||
	   VPREFETCH2(5,%r8)					   \
 | 
			
		||||
	   /*42 insns*/						);
 | 
			
		||||
 | 
			
		||||
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \
 | 
			
		||||
  LOAD64(%r8,ptr)						   \
 | 
			
		||||
  LOAD64(%r9,pf)						   \
 | 
			
		||||
  __asm__ (							   \
 | 
			
		||||
           VSHUF(Chi_00,T1)      VSHUF(Chi_10,T2)		   \
 | 
			
		||||
           VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
 | 
			
		||||
           VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
 | 
			
		||||
           VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
 | 
			
		||||
	   /*8*/						   \
 | 
			
		||||
           VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
 | 
			
		||||
           VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
 | 
			
		||||
           VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
 | 
			
		||||
           VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
 | 
			
		||||
	   /*16*/					  	   \
 | 
			
		||||
           VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
 | 
			
		||||
           VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
 | 
			
		||||
           VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
 | 
			
		||||
           /*22*/						   \
 | 
			
		||||
           VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
 | 
			
		||||
           VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
 | 
			
		||||
           VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
 | 
			
		||||
           VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
 | 
			
		||||
	   /*30*/						   \
 | 
			
		||||
           VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
 | 
			
		||||
           VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
 | 
			
		||||
           VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 | 
			
		||||
	   /*36*/					           \
 | 
			
		||||
           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
			
		||||
           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
			
		||||
           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
			
		||||
	   /*	   VPREFETCHG(2,%r8)*/				   \
 | 
			
		||||
	   /*	   VPREFETCHG(3,%r8)*/				   \
 | 
			
		||||
	   /*42 insns*/						);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define Z6 Chi_00
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf)			       \
  LOAD64(%r8,ptr)					       \
  __asm__ (							  \
   VSHUFMEM(0,%r8,Z0)					          \
   VRDUP(Chi_00,T1)           VIDUP(Chi_00,Chi_00)	          \
   VRDUP(Chi_10,T2)           VIDUP(Chi_10,Chi_10)		  \
   VMUL(Z0,Chi_00,Z1)         VMUL(Z0,Chi_10,Z2)		  \
   VSHUFMEM(3,%r8,Z0)						  \
   VMUL(Z0,Chi_00,Z3)         VMUL(Z0,Chi_10,Z4)		  \
   VSHUFMEM(6,%r8,Z0)						  \
   VMUL(Z0,Chi_00,Z5)         VMUL(Z0,Chi_10,Z6)		  \
   VMULMEM(0,%r8,T1,UChi_00)  VMULMEM(0,%r8,T2,UChi_10)		  \
   VMULMEM(3,%r8,T1,UChi_01)  VMULMEM(3,%r8,T2,UChi_11)		  \
   VMULMEM(6,%r8,T1,UChi_02)  VMULMEM(6,%r8,T2,UChi_12)		  \
   /*11 cycles*/						  \
   VSHUFMEM(1,%r8,Z0)						  \
   VRDUP(Chi_01,T1)           VIDUP(Chi_01,Chi_01)		  \
   VRDUP(Chi_11,T2)           VIDUP(Chi_11,Chi_11)		  \
   VMADD(Z0,Chi_01,Z1)        VMADD(Z0,Chi_11,Z2)		  \
   VSHUFMEM(4,%r8,Z0)						  \
   VMADD(Z0,Chi_01,Z3)        VMADD(Z0,Chi_11,Z4)		  \
   VSHUFMEM(7,%r8,Z0)						  \
   VMADD(Z0,Chi_01,Z5)        VMADD(Z0,Chi_11,Z6)		  \
   VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10)	  \
   VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11)	  \
   VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12)	  \
   /*22 cycles*/						  \
   VSHUFMEM(2,%r8,Z0)						  \
   VRDUP(Chi_02,T1)        VIDUP(Chi_02,Chi_02)			  \
   VRDUP(Chi_12,T2)        VIDUP(Chi_12,Chi_12)			  \
   VMADD(Z0,Chi_02,Z1)        VMADD(Z0,Chi_12,Z2)		  \
   VSHUFMEM(5,%r8,Z0)						  \
   VMADD(Z0,Chi_02,Z3)        VMADD(Z0,Chi_12,Z4)		  \
   VSHUFMEM(8,%r8,Z0)						  \
   VMADD(Z0,Chi_02,Z5)        VMADD(Z0,Chi_12,Z6)		  \
   /*33 cycles*/						  \
   VMADDSUBMEM(2,%r8,T1,Z1)   VMADDSUBMEM(2,%r8,T2,Z2)		  \
   VMADDSUBMEM(5,%r8,T1,Z3)   VMADDSUBMEM(5,%r8,T2,Z4)	          \
   VMADDSUBMEM(8,%r8,T1,Z5)   VMADDSUBMEM(8,%r8,T2,Z6)	       \
  /*stall*/						       \
  /*stall*/						       \
  /*stall*/						       \
  VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
  VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
  VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) )

#endif
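
// Editor's note (illustrative sketch, not part of the Grid sources): the *_RECON_ACCUM
// macros above accumulate the half spinor UChi back into the four-spinor "result"; the
// upper two spin components are always a plain add, while the lower two pick up a
// direction- and sign-dependent factor (+1/-1 for TP/TM, +i/-i for the Z-type variants,
// applied lane by lane through the VACCTIMESI*/VACCTIMESMINUSI* primitives).  A scalar
// model of the pattern, with the two lower-component factors passed in explicitly:
#include <complex>
typedef std::complex<double> Cplx;
inline void recon_accum_scalar(const Cplx UChi[2][3], Cplx result[4][3], Cplx f2, Cplx f3) {
  // f2 = f3 = +1 models TP_RECON_ACCUM, f2 = f3 = -1 models TM_RECON_ACCUM,
  // f2 = Cplx(0,+1), f3 = Cplx(0,-1) models the ZM-type reconstruction above.
  for (int c = 0; c < 3; c++) {
    result[0][c] += UChi[0][c];          // upper spin components: plain accumulate
    result[1][c] += UChi[1][c];
    result[2][c] += f2 * UChi[0][c];     // lower spin components: factor times UChi
    result[3][c] += f3 * UChi[1][c];
  }
}
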
@@ -103,9 +103,11 @@ void LebesgueOrder::IterateI(int ND,
    } else {
      for(int d=0;d<ND;d++){
	x[d]=xi[d]+xo[d];
//	std::cout << x[d]<<" ";
      }
//      std::cout << "\n";
      IndexInteger index;
      grid->IndexFromCoor(x,index,grid->_rdimensions);
      Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
      _LebesgueReorder.push_back(index);
    }
  }
@@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
  }
  assert( _LebesgueReorder.size() == vol );

  /*
  std::vector<int> coor(4);
  for(IndexInteger asite=0;asite<vol;asite++){
    grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
		<< coor[3]<<"]"
		<<std::endl;
  }
  */
}
}

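
// Editor's note (illustrative sketch): the hunk above switches LebesgueOrder from the
// grid's member IndexFromCoor to the free Lexicographic helper.  The index convention,
// consistent with the explicit "slex" computation in the cshift tests further down, is
// plain row-major flattening with dimension 0 running fastest:
#include <vector>
inline int lexicographic_index(const std::vector<int> &x, const std::vector<int> &dims) {
  int index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--)
    index = index * dims[d] + x[d];   // index = x[0] + dims[0]*(x[1] + dims[1]*(x[2] + ...))
  return index;
}
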
@@ -44,8 +44,8 @@ template<class vsimd,class scalar>
inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y, 
		    std::vector<scalar *> &extracted,int offset){
  // FIXME: bounce off memory is painful
  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
  int Nextr=extracted.size();
  int Nsimd=vsimd::Nsimd();
  int s=Nsimd/Nextr;

  scalar*buf = (scalar *)y;
@@ -59,8 +59,10 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
template<class vsimd,class scalar>
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y, 
		  std::vector<scalar *> &extracted,int offset){

  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);

  int Nextr=extracted.size();
  int Nsimd=vsimd::Nsimd();
  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
                     // replicate n-fold. Use to allow Integer masks to 
                     // predicate floating point of various width assignments and maintain conformable.
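
// Editor's note (illustrative sketch): extract/merge above view a SIMD vector as an array
// of Nsimd scalars.  When fewer destinations than lanes are requested (Nextr < Nsimd) the
// splat factor s = Nsimd/Nextr is used: extract samples every s-th lane, and merge
// replicates each value s-fold, which is what lets Integer masks predicate wider types.
#include <vector>
template<class scalar>
inline void extract_scalar(const scalar *buf, int Nsimd, std::vector<scalar> &extracted) {
  int Nextr = extracted.size();
  int s     = Nsimd / Nextr;                       // splat factor
  for (int i = 0; i < Nextr; i++) extracted[i] = buf[i * s];
}
template<class scalar>
inline void merge_scalar(scalar *buf, int Nsimd, const std::vector<scalar> &extracted) {
  int Nextr = extracted.size();
  int s     = Nsimd / Nextr;
  for (int i = 0; i < Nextr; i++)
    for (int ii = 0; ii < s; ii++) buf[i * s + ii] = extracted[i];   // replicate s-fold
}
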
@@ -85,6 +87,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
			
		||||
  scalar *buf = (scalar *)&y;
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    extracted[i]=buf[i*s];
 | 
			
		||||
#ifdef PARANOID
 | 
			
		||||
    for(int ii=1;ii<s;ii++){
 | 
			
		||||
      if ( buf[i*s]!=buf[i*s+ii] ){
 | 
			
		||||
	std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
 | 
			
		||||
@@ -96,6 +99,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
			
		||||
      }
 | 
			
		||||
      assert(buf[i*s]==buf[i*s+ii]);
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
@@ -106,7 +110,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type  &y,std::vector<scalar> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  static const int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
  scalar *buf = (scalar *)&y;
 | 
			
		||||
 | 
			
		||||
@@ -125,9 +129,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
 | 
			
		||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nextr);
 | 
			
		||||
@@ -148,8 +152,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  static const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int s = Nsimd/Nextr;
 | 
			
		||||
@@ -172,8 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  int Nextr = extracted.size();
 | 
			
		||||
  int splat=Nsimd/Nextr;
 | 
			
		||||
@@ -197,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
@@ -224,20 +228,17 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
  static const int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  scalar_type *pointer;
 | 
			
		||||
  scalar_type *vp = (scalar_type *)&vec;
 | 
			
		||||
 | 
			
		||||
  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
			
		||||
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    pointer=(scalar_type *)&extracted[i][offset];
 | 
			
		||||
    for(int w=0;w<words;w++){
 | 
			
		||||
      vp[w*Nsimd+i] = pointer[w];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
      vp[w*Nsimd+i] = ((scalar_type *)&extracted[i][offset])[w];
 | 
			
		||||
  }}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline 
 | 
			
		||||
 
 | 
			
		||||
@@ -18,7 +18,7 @@ TESTS=`ls T*.cc`
TESTLIST=`echo ${TESTS} | sed s/.cc//g `

echo > Make.inc
echo bin_PROGRAMS = ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
echo >> Make.inc

for f in $TESTS

@@ -1,5 +1,5 @@
 | 
			
		||||
 | 
			
		||||
bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
 | 
			
		||||
bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Test_cayley_cg_SOURCES=Test_cayley_cg.cc
 | 
			
		||||
@@ -50,6 +50,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 | 
			
		||||
Test_cshift_red_black_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
 | 
			
		||||
Test_cshift_red_black_rotate_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
 | 
			
		||||
Test_cshift_rotate_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 | 
			
		||||
Test_dwf_cg_prec_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
@@ -90,6 +98,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 | 
			
		||||
Test_dwf_lanczos_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
 | 
			
		||||
Test_dwf_rb5d_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Test_gamma_SOURCES=Test_gamma.cc
 | 
			
		||||
Test_gamma_LDADD=-lGrid
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -8,8 +8,20 @@ endif
 | 
			
		||||
AM_CXXFLAGS = -I$(top_srcdir)/lib
 | 
			
		||||
AM_LDFLAGS = -L$(top_builddir)/lib
 | 
			
		||||
 | 
			
		||||
if USE_LAPACK
 | 
			
		||||
AM_CXXFLAGS += -DUSE_LAPACK
 | 
			
		||||
if USE_LAPACK_LIB
 | 
			
		||||
#if test "X${ac_LAPACK}X" != XyesX 
 | 
			
		||||
AM_CXXFLAGS += -I$(ac_LAPACK)/include
 | 
			
		||||
AM_LDFLAGS += -L$(ac_LAPACK)/lib
 | 
			
		||||
#fi
 | 
			
		||||
endif
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
if BUILD_ZMM
 | 
			
		||||
  bin_PROGRAMS=Test_zmm
 | 
			
		||||
else
 | 
			
		||||
  bin_PROGRAMS=
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
include Make.inc
 | 
			
		||||
 
 | 
			
		||||
@@ -96,13 +96,13 @@ int main (int argc, char ** argv)
 | 
			
		||||
	  std::vector<int> peer(4);
 | 
			
		||||
	  Complex tmp  =cm;
 | 
			
		||||
	  Integer index=real(tmp);
 | 
			
		||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
 | 
			
		||||
	  if (nrm > 0){
 | 
			
		||||
	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
	}}}}
 | 
			
		||||
 
 | 
			
		||||
@@ -132,7 +132,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
	  std::vector<int> peer(4);
 | 
			
		||||
	  Complex ctmp = cm;
 | 
			
		||||
	  Integer index=real(ctmp);
 | 
			
		||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
 | 
			
		||||
	  if (nrm > 0){
 | 
			
		||||
	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
			
		||||
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    exit(-1);
 | 
			
		||||
	  }
 | 
			
		||||
@@ -180,7 +180,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
	  std::vector<int> peer(4);
 | 
			
		||||
	  Complex ctmp=cmeo;
 | 
			
		||||
	  Integer index=real(ctmp);
 | 
			
		||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
 | 
			
		||||
	  double nrm = abs(cmeo()()()-scm);
 | 
			
		||||
	  if (nrm != 0) {
 | 
			
		||||
@@ -189,7 +189,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
		     << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    exx=1;
 | 
			
		||||
 | 
			
		||||
@@ -205,7 +205,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    exx=1;
 | 
			
		||||
	  } else if (1) { 
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
tests/Test_cshift_red_black_rotate.cc (new file, +223 lines)
@@ -0,0 +1,223 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_cshift_red_black.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  int Nd = latt_size.size();
 | 
			
		||||
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
 | 
			
		||||
  std::vector<int> mask(Nd,1);
 | 
			
		||||
  mask[0]=0;
 | 
			
		||||
 | 
			
		||||
  GridCartesian         Fine  (latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  LatticeComplex U(&Fine);
 | 
			
		||||
  LatticeComplex ShiftU(&Fine);
 | 
			
		||||
  LatticeComplex rbShiftU(&Fine);
 | 
			
		||||
  LatticeComplex Ue(&RBFine); 
 | 
			
		||||
  LatticeComplex Uo(&RBFine);
 | 
			
		||||
  LatticeComplex ShiftUe(&RBFine);
 | 
			
		||||
  LatticeComplex ShiftUo(&RBFine);
 | 
			
		||||
  LatticeComplex lex(&Fine);
 | 
			
		||||
  lex=zero;
 | 
			
		||||
  Integer stride =1;
 | 
			
		||||
  {
 | 
			
		||||
    double nrm;
 | 
			
		||||
    LatticeComplex coor(&Fine);
 | 
			
		||||
 | 
			
		||||
    for(int d=0;d<Nd;d++){
 | 
			
		||||
      //      Integer i=10000;
 | 
			
		||||
      Integer i=0;
 | 
			
		||||
      LatticeCoordinate(coor,d);
 | 
			
		||||
      lex = lex + coor*stride+i;
 | 
			
		||||
      stride=stride*latt_size[d];
 | 
			
		||||
    }
 | 
			
		||||
    U=lex;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even,Ue,U);
 | 
			
		||||
  pickCheckerboard(Odd,Uo,U);
 | 
			
		||||
 | 
			
		||||
  //  std::cout<<GridLogMessage << U<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  TComplex cm;
 | 
			
		||||
  TComplex cmeo;
 | 
			
		||||
  for(int dir=0;dir<Nd;dir++){
 | 
			
		||||
    //    if ( dir!=1 ) continue;
 | 
			
		||||
    for(int shift=0;shift<latt_size[dir];shift++){
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<<"Even grid"<<std::endl;
 | 
			
		||||
	ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
 | 
			
		||||
	std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
 | 
			
		||||
	ShiftUo = Cshift(Uo,dir,shift);    
 | 
			
		||||
	std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
 | 
			
		||||
	setCheckerboard(rbShiftU,ShiftUe);
 | 
			
		||||
	setCheckerboard(rbShiftU,ShiftUo);
 | 
			
		||||
	std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
 | 
			
		||||
	ShiftU  = Cshift(U,dir,shift);    // Shift everything
 | 
			
		||||
	std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::vector<int> coor(4);
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
 | 
			
		||||
	for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
			
		||||
	for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
			
		||||
	for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
 | 
			
		||||
	for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 | 
			
		||||
	  
 | 
			
		||||
	  peekSite(cm,ShiftU,coor);
 | 
			
		||||
 | 
			
		||||
	  /////////	  double nrm=norm2(U);
 | 
			
		||||
 | 
			
		||||
	  std::vector<int> scoor(coor);
 | 
			
		||||
	  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
 | 
			
		||||
	  
 | 
			
		||||
	  Integer slex = scoor[0]
 | 
			
		||||
	    + latt_size[0]*scoor[1]
 | 
			
		||||
	    + latt_size[0]*latt_size[1]*scoor[2]
 | 
			
		||||
	    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
 | 
			
		||||
 | 
			
		||||
	  Complex scm(slex);
 | 
			
		||||
	  
 | 
			
		||||
	  double nrm = abs(scm-cm()()());
 | 
			
		||||
	  std::vector<int> peer(4);
 | 
			
		||||
	  Complex ctmp = cm;
 | 
			
		||||
	  Integer index=real(ctmp);
 | 
			
		||||
	  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
 | 
			
		||||
	  if (nrm > 0){
 | 
			
		||||
	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
			
		||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
			
		||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    exit(-1);
 | 
			
		||||
	  }
 | 
			
		||||
	}}}}
 | 
			
		||||
 | 
			
		||||
	int exx=0;
 | 
			
		||||
	std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
 | 
			
		||||
	for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
			
		||||
	for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
			
		||||
	for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
 | 
			
		||||
	for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 | 
			
		||||
	  
 | 
			
		||||
	  peekSite(cm,rbShiftU,coor);
 | 
			
		||||
 | 
			
		||||
	  Integer checkerboard = RBFine.CheckerBoard(coor);
 | 
			
		||||
 | 
			
		||||
	  //	  std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
 | 
			
		||||
	  //	  std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
 | 
			
		||||
	  //	  std::cout << "Uo "   << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
 | 
			
		||||
	  if ( checkerboard == ShiftUo.checkerboard ) {
 | 
			
		||||
	    peekSite(cmeo,ShiftUo,coor);
 | 
			
		||||
	  } else { 
 | 
			
		||||
	    peekSite(cmeo,ShiftUe,coor);
 | 
			
		||||
	  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	  std::vector<int> scoor(coor);
 | 
			
		||||
	  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
 | 
			
		||||
	  
 | 
			
		||||
	  Integer slex = scoor[0]
 | 
			
		||||
	    + latt_size[0]*scoor[1]
 | 
			
		||||
	    + latt_size[0]*latt_size[1]*scoor[2]
 | 
			
		||||
	    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
 | 
			
		||||
 | 
			
		||||
	  Complex scm(slex);
 | 
			
		||||
 | 
			
		||||
	  std::vector<int> peer(4);
 | 
			
		||||
	  Complex ctmp=cmeo;
 | 
			
		||||
	  Integer index=real(ctmp);
 | 
			
		||||
	  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
 | 
			
		||||
	  double nrm = abs(cmeo()()()-scm);
 | 
			
		||||
	  if (nrm != 0) {
 | 
			
		||||
	    std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
 | 
			
		||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
			
		||||
		     << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    exx=1;
 | 
			
		||||
 | 
			
		||||
	  }
 | 
			
		||||
 | 
			
		||||
	  ctmp=cm;
 | 
			
		||||
	  index=real(ctmp);
 | 
			
		||||
	  nrm = abs(scm-cm()()());
 | 
			
		||||
 | 
			
		||||
	  if (nrm > 0){
 | 
			
		||||
	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
			
		||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
			
		||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    exx=1;
 | 
			
		||||
	  } else if (1) { 
 | 
			
		||||
	    std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
 | 
			
		||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
			
		||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
	}}}}
 | 
			
		||||
	if (exx) exit(-1);
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
tests/Test_cshift_rotate.cc (new file, +125 lines)
@@ -0,0 +1,125 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_cshift.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
 | 
			
		||||
  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  LatticeComplex U(&Fine);
 | 
			
		||||
  LatticeComplex ShiftU(&Fine);
 | 
			
		||||
 | 
			
		||||
  LatticeComplex lex(&Fine);
 | 
			
		||||
  lex=zero;
 | 
			
		||||
  Integer stride =1;
 | 
			
		||||
  {
 | 
			
		||||
    double nrm;
 | 
			
		||||
    LatticeComplex coor(&Fine);
 | 
			
		||||
 | 
			
		||||
    for(int d=0;d<4;d++){
 | 
			
		||||
      LatticeCoordinate(coor,d);
 | 
			
		||||
      lex = lex + coor*stride;
 | 
			
		||||
      stride=stride*latt_size[d];
 | 
			
		||||
    }
 | 
			
		||||
    U=lex;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  TComplex cm;
 | 
			
		||||
  
 | 
			
		||||
  for(int dir=0;dir<4;dir++){
 | 
			
		||||
    for(int shift=0;shift<latt_size[dir];shift++){
 | 
			
		||||
      if ( Fine.IsBoss() ) 
 | 
			
		||||
	std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
 | 
			
		||||
 | 
			
		||||
	ShiftU  = Cshift(U,dir,shift);    // Shift everything
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	std::cout << "U[0]" << U[0]<<std::endl;
 | 
			
		||||
	std::cout << "U[1]" << U[1]<<std::endl;
 | 
			
		||||
	std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
 | 
			
		||||
	std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
 | 
			
		||||
	*/
 | 
			
		||||
	std::vector<int> coor(4);
 | 
			
		||||
 | 
			
		||||
	for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
			
		||||
	for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
			
		||||
	for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
 | 
			
		||||
	for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 | 
			
		||||
	  
 | 
			
		||||
	  peekSite(cm,ShiftU,coor);
 | 
			
		||||
 | 
			
		||||
	  double nrm=norm2(U);
 | 
			
		||||
 | 
			
		||||
	  std::vector<int> scoor(coor);
 | 
			
		||||
	  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
 | 
			
		||||
	  
 | 
			
		||||
	  Integer slex = scoor[0]
 | 
			
		||||
	    + latt_size[0]*scoor[1]
 | 
			
		||||
	    + latt_size[0]*latt_size[1]*scoor[2]
 | 
			
		||||
	    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
 | 
			
		||||
 | 
			
		||||
	  Complex scm(slex);
 | 
			
		||||
	  
 | 
			
		||||
	  nrm = abs(scm-cm()()());
 | 
			
		||||
	  std::vector<int> peer(4);
 | 
			
		||||
	  Complex tmp  =cm;
 | 
			
		||||
	  Integer index=real(tmp);
 | 
			
		||||
	  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
 | 
			
		||||
	  if (nrm > 0){
 | 
			
		||||
	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	    index=real(scm);
 | 
			
		||||
	    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
			
		||||
	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
	  /*
 | 
			
		||||
	  else {
 | 
			
		||||
	    std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
			
		||||
	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
	  */
 | 
			
		||||
	}}}}
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -42,6 +42,8 @@ public:
 | 
			
		||||
			  int, domaindecompose,
 | 
			
		||||
			  int, domainsize,
 | 
			
		||||
			  int, order,
 | 
			
		||||
			  int, Ls,
 | 
			
		||||
			  double, mq,
 | 
			
		||||
			  double, lo,
 | 
			
		||||
			  double, hi,
 | 
			
		||||
			  int, steps);
 | 
			
		||||
@@ -263,11 +265,6 @@ public:
 | 
			
		||||
      resid = norm2(r) /norm2(src); 
 | 
			
		||||
      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Npoly*outer*2 1/2 vol matmuls.
 | 
			
		||||
// 71 iters => 20*71 = 1400 matmuls.
 | 
			
		||||
// 2*71 = 140 comms.
 | 
			
		||||
 | 
			
		||||
      // Even domain solve
 | 
			
		||||
      r= where(subset==(Integer)0,r,zz);
 | 
			
		||||
      _SmootherOperator.AdjOp(r,vec1);
 | 
			
		||||
@@ -332,7 +329,7 @@ public:
 | 
			
		||||
    CoarseVector Ctmp(_CoarseOperator.Grid());
 | 
			
		||||
    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
 | 
			
		||||
 | 
			
		||||
    ConjugateGradient<CoarseVector>  CG(1.0e-3,100000);
 | 
			
		||||
    ConjugateGradient<CoarseVector>  CG(3.0e-3,100000);
 | 
			
		||||
    //    ConjugateGradient<FineField>    fCG(3.0e-2,1000);
 | 
			
		||||
 | 
			
		||||
    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
 | 
			
		||||
@@ -345,14 +342,14 @@ public:
 | 
			
		||||
 | 
			
		||||
    //    Chebyshev<FineField> Cheby    (0.5,70.0,30,InverseApproximation);
 | 
			
		||||
    //    Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
 | 
			
		||||
    Chebyshev<FineField> Cheby    (2.0,70.0,15,InverseApproximation);
 | 
			
		||||
    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
 | 
			
		||||
    Chebyshev<FineField> Cheby    (params.lo,params.hi,params.order,InverseApproximation);
 | 
			
		||||
    Chebyshev<FineField> ChebyAccu(params.lo,params.hi,params.order,InverseApproximation);
 | 
			
		||||
    //    Cheby.JacksonSmooth();
 | 
			
		||||
    //    ChebyAccu.JacksonSmooth();
 | 
			
		||||
 | 
			
		||||
    _Aggregates.ProjectToSubspace  (Csrc,in);
 | 
			
		||||
    _Aggregates.PromoteFromSubspace(Csrc,out);
 | 
			
		||||
    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
 | 
			
		||||
    //    _Aggregates.ProjectToSubspace  (Csrc,in);
 | 
			
		||||
    //    _Aggregates.PromoteFromSubspace(Csrc,out);
 | 
			
		||||
    //    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    //    ofstream fout("smoother");
 | 
			
		||||
    //    Cheby.csv(fout);
 | 
			
		||||
@@ -479,7 +476,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  read(RD,"params",params);
 | 
			
		||||
  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
 | 
			
		||||
 | 
			
		||||
  const int Ls=8;
 | 
			
		||||
  const int Ls=params.Ls;
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
@@ -490,10 +487,12 @@ int main (int argc, char ** argv)
 | 
			
		||||
  ///////////////////////////////////////////////////
 | 
			
		||||
  // Construct a coarsened grid; utility for this?
 | 
			
		||||
  ///////////////////////////////////////////////////
 | 
			
		||||
  const int block=2;
 | 
			
		||||
  std::vector<int> block ({2,2,2,2});
 | 
			
		||||
  const int nbasis= 32;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> clatt = GridDefaultLatt();
 | 
			
		||||
  for(int d=0;d<clatt.size();d++){
 | 
			
		||||
    clatt[d] = clatt[d]/block;
 | 
			
		||||
    clatt[d] = clatt[d]/block[d];
 | 
			
		||||
  }
 | 
			
		||||
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
 | 
			
		||||
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
 | 
			
		||||
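
// Editor's note (illustrative sketch): the hunk above generalises the coarsening block
// from a single integer to a per-dimension vector, so the coarse lattice is the fine
// lattice divided dimension by dimension (extents assumed divisible by the block size):
#include <vector>
inline std::vector<int> coarsen(std::vector<int> latt, const std::vector<int> &block) {
  for (size_t d = 0; d < latt.size(); d++) latt[d] /= block[d];
  return latt;   // e.g. {16,16,16,16} with block {2,2,2,2} gives {8,8,8,8}
}
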
@@ -539,7 +538,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  //  SU3::HotConfiguration(RNG4,Umu);
 | 
			
		||||
  //  Umu=zero;
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.01;
 | 
			
		||||
  RealD mass=params.mq;
 | 
			
		||||
  RealD M5=1.8;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
@@ -548,9 +547,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
  DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
 | 
			
		||||
  const int nbasis = 32;
 | 
			
		||||
  //  const int nbasis = 4;
 | 
			
		||||
 | 
			
		||||
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis>              Subspace;
 | 
			
		||||
  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>          CoarseOperator;
 | 
			
		||||
  typedef CoarseOperator::CoarseVector                                 CoarseVector;
 | 
			
		||||
@@ -564,7 +560,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  assert ( (nbasis & 0x1)==0);
 | 
			
		||||
  int nb=nbasis/2;
 | 
			
		||||
  std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
 | 
			
		||||
  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
 | 
			
		||||
  //  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
 | 
			
		||||
  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
 | 
			
		||||
  for(int n=0;n<nb;n++){
 | 
			
		||||
    G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
 | 
			
		||||
    std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
 | 
			
		||||
@@ -600,7 +597,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
 | 
			
		||||
  ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
 | 
			
		||||
  CG(PosdefLdop,c_src,c_res);
 | 
			
		||||
  //  CG(PosdefLdop,c_src,c_res);
 | 
			
		||||
 | 
			
		||||
  //  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  //  std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
 | 
			
		||||
@@ -625,17 +622,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  Precon.SmootherTest(src);
 | 
			
		||||
  //  Precon.SmootherTest(src);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  PreconDD.SmootherTest(src);
 | 
			
		||||
  //  PreconDD.SmootherTest(src);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  PreconDD.SAP(src,result);
 | 
			
		||||
  //  PreconDD.SAP(src,result);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
 | 
			
		||||
@@ -663,18 +660,18 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
 | 
			
		||||
  result=zero;
 | 
			
		||||
  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
			
		||||
  PGCRDD(HermIndefOp,src,result);
 | 
			
		||||
  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
 | 
			
		||||
  //  result=zero;
 | 
			
		||||
  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
			
		||||
  //  PGCRDD(HermIndefOp,src,result);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
 | 
			
		||||
  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
			
		||||
  //  result=zero;
 | 
			
		||||
  //  PGCR(HermIndefOp,src,result);
 | 
			
		||||
  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
 | 
			
		||||
  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
			
		||||
  result=zero;
 | 
			
		||||
  PGCR(HermIndefOp,src,result);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
 | 
			
		||||
 