mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge remote-tracking branch 'origin/develop' into temporary-smearing
This commit is contained in:
		
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -94,7 +94,7 @@ Thumbs.db
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# build directory #
 | 
					# build directory #
 | 
				
			||||||
###################
 | 
					###################
 | 
				
			||||||
build/*
 | 
					build*/*
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# IDE related files #
 | 
					# IDE related files #
 | 
				
			||||||
#####################
 | 
					#####################
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										90
									
								
								.travis.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										90
									
								
								.travis.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,90 @@
 | 
				
			|||||||
 | 
					language: cpp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cache:
 | 
				
			||||||
 | 
					  directories:
 | 
				
			||||||
 | 
					    - clang
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					matrix:
 | 
				
			||||||
 | 
					  include:
 | 
				
			||||||
 | 
					    - os:        osx
 | 
				
			||||||
 | 
					      osx_image: xcode7.2
 | 
				
			||||||
 | 
					      compiler: clang
 | 
				
			||||||
 | 
					    - os:        osx
 | 
				
			||||||
 | 
					      osx_image: xcode7.2
 | 
				
			||||||
 | 
					      compiler: gcc
 | 
				
			||||||
 | 
					      env: VERSION=-5
 | 
				
			||||||
 | 
					    - compiler: gcc
 | 
				
			||||||
 | 
					      addons:
 | 
				
			||||||
 | 
					        apt:
 | 
				
			||||||
 | 
					          sources:
 | 
				
			||||||
 | 
					            - ubuntu-toolchain-r-test
 | 
				
			||||||
 | 
					          packages:
 | 
				
			||||||
 | 
					            - g++-4.9
 | 
				
			||||||
 | 
					            - libmpfr-dev
 | 
				
			||||||
 | 
					            - libgmp-dev
 | 
				
			||||||
 | 
					            - libmpc-dev
 | 
				
			||||||
 | 
					            - binutils-dev
 | 
				
			||||||
 | 
					      env: VERSION=-4.9
 | 
				
			||||||
 | 
					    - compiler: gcc
 | 
				
			||||||
 | 
					      addons:
 | 
				
			||||||
 | 
					        apt:
 | 
				
			||||||
 | 
					          sources:
 | 
				
			||||||
 | 
					            - ubuntu-toolchain-r-test
 | 
				
			||||||
 | 
					          packages:
 | 
				
			||||||
 | 
					            - g++-5
 | 
				
			||||||
 | 
					            - libmpfr-dev
 | 
				
			||||||
 | 
					            - libgmp-dev
 | 
				
			||||||
 | 
					            - libmpc-dev
 | 
				
			||||||
 | 
					            - binutils-dev
 | 
				
			||||||
 | 
					      env: VERSION=-5
 | 
				
			||||||
 | 
					    - compiler: clang
 | 
				
			||||||
 | 
					      addons:
 | 
				
			||||||
 | 
					        apt:
 | 
				
			||||||
 | 
					          sources:
 | 
				
			||||||
 | 
					            - ubuntu-toolchain-r-test
 | 
				
			||||||
 | 
					          packages:
 | 
				
			||||||
 | 
					            - g++-4.8
 | 
				
			||||||
 | 
					            - libmpfr-dev
 | 
				
			||||||
 | 
					            - libgmp-dev
 | 
				
			||||||
 | 
					            - libmpc-dev
 | 
				
			||||||
 | 
					            - binutils-dev
 | 
				
			||||||
 | 
					      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 | 
				
			||||||
 | 
					    - compiler: clang
 | 
				
			||||||
 | 
					      addons:
 | 
				
			||||||
 | 
					        apt:
 | 
				
			||||||
 | 
					          sources:
 | 
				
			||||||
 | 
					            - ubuntu-toolchain-r-test
 | 
				
			||||||
 | 
					          packages:
 | 
				
			||||||
 | 
					            - g++-4.8
 | 
				
			||||||
 | 
					            - libmpfr-dev
 | 
				
			||||||
 | 
					            - libgmp-dev
 | 
				
			||||||
 | 
					            - libmpc-dev
 | 
				
			||||||
 | 
					            - binutils-dev
 | 
				
			||||||
 | 
					      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					before_install:
 | 
				
			||||||
 | 
					    - export GRIDDIR=`pwd`
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					install:
 | 
				
			||||||
 | 
					    - export CC=$CC$VERSION
 | 
				
			||||||
 | 
					    - export CXX=$CXX$VERSION
 | 
				
			||||||
 | 
					    - echo $PATH
 | 
				
			||||||
 | 
					    - which $CC
 | 
				
			||||||
 | 
					    - $CC  --version
 | 
				
			||||||
 | 
					    - which $CXX
 | 
				
			||||||
 | 
					    - $CXX --version
 | 
				
			||||||
 | 
					    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					script:
 | 
				
			||||||
 | 
					    - ./scripts/reconfigure_script
 | 
				
			||||||
 | 
					    - mkdir build
 | 
				
			||||||
 | 
					    - cd build
 | 
				
			||||||
 | 
					    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
 | 
				
			||||||
 | 
					    - make -j4
 | 
				
			||||||
 | 
					    - ./benchmarks/Benchmark_dwf --threads 1
 | 
				
			||||||
@@ -1,4 +1,4 @@
 | 
				
			|||||||
# Grid
 | 
					# Grid [Build Status](https://travis-ci.org/paboyle/Grid)
 | 
				
			||||||
Data parallel C++ mathematical object library
 | 
					Data parallel C++ mathematical object library
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Last update 2015/7/30
 | 
					Last update 2015/7/30
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
    *************************************************************************************/
 | 
					    *************************************************************************************/
 | 
				
			||||||
    /*  END LEGAL */
 | 
					    /*  END LEGAL */
 | 
				
			||||||
#include <Grid.h>
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
using namespace std;
 | 
					using namespace std;
 | 
				
			||||||
using namespace Grid;
 | 
					using namespace Grid;
 | 
				
			||||||
@@ -45,6 +46,10 @@ struct scal {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
bool overlapComms = false;
 | 
					bool overlapComms = false;
 | 
				
			||||||
 | 
					typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
 | 
				
			||||||
 | 
					typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
 | 
				
			||||||
 | 
					typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int main (int argc, char ** argv)
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -64,6 +69,12 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
 | 
				
			||||||
 | 
					  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<int> seeds4({1,2,3,4});
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
  std::vector<int> seeds5({5,6,7,8});
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -78,7 +89,9 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  ColourMatrix cm = Complex(1.0,0.0);
 | 
					  ColourMatrix cm = Complex(1.0,0.0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
 | 
					  LatticeGaugeField Umu(UGrid); 
 | 
				
			||||||
 | 
					  random(RNG4,Umu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  LatticeGaugeField Umu5d(FGrid); 
 | 
					  LatticeGaugeField Umu5d(FGrid); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // replicate across fifth dimension
 | 
					  // replicate across fifth dimension
 | 
				
			||||||
@@ -119,14 +132,21 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  RealD NP = UGrid->_Nprocessors;
 | 
					  RealD NP = UGrid->_Nprocessors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int doasm=1;doasm<2;doasm++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    QCD::WilsonKernelsStatic::AsmOpt=doasm;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
				
			||||||
  int ncall=1000;
 | 
					  int ncall =10;
 | 
				
			||||||
  {
 | 
					  if (1) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    double t0=usecond();
 | 
					    double t0=usecond();
 | 
				
			||||||
    for(int i=0;i<ncall;i++){
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      __SSC_START;
 | 
				
			||||||
      Dw.Dhop(src,result,0);
 | 
					      Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					      __SSC_STOP;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    double t1=usecond();
 | 
					    double t1=usecond();
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
@@ -140,9 +160,121 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
					    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
    err = ref-result; 
 | 
					    err = ref-result; 
 | 
				
			||||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
				
			||||||
    Dw.Report();
 | 
					    //    Dw.Report();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (1)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
 | 
				
			||||||
 | 
					    LatticeFermionF ssrc(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermionF sref(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermionF sresult(sFGrid);
 | 
				
			||||||
 | 
					    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    for(int x=0;x<latt4[0];x++){
 | 
				
			||||||
 | 
					    for(int y=0;y<latt4[1];y++){
 | 
				
			||||||
 | 
					    for(int z=0;z<latt4[2];z++){
 | 
				
			||||||
 | 
					    for(int t=0;t<latt4[3];t++){
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					      std::vector<int> site({s,x,y,z,t});
 | 
				
			||||||
 | 
					      SpinColourVectorF tmp;
 | 
				
			||||||
 | 
					      peekSite(tmp,src,site);
 | 
				
			||||||
 | 
					      pokeSite(tmp,ssrc,site);
 | 
				
			||||||
 | 
					    }}}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      __SSC_START;
 | 
				
			||||||
 | 
					      sDw.Dhop(ssrc,sresult,0);
 | 
				
			||||||
 | 
					      __SSC_STOP;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					    double flops=1344*volume*ncall;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
 | 
					    //  sDw.Report();
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    if(0){
 | 
				
			||||||
 | 
					      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
				
			||||||
 | 
						sDw.Dhop(ssrc,sresult,0);
 | 
				
			||||||
 | 
						PerformanceCounter Counter(i);
 | 
				
			||||||
 | 
						Counter.Start();
 | 
				
			||||||
 | 
						sDw.Dhop(ssrc,sresult,0);
 | 
				
			||||||
 | 
						Counter.Stop();
 | 
				
			||||||
 | 
						Counter.Report();
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    RealF sum=0;
 | 
				
			||||||
 | 
					    for(int x=0;x<latt4[0];x++){
 | 
				
			||||||
 | 
					    for(int y=0;y<latt4[1];y++){
 | 
				
			||||||
 | 
					    for(int z=0;z<latt4[2];z++){
 | 
				
			||||||
 | 
					    for(int t=0;t<latt4[3];t++){
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					      std::vector<int> site({s,x,y,z,t});
 | 
				
			||||||
 | 
					      SpinColourVectorF normal, simd;
 | 
				
			||||||
 | 
					      peekSite(normal,result,site);
 | 
				
			||||||
 | 
					      peekSite(simd,sresult,site);
 | 
				
			||||||
 | 
					      sum=sum+norm2(normal-simd);
 | 
				
			||||||
 | 
					      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
 | 
				
			||||||
 | 
					      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
 | 
				
			||||||
 | 
					      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
 | 
				
			||||||
 | 
					    }}}}}
 | 
				
			||||||
 | 
					    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (1) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      LatticeFermionF sr_eo(sFGrid);
 | 
				
			||||||
 | 
					      LatticeFermionF serr(sFGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      LatticeFermion ssrc_e (sFrbGrid);
 | 
				
			||||||
 | 
					      LatticeFermion ssrc_o (sFrbGrid);
 | 
				
			||||||
 | 
					      LatticeFermion sr_e   (sFrbGrid);
 | 
				
			||||||
 | 
					      LatticeFermion sr_o   (sFrbGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      pickCheckerboard(Even,ssrc_e,ssrc);
 | 
				
			||||||
 | 
					      pickCheckerboard(Odd,ssrc_o,ssrc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      setCheckerboard(sr_eo,ssrc_o);
 | 
				
			||||||
 | 
					      setCheckerboard(sr_eo,ssrc_e);
 | 
				
			||||||
 | 
					      serr = sr_eo-ssrc; 
 | 
				
			||||||
 | 
					      std::cout<<GridLogMessage << "EO src norm diff   "<< norm2(serr)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      sr_e = zero;
 | 
				
			||||||
 | 
					      sr_o = zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      double t0=usecond();
 | 
				
			||||||
 | 
					      for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
						sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      double t1=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					      double flops=(1344.0*volume*ncall)/2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
 | 
				
			||||||
 | 
					      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
 | 
				
			||||||
 | 
					      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      pickCheckerboard(Even,ssrc_e,sresult);
 | 
				
			||||||
 | 
					      pickCheckerboard(Odd ,ssrc_o,sresult);
 | 
				
			||||||
 | 
					      ssrc_e = ssrc_e - sr_e;
 | 
				
			||||||
 | 
					      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<<std::endl;
 | 
				
			||||||
 | 
					      ssrc_o = ssrc_o - sr_o;
 | 
				
			||||||
 | 
					      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<<std::endl;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (1)
 | 
					  if (1)
 | 
				
			||||||
  { // Naive wilson dag implementation
 | 
					  { // Naive wilson dag implementation
 | 
				
			||||||
@@ -197,7 +329,6 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					 | 
				
			||||||
  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
					  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
  Dw.DhopOE(src_e,r_o,DaggerNo);
 | 
					  Dw.DhopOE(src_e,r_o,DaggerNo);
 | 
				
			||||||
  Dw.Dhop  (src  ,result,DaggerNo);
 | 
					  Dw.Dhop  (src  ,result,DaggerNo);
 | 
				
			||||||
@@ -217,5 +348,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
 | 
					  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 | 
					  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Grid_finalize();
 | 
					  Grid_finalize();
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										154
									
								
								benchmarks/Benchmark_dwf_ntpf.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										154
									
								
								benchmarks/Benchmark_dwf_ntpf.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,154 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./benchmarks/Benchmark_dwf_ntpf.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace std;
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class d>
 | 
				
			||||||
 | 
					struct scal {
 | 
				
			||||||
 | 
					  d internal;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Gamma::GammaMatrix Gmu [] = {
 | 
				
			||||||
 | 
					    Gamma::GammaX,
 | 
				
			||||||
 | 
					    Gamma::GammaY,
 | 
				
			||||||
 | 
					    Gamma::GammaZ,
 | 
				
			||||||
 | 
					    Gamma::GammaT
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					bool overlapComms = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
 | 
				
			||||||
 | 
					    overlapComms = true;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int threads = GridThread::GetThreads();
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> latt4 = GridDefaultLatt();
 | 
				
			||||||
 | 
					  const int Ls=16;
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion src   (FGrid); random(RNG5,src);
 | 
				
			||||||
 | 
					  LatticeFermion result(FGrid); result=zero;
 | 
				
			||||||
 | 
					  LatticeFermion    ref(FGrid);    ref=zero;
 | 
				
			||||||
 | 
					  LatticeFermion    tmp(FGrid);
 | 
				
			||||||
 | 
					  LatticeFermion    err(FGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ColourMatrix cm = Complex(1.0,0.0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); 
 | 
				
			||||||
 | 
					  random(RNG4,Umu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu5d(FGrid); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // replicate across fifth dimension
 | 
				
			||||||
 | 
					  for(int ss=0;ss<Umu._grid->oSites();ss++){
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////////
 | 
				
			||||||
 | 
					  // Naive wilson implementation
 | 
				
			||||||
 | 
					  ////////////////////////////////////
 | 
				
			||||||
 | 
					  std::vector<LatticeColourMatrix> U(4,FGrid);
 | 
				
			||||||
 | 
					  for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (1)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    ref = zero;
 | 
				
			||||||
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      tmp = U[mu]*Cshift(src,mu+1,1);
 | 
				
			||||||
 | 
					      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      tmp =adj(U[mu])*src;
 | 
				
			||||||
 | 
					      tmp =Cshift(tmp,mu+1,-1);
 | 
				
			||||||
 | 
					      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    ref = -0.5*ref;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5  =1.8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  typename DomainWallFermionR::ImplParams params; 
 | 
				
			||||||
 | 
					  params.overlapCommsCompute = overlapComms;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  RealD NP = UGrid->_Nprocessors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
				
			||||||
 | 
					  int ncall =50;
 | 
				
			||||||
 | 
					  if (1) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					    double flops=1344*volume*ncall;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
 | 
					    err = ref-result; 
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
				
			||||||
 | 
					    //    Dw.Report();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										172
									
								
								benchmarks/Benchmark_zmm.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								benchmarks/Benchmark_zmm.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,172 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./benchmarks/Benchmark_zmm.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main(int argc,char **argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					  std::ofstream os("zmm.dat");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
 | 
				
			||||||
 | 
					  for(int L=4;L<=32;L+=4){
 | 
				
			||||||
 | 
					    for(int m=1;m<=2;m++){
 | 
				
			||||||
 | 
					      for(int Ls=8;Ls<=16;Ls+=8){
 | 
				
			||||||
 | 
						std::vector<int> grid({L,L,m*L,m*L});
 | 
				
			||||||
 | 
						for(int i=0;i<4;i++) { 
 | 
				
			||||||
 | 
						  std::cout << grid[i]<<"x";
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						std::cout << Ls<<std::endl;
 | 
				
			||||||
 | 
						bench(os,grid,Ls);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
				
			||||||
 | 
					  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
				
			||||||
 | 
					  int threads = GridThread::GetThreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion src (FGrid);
 | 
				
			||||||
 | 
					  LatticeFermion tmp (FGrid);
 | 
				
			||||||
 | 
					  LatticeFermion srce(FrbGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion resulto(FrbGrid); resulto=zero;
 | 
				
			||||||
 | 
					  LatticeFermion resulta(FrbGrid); resulta=zero;
 | 
				
			||||||
 | 
					  LatticeFermion junk(FrbGrid); junk=zero;
 | 
				
			||||||
 | 
					  LatticeFermion diff(FrbGrid); 
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  double mfc, mfa, mfo, mfl1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
				
			||||||
 | 
					  random(RNG5,src);
 | 
				
			||||||
 | 
					#if 1
 | 
				
			||||||
 | 
					  random(RNG4,Umu);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					  int mmu=2;
 | 
				
			||||||
 | 
					  std::vector<LatticeColourMatrix> U(4,UGrid);
 | 
				
			||||||
 | 
					  for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
				
			||||||
 | 
					    if ( mu!=mmu ) U[mu] = zero;
 | 
				
			||||||
 | 
					    if ( mu==mmu ) U[mu] = 1.0;
 | 
				
			||||||
 | 
					    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					 pickCheckerboard(Even,srce,src);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5  =1.8;
 | 
				
			||||||
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
				
			||||||
 | 
					  int ncall=50;
 | 
				
			||||||
 | 
					  double t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulto,0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  double t1=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					  double flops=1344*volume/2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  mfc = flops*ncall/(t1-t0);
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
				
			||||||
 | 
					  t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulta,0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  t1=usecond();
 | 
				
			||||||
 | 
					  mfa = flops*ncall/(t1-t0);
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
 | 
				
			||||||
 | 
					  /*
 | 
				
			||||||
 | 
					  int dag=DaggerNo;
 | 
				
			||||||
 | 
					  t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<1;i++){
 | 
				
			||||||
 | 
					    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  t1=usecond();
 | 
				
			||||||
 | 
					  mfo = flops*100/(t1-t0);
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<1;i++){
 | 
				
			||||||
 | 
					    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  t1=usecond();
 | 
				
			||||||
 | 
					  mfl1= flops*100/(t1-t0);
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
 | 
				
			||||||
 | 
					  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
 | 
				
			||||||
 | 
					     << mfc<<" "
 | 
				
			||||||
 | 
					     << mfa<<" "
 | 
				
			||||||
 | 
					     << mfo<<" "
 | 
				
			||||||
 | 
					     << mfl1<<std::endl;
 | 
				
			||||||
 | 
					  */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
 | 
					  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulta,0);
 | 
				
			||||||
 | 
					    PerformanceCounter Counter(i);
 | 
				
			||||||
 | 
					    Counter.Start();
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulta,0);
 | 
				
			||||||
 | 
					    Counter.Stop();
 | 
				
			||||||
 | 
					    Counter.Report();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					  //resulta = (-0.5) * resulta;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  diff = resulto-resulta;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<std::endl;
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -1,5 +1,5 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
 | 
					bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Benchmark_comms_SOURCES=Benchmark_comms.cc
 | 
					Benchmark_comms_SOURCES=Benchmark_comms.cc
 | 
				
			||||||
@@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 | 
				
			|||||||
Benchmark_dwf_LDADD=-lGrid
 | 
					Benchmark_dwf_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
 | 
				
			||||||
 | 
					Benchmark_dwf_ntpf_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 | 
					Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 | 
				
			||||||
Benchmark_memory_asynch_LDADD=-lGrid
 | 
					Benchmark_memory_asynch_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -25,3 +29,7 @@ Benchmark_su3_LDADD=-lGrid
 | 
				
			|||||||
Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 | 
					Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 | 
				
			||||||
Benchmark_wilson_LDADD=-lGrid
 | 
					Benchmark_wilson_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Benchmark_zmm_SOURCES=Benchmark_zmm.cc
 | 
				
			||||||
 | 
					Benchmark_zmm_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										109
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										109
									
								
								configure
									
									
									
									
										vendored
									
									
								
							@@ -626,12 +626,18 @@ ac_subst_vars='am__EXEEXT_FALSE
 | 
				
			|||||||
am__EXEEXT_TRUE
 | 
					am__EXEEXT_TRUE
 | 
				
			||||||
LTLIBOBJS
 | 
					LTLIBOBJS
 | 
				
			||||||
LIBOBJS
 | 
					LIBOBJS
 | 
				
			||||||
 | 
					USE_LAPACK_LIB_FALSE
 | 
				
			||||||
 | 
					USE_LAPACK_LIB_TRUE
 | 
				
			||||||
 | 
					USE_LAPACK_FALSE
 | 
				
			||||||
 | 
					USE_LAPACK_TRUE
 | 
				
			||||||
BUILD_CHROMA_REGRESSION_FALSE
 | 
					BUILD_CHROMA_REGRESSION_FALSE
 | 
				
			||||||
BUILD_CHROMA_REGRESSION_TRUE
 | 
					BUILD_CHROMA_REGRESSION_TRUE
 | 
				
			||||||
BUILD_COMMS_NONE_FALSE
 | 
					BUILD_COMMS_NONE_FALSE
 | 
				
			||||||
BUILD_COMMS_NONE_TRUE
 | 
					BUILD_COMMS_NONE_TRUE
 | 
				
			||||||
BUILD_COMMS_MPI_FALSE
 | 
					BUILD_COMMS_MPI_FALSE
 | 
				
			||||||
BUILD_COMMS_MPI_TRUE
 | 
					BUILD_COMMS_MPI_TRUE
 | 
				
			||||||
 | 
					BUILD_COMMS_SHMEM_FALSE
 | 
				
			||||||
 | 
					BUILD_COMMS_SHMEM_TRUE
 | 
				
			||||||
BUILD_ZMM_FALSE
 | 
					BUILD_ZMM_FALSE
 | 
				
			||||||
BUILD_ZMM_TRUE
 | 
					BUILD_ZMM_TRUE
 | 
				
			||||||
EGREP
 | 
					EGREP
 | 
				
			||||||
@@ -751,7 +757,9 @@ enable_simd
 | 
				
			|||||||
enable_precision
 | 
					enable_precision
 | 
				
			||||||
enable_comms
 | 
					enable_comms
 | 
				
			||||||
enable_rng
 | 
					enable_rng
 | 
				
			||||||
 | 
					enable_timers
 | 
				
			||||||
enable_chroma
 | 
					enable_chroma
 | 
				
			||||||
 | 
					enable_lapack
 | 
				
			||||||
'
 | 
					'
 | 
				
			||||||
      ac_precious_vars='build_alias
 | 
					      ac_precious_vars='build_alias
 | 
				
			||||||
host_alias
 | 
					host_alias
 | 
				
			||||||
@@ -1410,7 +1418,9 @@ Optional Features:
 | 
				
			|||||||
  --enable-comms=none|mpi Select communications
 | 
					  --enable-comms=none|mpi Select communications
 | 
				
			||||||
  --enable-rng=ranlux48|mt19937
 | 
					  --enable-rng=ranlux48|mt19937
 | 
				
			||||||
                          Select Random Number Generator to be used
 | 
					                          Select Random Number Generator to be used
 | 
				
			||||||
 | 
					  --enable-timers=yes|no  Enable system dependent high res timers
 | 
				
			||||||
  --enable-chroma         Expect chroma compiled under c++11
 | 
					  --enable-chroma         Expect chroma compiled under c++11
 | 
				
			||||||
 | 
					  --enable-lapack         Enable lapack yes/no
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Some influential environment variables:
 | 
					Some influential environment variables:
 | 
				
			||||||
  CXX         C++ compiler command
 | 
					  CXX         C++ compiler command
 | 
				
			||||||
@@ -6410,7 +6420,7 @@ if test "${enable_simd+set}" = set; then :
 | 
				
			|||||||
  enableval=$enable_simd; \
 | 
					  enableval=$enable_simd; \
 | 
				
			||||||
	ac_SIMD=${enable_simd}
 | 
						ac_SIMD=${enable_simd}
 | 
				
			||||||
else
 | 
					else
 | 
				
			||||||
  ac_SIMD=AVX2
 | 
					  ac_SIMD=DEBUG
 | 
				
			||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -6477,7 +6487,7 @@ $as_echo "#define AVX512 1" >>confdefs.h
 | 
				
			|||||||
$as_echo "#define IMCI 1" >>confdefs.h
 | 
					$as_echo "#define IMCI 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
       supported="cross compilation"
 | 
					       supported="cross compilation"
 | 
				
			||||||
       ac_ZMM=yes;
 | 
					       ac_ZMM=no;
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
     NEONv8)
 | 
					     NEONv8)
 | 
				
			||||||
       echo Configuring for experimental ARMv8a support
 | 
					       echo Configuring for experimental ARMv8a support
 | 
				
			||||||
@@ -6561,12 +6571,26 @@ $as_echo "#define GRID_COMMS_NONE 1" >>confdefs.h
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
$as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
 | 
					$as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     shmem)
 | 
				
			||||||
 | 
					       echo Configuring for SHMEM communications
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$as_echo "#define GRID_COMMS_SHMEM 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
     *)
 | 
					     *)
 | 
				
			||||||
     as_fn_error $? "${ac_COMMS} unsupported --enable-comms option" "$LINENO" 5;
 | 
					     as_fn_error $? "${ac_COMMS} unsupported --enable-comms option" "$LINENO" 5;
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
esac
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 if  test "X${ac_COMMS}X" == "XshmemX" ; then
 | 
				
			||||||
 | 
					  BUILD_COMMS_SHMEM_TRUE=
 | 
				
			||||||
 | 
					  BUILD_COMMS_SHMEM_FALSE='#'
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  BUILD_COMMS_SHMEM_TRUE='#'
 | 
				
			||||||
 | 
					  BUILD_COMMS_SHMEM_FALSE=
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 if  test "X${ac_COMMS}X" == "XmpiX" ; then
 | 
					 if  test "X${ac_COMMS}X" == "XmpiX" ; then
 | 
				
			||||||
  BUILD_COMMS_MPI_TRUE=
 | 
					  BUILD_COMMS_MPI_TRUE=
 | 
				
			||||||
  BUILD_COMMS_MPI_FALSE='#'
 | 
					  BUILD_COMMS_MPI_FALSE='#'
 | 
				
			||||||
@@ -6610,6 +6634,34 @@ $as_echo "#define RNG_MT19937 1" >>confdefs.h
 | 
				
			|||||||
     as_fn_error $? "${ac_RNG} unsupported --enable-rng option" "$LINENO" 5;
 | 
					     as_fn_error $? "${ac_RNG} unsupported --enable-rng option" "$LINENO" 5;
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
esac
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# SDE timing mode
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Check whether --enable-timers was given.
 | 
				
			||||||
 | 
					if test "${enable_timers+set}" = set; then :
 | 
				
			||||||
 | 
					  enableval=$enable_timers; \
 | 
				
			||||||
 | 
						ac_TIMERS=${enable_timers}
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  ac_TIMERS=yes
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					case ${ac_TIMERS} in
 | 
				
			||||||
 | 
					     yes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$as_echo "#define TIMERS_ON 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     no)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$as_echo "#define TIMERS_OFF 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     *)
 | 
				
			||||||
 | 
					     as_fn_error $? "${ac_TIMERS} unsupported --enable-timers option" "$LINENO" 5;
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Chroma regression tests
 | 
					# Chroma regression tests
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
@@ -6642,6 +6694,46 @@ else
 | 
				
			|||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Lapack
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Check whether --enable-lapack was given.
 | 
				
			||||||
 | 
					if test "${enable_lapack+set}" = set; then :
 | 
				
			||||||
 | 
					  enableval=$enable_lapack; ac_LAPACK=${enable_lapack}
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  ac_LAPACK=no
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					case ${ac_LAPACK} in
 | 
				
			||||||
 | 
					     yes)
 | 
				
			||||||
 | 
					       echo Enabling lapack
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     no)
 | 
				
			||||||
 | 
					       echo Disabling lapack
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     *)
 | 
				
			||||||
 | 
					       echo Enabling lapack at ${ac_LAPACK}
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 if  test "X${ac_LAPACK}X" != "XnoX" ; then
 | 
				
			||||||
 | 
					  USE_LAPACK_TRUE=
 | 
				
			||||||
 | 
					  USE_LAPACK_FALSE='#'
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  USE_LAPACK_TRUE='#'
 | 
				
			||||||
 | 
					  USE_LAPACK_FALSE=
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 if  test "X${ac_LAPACK}X" != "XyesX" ; then
 | 
				
			||||||
 | 
					  USE_LAPACK_LIB_TRUE=
 | 
				
			||||||
 | 
					  USE_LAPACK_LIB_FALSE='#'
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  USE_LAPACK_LIB_TRUE='#'
 | 
				
			||||||
 | 
					  USE_LAPACK_LIB_FALSE=
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
###################################################################
 | 
					###################################################################
 | 
				
			||||||
# Checks for doxygen support
 | 
					# Checks for doxygen support
 | 
				
			||||||
# if present enables the "make doxyfile" command
 | 
					# if present enables the "make doxyfile" command
 | 
				
			||||||
@@ -6809,6 +6901,10 @@ if test -z "${BUILD_ZMM_TRUE}" && test -z "${BUILD_ZMM_FALSE}"; then
 | 
				
			|||||||
  as_fn_error $? "conditional \"BUILD_ZMM\" was never defined.
 | 
					  as_fn_error $? "conditional \"BUILD_ZMM\" was never defined.
 | 
				
			||||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
					Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
				
			||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					if test -z "${BUILD_COMMS_SHMEM_TRUE}" && test -z "${BUILD_COMMS_SHMEM_FALSE}"; then
 | 
				
			||||||
 | 
					  as_fn_error $? "conditional \"BUILD_COMMS_SHMEM\" was never defined.
 | 
				
			||||||
 | 
					Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
 | 
					if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
 | 
				
			||||||
  as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 | 
					  as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 | 
				
			||||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
					Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
				
			||||||
@@ -6821,6 +6917,14 @@ if test -z "${BUILD_CHROMA_REGRESSION_TRUE}" && test -z "${BUILD_CHROMA_REGRESSI
 | 
				
			|||||||
  as_fn_error $? "conditional \"BUILD_CHROMA_REGRESSION\" was never defined.
 | 
					  as_fn_error $? "conditional \"BUILD_CHROMA_REGRESSION\" was never defined.
 | 
				
			||||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
					Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
				
			||||||
fi
 | 
					fi
 | 
				
			||||||
 | 
					if test -z "${USE_LAPACK_TRUE}" && test -z "${USE_LAPACK_FALSE}"; then
 | 
				
			||||||
 | 
					  as_fn_error $? "conditional \"USE_LAPACK\" was never defined.
 | 
				
			||||||
 | 
					Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					if test -z "${USE_LAPACK_LIB_TRUE}" && test -z "${USE_LAPACK_LIB_FALSE}"; then
 | 
				
			||||||
 | 
					  as_fn_error $? "conditional \"USE_LAPACK_LIB\" was never defined.
 | 
				
			||||||
 | 
					Usually this means the macro was only invoked conditionally." "$LINENO" 5
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: "${CONFIG_STATUS=./config.status}"
 | 
					: "${CONFIG_STATUS=./config.status}"
 | 
				
			||||||
ac_write_fail=0
 | 
					ac_write_fail=0
 | 
				
			||||||
@@ -8167,6 +8271,7 @@ The following features are enabled:
 | 
				
			|||||||
- communications type           : ${ac_COMMS}
 | 
					- communications type           : ${ac_COMMS}
 | 
				
			||||||
- default precision             : ${ac_PRECISION}
 | 
					- default precision             : ${ac_PRECISION}
 | 
				
			||||||
- RNG choice                    : ${ac_RNG}
 | 
					- RNG choice                    : ${ac_RNG}
 | 
				
			||||||
 | 
					- LAPACK	                : ${ac_LAPACK}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
"
 | 
					"
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										49
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										49
									
								
								configure.ac
									
									
									
									
									
								
							@@ -71,7 +71,7 @@ AC_CHECK_FUNCS([gettimeofday])
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
 | 
					AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
 | 
				
			||||||
	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 | 
						[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 | 
				
			||||||
	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
 | 
						[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
supported=no
 | 
					supported=no
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -124,7 +124,7 @@ case ${ac_SIMD} in
 | 
				
			|||||||
       echo Configuring for IMCI
 | 
					       echo Configuring for IMCI
 | 
				
			||||||
       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
 | 
					       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
 | 
				
			||||||
       supported="cross compilation"
 | 
					       supported="cross compilation"
 | 
				
			||||||
       ac_ZMM=yes;
 | 
					       ac_ZMM=no;
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
     NEONv8)
 | 
					     NEONv8)
 | 
				
			||||||
       echo Configuring for experimental ARMv8a support 
 | 
					       echo Configuring for experimental ARMv8a support 
 | 
				
			||||||
@@ -178,11 +178,16 @@ case ${ac_COMMS} in
 | 
				
			|||||||
       echo Configuring for MPI communications
 | 
					       echo Configuring for MPI communications
 | 
				
			||||||
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
 | 
					       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
 | 
					     shmem)
 | 
				
			||||||
 | 
					       echo Configuring for SHMEM communications
 | 
				
			||||||
 | 
					       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
     *)
 | 
					     *)
 | 
				
			||||||
     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
 | 
					     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
esac
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 | 
				
			||||||
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 | 
					AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 | 
				
			||||||
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 | 
					AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -203,6 +208,25 @@ case ${ac_RNG} in
 | 
				
			|||||||
     AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
 | 
					     AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
esac
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# SDE timing mode
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
 | 
				
			||||||
 | 
						[Enable system dependent high res timers])],\
 | 
				
			||||||
 | 
						[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
 | 
				
			||||||
 | 
					case ${ac_TIMERS} in
 | 
				
			||||||
 | 
					     yes)
 | 
				
			||||||
 | 
					     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     no)
 | 
				
			||||||
 | 
					     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     *)
 | 
				
			||||||
 | 
					     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Chroma regression tests
 | 
					# Chroma regression tests
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
@@ -222,6 +246,26 @@ esac
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
 | 
					AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Lapack
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					case ${ac_LAPACK} in
 | 
				
			||||||
 | 
					     yes)
 | 
				
			||||||
 | 
					       echo Enabling lapack
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     no)
 | 
				
			||||||
 | 
					       echo Disabling lapack
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     *)
 | 
				
			||||||
 | 
					       echo Enabling lapack at ${ac_LAPACK}
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
 | 
				
			||||||
 | 
					AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
###################################################################
 | 
					###################################################################
 | 
				
			||||||
# Checks for doxygen support
 | 
					# Checks for doxygen support
 | 
				
			||||||
# if present enables the "make doxyfile" command
 | 
					# if present enables the "make doxyfile" command
 | 
				
			||||||
@@ -265,6 +309,7 @@ The following features are enabled:
 | 
				
			|||||||
- communications type           : ${ac_COMMS}
 | 
					- communications type           : ${ac_COMMS}
 | 
				
			||||||
- default precision             : ${ac_PRECISION}
 | 
					- default precision             : ${ac_PRECISION}
 | 
				
			||||||
- RNG choice                    : ${ac_RNG} 
 | 
					- RNG choice                    : ${ac_RNG} 
 | 
				
			||||||
 | 
					- LAPACK	                : ${ac_LAPACK} 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
"
 | 
					"
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -36,11 +36,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include <malloc.h>
 | 
					#include <malloc.h>
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <immintrin.h>
 | 
					 | 
				
			||||||
#ifdef HAVE_MM_MALLOC_H
 | 
					#ifdef HAVE_MM_MALLOC_H
 | 
				
			||||||
#include <mm_malloc.h>
 | 
					#include <mm_malloc.h>
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef GRID_COMMS_SHMEM
 | 
				
			||||||
 | 
					extern "C" { 
 | 
				
			||||||
 | 
					#include <mpp/shmem.h>
 | 
				
			||||||
 | 
					extern void * shmem_align(size_t, size_t);
 | 
				
			||||||
 | 
					extern void  shmem_free(void *);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -72,21 +79,59 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 | 
					  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  pointer allocate(size_type __n, const void* = 0)
 | 
					  pointer allocate(size_type __n, const void* _p= 0)
 | 
				
			||||||
  { 
 | 
					  { 
 | 
				
			||||||
 | 
					#ifdef GRID_COMMS_SHMEM
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define PARANOID_SYMMETRIC_HEAP
 | 
				
			||||||
 | 
					#ifdef PARANOID_SYMMETRIC_HEAP
 | 
				
			||||||
 | 
					    static void * bcast;
 | 
				
			||||||
 | 
					    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    bcast = (void *) ptr;
 | 
				
			||||||
 | 
					    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if ( bcast != ptr ) {
 | 
				
			||||||
 | 
					      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
 | 
				
			||||||
 | 
					      BACKTRACEFILE();
 | 
				
			||||||
 | 
					      exit(0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert( bcast == (void *) ptr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif 
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef HAVE_MM_MALLOC_H
 | 
					#ifdef HAVE_MM_MALLOC_H
 | 
				
			||||||
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 | 
					    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 | 
					    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					    _Tp tmp;
 | 
				
			||||||
 | 
					#undef FIRST_TOUCH_OPTIMISE
 | 
				
			||||||
 | 
					#ifdef FIRST_TOUCH_OPTIMISE
 | 
				
			||||||
 | 
					#pragma omp parallel for 
 | 
				
			||||||
 | 
					  for(int i=0;i<__n;i++){
 | 
				
			||||||
 | 
					    ptr[i]=tmp;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					#endif 
 | 
				
			||||||
    return ptr;
 | 
					    return ptr;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  void deallocate(pointer __p, size_type) { 
 | 
					  void deallocate(pointer __p, size_type) { 
 | 
				
			||||||
 | 
					#ifdef GRID_COMMS_SHMEM
 | 
				
			||||||
 | 
					    shmem_free((void *)__p);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
#ifdef HAVE_MM_MALLOC_H
 | 
					#ifdef HAVE_MM_MALLOC_H
 | 
				
			||||||
    _mm_free((void *)__p); 
 | 
					    _mm_free((void *)__p); 
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    free((void *)__p);
 | 
					    free((void *)__p);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  void construct(pointer __p, const _Tp& __val) { };
 | 
					  void construct(pointer __p, const _Tp& __val) { };
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										180
									
								
								lib/Config.h.in
									
									
									
									
									
								
							
							
						
						
									
										180
									
								
								lib/Config.h.in
									
									
									
									
									
								
							@@ -1,180 +0,0 @@
 | 
				
			|||||||
/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* AVX Intrinsics */
 | 
					 | 
				
			||||||
#undef AVX1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* AVX2 Intrinsics */
 | 
					 | 
				
			||||||
#undef AVX2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* AVX512 Intrinsics for Knights Landing */
 | 
					 | 
				
			||||||
#undef AVX512
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* AVX Intrinsics with FMA4 */
 | 
					 | 
				
			||||||
#undef AVXFMA4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* EMPTY_SIMD only for DEBUGGING */
 | 
					 | 
				
			||||||
#undef EMPTY_SIMD
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* GRID_COMMS_MPI */
 | 
					 | 
				
			||||||
#undef GRID_COMMS_MPI
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* GRID_COMMS_NONE */
 | 
					 | 
				
			||||||
#undef GRID_COMMS_NONE
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* GRID_DEFAULT_PRECISION is DOUBLE */
 | 
					 | 
				
			||||||
#undef GRID_DEFAULT_PRECISION_DOUBLE
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* GRID_DEFAULT_PRECISION is SINGLE */
 | 
					 | 
				
			||||||
#undef GRID_DEFAULT_PRECISION_SINGLE
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support Altivec instructions */
 | 
					 | 
				
			||||||
#undef HAVE_ALTIVEC
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support AVX (Advanced Vector Extensions) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_AVX
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_AVX2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
 | 
					 | 
				
			||||||
   don't. */
 | 
					 | 
				
			||||||
#undef HAVE_DECL_BE64TOH
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
 | 
					 | 
				
			||||||
   */
 | 
					 | 
				
			||||||
#undef HAVE_DECL_NTOHLL
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <endian.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_ENDIAN_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <execinfo.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_EXECINFO_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support FMA3 (Fused Multiply-Add) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_FMA
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the `gettimeofday' function. */
 | 
					 | 
				
			||||||
#undef HAVE_GETTIMEOFDAY
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <gmp.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_GMP_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <inttypes.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_INTTYPES_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <malloc.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_MALLOC_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <malloc/malloc.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_MALLOC_MALLOC_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <memory.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_MEMORY_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support mmx instructions */
 | 
					 | 
				
			||||||
#undef HAVE_MMX
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <mm_malloc.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_MM_MALLOC_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support SSE (Streaming SIMD Extensions) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_SSE
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_SSE2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_SSE3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_SSE4_1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_SSE4_2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
 | 
					 | 
				
			||||||
#undef HAVE_SSSE3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <stdint.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_STDINT_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <stdlib.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_STDLIB_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <strings.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_STRINGS_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <string.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_STRING_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <sys/stat.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_SYS_STAT_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <sys/types.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_SYS_TYPES_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the <unistd.h> header file. */
 | 
					 | 
				
			||||||
#undef HAVE_UNISTD_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* IMCI Intrinsics for Knights Corner */
 | 
					 | 
				
			||||||
#undef IMCI
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* NEON ARMv8 Experimental support */
 | 
					 | 
				
			||||||
#undef NEONv8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Name of package */
 | 
					 | 
				
			||||||
#undef PACKAGE
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the address where bug reports for this package should be sent. */
 | 
					 | 
				
			||||||
#undef PACKAGE_BUGREPORT
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the full name of this package. */
 | 
					 | 
				
			||||||
#undef PACKAGE_NAME
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the full name and version of this package. */
 | 
					 | 
				
			||||||
#undef PACKAGE_STRING
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the one symbol short name of this package. */
 | 
					 | 
				
			||||||
#undef PACKAGE_TARNAME
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the home page for this package. */
 | 
					 | 
				
			||||||
#undef PACKAGE_URL
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the version of this package. */
 | 
					 | 
				
			||||||
#undef PACKAGE_VERSION
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* RNG_MT19937 */
 | 
					 | 
				
			||||||
#undef RNG_MT19937
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* RNG_RANLUX */
 | 
					 | 
				
			||||||
#undef RNG_RANLUX
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* SSE4 Intrinsics */
 | 
					 | 
				
			||||||
#undef SSE4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to 1 if you have the ANSI C header files. */
 | 
					 | 
				
			||||||
#undef STDC_HEADERS
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Version number of package */
 | 
					 | 
				
			||||||
#undef VERSION
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
 | 
					 | 
				
			||||||
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
 | 
					 | 
				
			||||||
   #define below would cause a syntax error. */
 | 
					 | 
				
			||||||
#undef _UINT32_T
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
 | 
					 | 
				
			||||||
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
 | 
					 | 
				
			||||||
   #define below would cause a syntax error. */
 | 
					 | 
				
			||||||
#undef _UINT64_T
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
 | 
					 | 
				
			||||||
#undef size_t
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the type of an unsigned integer type of width exactly 32 bits if
 | 
					 | 
				
			||||||
   such a type exists and the standard includes do not define it. */
 | 
					 | 
				
			||||||
#undef uint32_t
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Define to the type of an unsigned integer type of width exactly 64 bits if
 | 
					 | 
				
			||||||
   such a type exists and the standard includes do not define it. */
 | 
					 | 
				
			||||||
#undef uint64_t
 | 
					 | 
				
			||||||
@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#ifdef GRID_COMMS_MPI
 | 
					#ifdef GRID_COMMS_MPI
 | 
				
			||||||
#include <cshift/Cshift_mpi.h>
 | 
					#include <cshift/Cshift_mpi.h>
 | 
				
			||||||
#endif 
 | 
					#endif 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef GRID_COMMS_SHMEM
 | 
				
			||||||
 | 
					#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
 | 
				
			||||||
 | 
					#endif 
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -62,10 +62,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include <serialisation/Serialisation.h>
 | 
					#include <serialisation/Serialisation.h>
 | 
				
			||||||
#include <Config.h>
 | 
					#include <Config.h>
 | 
				
			||||||
#include <Timer.h>
 | 
					#include <Timer.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
#include <Log.h>
 | 
					#include <Log.h>
 | 
				
			||||||
#include <AlignedAllocator.h>
 | 
					#include <AlignedAllocator.h>
 | 
				
			||||||
#include <Simd.h>
 | 
					#include <Simd.h>
 | 
				
			||||||
#include <Threads.h>
 | 
					#include <Threads.h>
 | 
				
			||||||
 | 
					#include <Lexicographic.h>
 | 
				
			||||||
#include <Communicator.h> 
 | 
					#include <Communicator.h> 
 | 
				
			||||||
#include <Cartesian.h>    
 | 
					#include <Cartesian.h>    
 | 
				
			||||||
#include <Tensors.h>      
 | 
					#include <Tensors.h>      
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										49
									
								
								lib/Init.cc
									
									
									
									
									
								
							
							
						
						
									
										49
									
								
								lib/Init.cc
									
									
									
									
									
								
							@@ -45,12 +45,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include <algorithm>
 | 
					#include <algorithm>
 | 
				
			||||||
#include <iterator>
 | 
					#include <iterator>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define __X86_64
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef HAVE_EXECINFO_H
 | 
					 | 
				
			||||||
#include <execinfo.h>
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////
 | 
				
			||||||
@@ -150,6 +144,10 @@ void GridParseLayout(char **argv,int argc,
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
 | 
					  if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
 | 
				
			||||||
    std::vector<int> ompthreads(0);
 | 
					    std::vector<int> ompthreads(0);
 | 
				
			||||||
 | 
					#ifndef GRID_OMP
 | 
				
			||||||
 | 
					    std::cout << GridLogWarning << "'--threads' option used but Grid was"
 | 
				
			||||||
 | 
					              << " not compiled with thread support" << std::endl;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
 | 
					    arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
 | 
				
			||||||
    GridCmdOptionIntVector(arg,ompthreads);
 | 
					    GridCmdOptionIntVector(arg,ompthreads);
 | 
				
			||||||
    assert(ompthreads.size()==1);
 | 
					    assert(ompthreads.size()==1);
 | 
				
			||||||
@@ -174,9 +172,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 | 
				
			|||||||
/////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////
 | 
				
			||||||
void Grid_init(int *argc,char ***argv)
 | 
					void Grid_init(int *argc,char ***argv)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
#ifdef GRID_COMMS_MPI
 | 
					  CartesianCommunicator::Init(argc,argv);
 | 
				
			||||||
  MPI_Init(argc,argv);
 | 
					
 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  // Parse command line args.
 | 
					  // Parse command line args.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  GridLogger::StopWatch.Start();
 | 
					  GridLogger::StopWatch.Start();
 | 
				
			||||||
@@ -194,9 +191,10 @@ void Grid_init(int *argc,char ***argv)
 | 
				
			|||||||
    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
 | 
					    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
 | 
				
			||||||
    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
 | 
					    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
 | 
				
			||||||
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
 | 
					    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
 | 
				
			||||||
    std::cout<<GridLogMessage<<"--omp n         : default number of OMP threads"<<std::endl;    
 | 
					    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
 | 
				
			||||||
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
 | 
					    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
 | 
				
			||||||
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;    
 | 
					    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
 | 
				
			||||||
 | 
					    exit(EXIT_SUCCESS);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
 | 
					  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
 | 
				
			||||||
@@ -213,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
 | 
				
			|||||||
    Grid_quiesce_nodes();
 | 
					    Grid_quiesce_nodes();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
 | 
					  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
 | 
				
			||||||
    QCD::WilsonFermionStatic::HandOptDslash=1;
 | 
					    QCD::WilsonKernelsStatic::HandOpt=1;
 | 
				
			||||||
    QCD::WilsonFermion5DStatic::HandOptDslash=1;
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
					  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
				
			||||||
    LebesgueOrder::UseLebesgueOrder=1;
 | 
					    LebesgueOrder::UseLebesgueOrder=1;
 | 
				
			||||||
@@ -287,13 +284,7 @@ void Grid_finalize(void)
 | 
				
			|||||||
  Grid_unquiesce_nodes();
 | 
					  Grid_unquiesce_nodes();
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
double usecond(void) {
 | 
					 | 
				
			||||||
  struct timeval tv;
 | 
					 | 
				
			||||||
  gettimeofday(&tv,NULL);
 | 
					 | 
				
			||||||
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define _NBACKTRACE (256)
 | 
					 | 
				
			||||||
void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
					void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
					void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
				
			||||||
@@ -305,11 +296,11 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
				
			|||||||
  // Linux/Posix
 | 
					  // Linux/Posix
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
  // And x86 64bit
 | 
					  // And x86 64bit
 | 
				
			||||||
    ucontext_t * uc= (ucontext_t *)ptr;
 | 
					#ifdef __x86_64__
 | 
				
			||||||
 | 
					  ucontext_t * uc= (ucontext_t *)ptr;
 | 
				
			||||||
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
					  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
				
			||||||
  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 | 
					  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 | 
				
			||||||
#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
 | 
					#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
 | 
				
			||||||
 | 
					 | 
				
			||||||
  REG(rdi);
 | 
					  REG(rdi);
 | 
				
			||||||
  REG(rsi);
 | 
					  REG(rsi);
 | 
				
			||||||
  REG(rbp);
 | 
					  REG(rbp);
 | 
				
			||||||
@@ -330,17 +321,15 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
				
			|||||||
  REG(r14);
 | 
					  REG(r14);
 | 
				
			||||||
  REG(r15);
 | 
					  REG(r15);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#ifdef HAVE_EXECINFO_H
 | 
					 | 
				
			||||||
  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);
 | 
					 | 
				
			||||||
  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
 | 
					 | 
				
			||||||
  for (int i = 0; i < symbols; i++){
 | 
					 | 
				
			||||||
    printf ("%s\n", strings[i]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					  BACKTRACE();
 | 
				
			||||||
  exit(0);
 | 
					  exit(0);
 | 
				
			||||||
  return;
 | 
					  return;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					#ifdef GRID_FPE
 | 
				
			||||||
 | 
					#define _GNU_SOURCE
 | 
				
			||||||
 | 
					#include <fenv.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
void Grid_debug_handler_init(void)
 | 
					void Grid_debug_handler_init(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  struct sigaction sa,osa;
 | 
					  struct sigaction sa,osa;
 | 
				
			||||||
@@ -349,5 +338,9 @@ void Grid_debug_handler_init(void)
 | 
				
			|||||||
  sa.sa_flags    = SA_SIGINFO;
 | 
					  sa.sa_flags    = SA_SIGINFO;
 | 
				
			||||||
  sigaction(SIGSEGV,&sa,NULL);
 | 
					  sigaction(SIGSEGV,&sa,NULL);
 | 
				
			||||||
  sigaction(SIGTRAP,&sa,NULL);
 | 
					  sigaction(SIGTRAP,&sa,NULL);
 | 
				
			||||||
 | 
					#ifdef GRID_FPE
 | 
				
			||||||
 | 
					  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
 | 
				
			||||||
 | 
					  sigaction(SIGFPE,&sa,NULL);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										32
									
								
								lib/Lexicographic.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								lib/Lexicographic.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,32 @@
 | 
				
			|||||||
 | 
					#ifndef GRID_LEXICOGRAPHIC_H
 | 
				
			||||||
 | 
					#define GRID_LEXICOGRAPHIC_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  class Lexicographic {
 | 
				
			||||||
 | 
					  public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
 | 
				
			||||||
 | 
					      int nd= dims.size();
 | 
				
			||||||
 | 
					      coor.resize(nd);
 | 
				
			||||||
 | 
					      for(int d=0;d<nd;d++){
 | 
				
			||||||
 | 
						coor[d] = index % dims[d];
 | 
				
			||||||
 | 
						index   = index / dims[d];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
 | 
				
			||||||
 | 
					      int nd=dims.size();
 | 
				
			||||||
 | 
					      int stride=1;
 | 
				
			||||||
 | 
					      index=0;
 | 
				
			||||||
 | 
					      for(int d=0;d<nd;d++){
 | 
				
			||||||
 | 
						index = index+stride*coor[d];
 | 
				
			||||||
 | 
						stride=stride*dims[d];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
@@ -73,13 +73,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
 | 
				
			|||||||
////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////
 | 
				
			||||||
void Grid_quiesce_nodes(void)
 | 
					void Grid_quiesce_nodes(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
					  int me=0;
 | 
				
			||||||
#ifdef GRID_COMMS_MPI
 | 
					#ifdef GRID_COMMS_MPI
 | 
				
			||||||
  int me;
 | 
					 | 
				
			||||||
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
 | 
					  MPI_Comm_rank(MPI_COMM_WORLD,&me);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef GRID_COMMS_SHMEM
 | 
				
			||||||
 | 
					  me = shmem_my_pe();
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
  if ( me ) { 
 | 
					  if ( me ) { 
 | 
				
			||||||
    std::cout.setstate(std::ios::badbit);
 | 
					    std::cout.setstate(std::ios::badbit);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void Grid_unquiesce_nodes(void)
 | 
					void Grid_unquiesce_nodes(void)
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										191
									
								
								lib/Log.h
									
									
									
									
									
								
							
							
						
						
									
										191
									
								
								lib/Log.h
									
									
									
									
									
								
							@@ -32,75 +32,80 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#ifndef GRID_LOG_H
 | 
					#ifndef GRID_LOG_H
 | 
				
			||||||
#define GRID_LOG_H
 | 
					#define GRID_LOG_H
 | 
				
			||||||
namespace Grid {
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Dress the output; use std::chrono for time stamping via the StopWatch class
 | 
					#ifdef HAVE_EXECINFO_H
 | 
				
			||||||
 | 
					#include <execinfo.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Dress the output; use std::chrono for time stamping via the StopWatch class
 | 
				
			||||||
 | 
					int Rank(void); // used for early stage debug before library init
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  class Colours{
 | 
					class Colours{
 | 
				
			||||||
  protected:
 | 
					protected:
 | 
				
			||||||
    bool is_active;
 | 
					  bool is_active;
 | 
				
			||||||
  public:
 | 
					public:
 | 
				
			||||||
    std::map<std::string, std::string> colour;
 | 
					  std::map<std::string, std::string> colour;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Colours(bool activate=false){
 | 
					  Colours(bool activate=false){
 | 
				
			||||||
      Active(activate);
 | 
					    Active(activate);
 | 
				
			||||||
    };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    void Active(bool activate){
 | 
					 | 
				
			||||||
      is_active=activate;
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      if (is_active){
 | 
					 | 
				
			||||||
	colour["BLACK"]  ="\033[30m";
 | 
					 | 
				
			||||||
	colour["RED"]    ="\033[31m";
 | 
					 | 
				
			||||||
	colour["GREEN"]  ="\033[32m";
 | 
					 | 
				
			||||||
	colour["YELLOW"] ="\033[33m";
 | 
					 | 
				
			||||||
	colour["BLUE"]   ="\033[34m";
 | 
					 | 
				
			||||||
	colour["PURPLE"] ="\033[35m";
 | 
					 | 
				
			||||||
	colour["CYAN"]   ="\033[36m";
 | 
					 | 
				
			||||||
	colour["WHITE"]  ="\033[37m";
 | 
					 | 
				
			||||||
	colour["NORMAL"] ="\033[0;39m";
 | 
					 | 
				
			||||||
      } else {
 | 
					 | 
				
			||||||
      colour["BLACK"] ="";
 | 
					 | 
				
			||||||
      colour["RED"]   ="";
 | 
					 | 
				
			||||||
      colour["GREEN"] ="";
 | 
					 | 
				
			||||||
      colour["YELLOW"]="";
 | 
					 | 
				
			||||||
      colour["BLUE"]  ="";
 | 
					 | 
				
			||||||
      colour["PURPLE"]="";
 | 
					 | 
				
			||||||
      colour["CYAN"]  ="";
 | 
					 | 
				
			||||||
      colour["WHITE"] ="";
 | 
					 | 
				
			||||||
      colour["NORMAL"]="";
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
    };
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  void Active(bool activate){
 | 
				
			||||||
 | 
					    is_active=activate;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  class Logger {
 | 
					    if (is_active){
 | 
				
			||||||
  protected:
 | 
					     colour["BLACK"]  ="\033[30m";
 | 
				
			||||||
    Colours &Painter;
 | 
					     colour["RED"]    ="\033[31m";
 | 
				
			||||||
    int active;
 | 
					     colour["GREEN"]  ="\033[32m";
 | 
				
			||||||
    std::string name, topName;
 | 
					     colour["YELLOW"] ="\033[33m";
 | 
				
			||||||
    std::string COLOUR;
 | 
					     colour["BLUE"]   ="\033[34m";
 | 
				
			||||||
 | 
					     colour["PURPLE"] ="\033[35m";
 | 
				
			||||||
 | 
					     colour["CYAN"]   ="\033[36m";
 | 
				
			||||||
 | 
					     colour["WHITE"]  ="\033[37m";
 | 
				
			||||||
 | 
					     colour["NORMAL"] ="\033[0;39m";
 | 
				
			||||||
 | 
					   } else {
 | 
				
			||||||
 | 
					    colour["BLACK"] ="";
 | 
				
			||||||
 | 
					    colour["RED"]   ="";
 | 
				
			||||||
 | 
					    colour["GREEN"] ="";
 | 
				
			||||||
 | 
					    colour["YELLOW"]="";
 | 
				
			||||||
 | 
					    colour["BLUE"]  ="";
 | 
				
			||||||
 | 
					    colour["PURPLE"]="";
 | 
				
			||||||
 | 
					    colour["CYAN"]  ="";
 | 
				
			||||||
 | 
					    colour["WHITE"] ="";
 | 
				
			||||||
 | 
					    colour["NORMAL"]="";
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static GridStopWatch StopWatch;
 | 
					 | 
				
			||||||
    static std::ostream devnull;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::string background() {return Painter.colour["NORMAL"];}
 | 
					};
 | 
				
			||||||
    std::string evidence() {return Painter.colour["YELLOW"];}
 | 
					 | 
				
			||||||
    std::string colour() {return Painter.colour[COLOUR];}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
 | 
					};
 | 
				
			||||||
      : active(on),
 | 
					
 | 
				
			||||||
	name(nm),
 | 
					
 | 
				
			||||||
	topName(topNm),
 | 
					class Logger {
 | 
				
			||||||
	Painter(col_class),
 | 
					protected:
 | 
				
			||||||
	COLOUR(col){} ;
 | 
					  Colours &Painter;
 | 
				
			||||||
 | 
					  int active;
 | 
				
			||||||
 | 
					  std::string name, topName;
 | 
				
			||||||
 | 
					  std::string COLOUR;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					  static GridStopWatch StopWatch;
 | 
				
			||||||
 | 
					  static std::ostream devnull;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::string background() {return Painter.colour["NORMAL"];}
 | 
				
			||||||
 | 
					  std::string evidence() {return Painter.colour["YELLOW"];}
 | 
				
			||||||
 | 
					  std::string colour() {return Painter.colour[COLOUR];}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
 | 
				
			||||||
 | 
					  : active(on),
 | 
				
			||||||
 | 
					  name(nm),
 | 
				
			||||||
 | 
					  topName(topNm),
 | 
				
			||||||
 | 
					  Painter(col_class),
 | 
				
			||||||
 | 
					  COLOUR(col){} ;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  void Active(int on) {active = on;};
 | 
					  void Active(int on) {active = on;};
 | 
				
			||||||
  int  isActive(void) {return active;};
 | 
					  int  isActive(void) {return active;};
 | 
				
			||||||
@@ -108,36 +113,68 @@ namespace Grid {
 | 
				
			|||||||
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 | 
					  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if ( log.active ) {
 | 
					    if ( log.active ) {
 | 
				
			||||||
            StopWatch.Stop();
 | 
					      StopWatch.Stop();
 | 
				
			||||||
            GridTime now = StopWatch.Elapsed();
 | 
					      GridTime now = StopWatch.Elapsed();
 | 
				
			||||||
            StopWatch.Start();
 | 
					      StopWatch.Start();
 | 
				
			||||||
            stream << log.background()<< log.topName << log.background()<< " : ";
 | 
					      stream << log.background()<< log.topName << log.background()<< " : ";
 | 
				
			||||||
            stream << log.colour() <<std::setw(10) << std::left << log.name << log.background() << " : ";
 | 
					      stream << log.colour() <<std::setw(10) << std::left << log.name << log.background() << " : ";
 | 
				
			||||||
            stream << log.evidence()<< now << log.background() << " : " << log.colour();
 | 
					      stream << log.evidence()<< now << log.background() << " : " << log.colour();
 | 
				
			||||||
            return stream;
 | 
					      return stream;
 | 
				
			||||||
        } else { 
 | 
					    } else { 
 | 
				
			||||||
            return devnull;
 | 
					      return devnull;
 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class GridLogger: public Logger {
 | 
					class GridLogger: public Logger {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
 | 
					  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
 | 
				
			||||||
    Logger("Grid", on, nm, col_class, col_key){};
 | 
					  Logger("Grid", on, nm, col_class, col_key){};
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void GridLogConfigure(std::vector<std::string> &logstreams);
 | 
					void GridLogConfigure(std::vector<std::string> &logstreams);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  extern GridLogger GridLogError;
 | 
					extern GridLogger GridLogError;
 | 
				
			||||||
  extern GridLogger GridLogWarning;
 | 
					extern GridLogger GridLogWarning;
 | 
				
			||||||
  extern GridLogger GridLogMessage;
 | 
					extern GridLogger GridLogMessage;
 | 
				
			||||||
  extern GridLogger GridLogDebug  ;
 | 
					extern GridLogger GridLogDebug  ;
 | 
				
			||||||
  extern GridLogger GridLogPerformance;
 | 
					extern GridLogger GridLogPerformance;
 | 
				
			||||||
  extern GridLogger GridLogIterative  ;
 | 
					extern GridLogger GridLogIterative  ;
 | 
				
			||||||
  extern GridLogger GridLogIntegrator  ;
 | 
					extern GridLogger GridLogIntegrator  ;
 | 
				
			||||||
  extern Colours    GridLogColours;
 | 
					extern Colours    GridLogColours;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define _NBACKTRACE (256)
 | 
				
			||||||
 | 
					extern void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define BACKTRACEFILE() {\
 | 
				
			||||||
 | 
					char string[20];					\
 | 
				
			||||||
 | 
					std::sprintf(string,"backtrace.%d",Rank());				\
 | 
				
			||||||
 | 
					std::FILE * fp = std::fopen(string,"w");				\
 | 
				
			||||||
 | 
					BACKTRACEFP(fp)\
 | 
				
			||||||
 | 
					std::fclose(fp);	    \
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef HAVE_EXECINFO_H
 | 
				
			||||||
 | 
					#define BACKTRACEFP(fp) { \
 | 
				
			||||||
 | 
					int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
 | 
				
			||||||
 | 
					char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
 | 
				
			||||||
 | 
					for (int i = 0; i < symbols; i++){\
 | 
				
			||||||
 | 
					  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
 | 
				
			||||||
 | 
					}\
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#else 
 | 
				
			||||||
 | 
					#define BACKTRACEFP(fp) { \
 | 
				
			||||||
 | 
					std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
 | 
				
			||||||
 | 
					std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
 | 
				
			||||||
 | 
					std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
 | 
				
			||||||
 | 
					std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define BACKTRACE() BACKTRACEFP(stdout) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							@@ -6,6 +6,10 @@ if BUILD_COMMS_MPI
 | 
				
			|||||||
  extra_sources+=communicator/Communicator_mpi.cc
 | 
					  extra_sources+=communicator/Communicator_mpi.cc
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if BUILD_COMMS_SHMEM
 | 
				
			||||||
 | 
					  extra_sources+=communicator/Communicator_shmem.cc
 | 
				
			||||||
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if BUILD_COMMS_NONE
 | 
					if BUILD_COMMS_NONE
 | 
				
			||||||
  extra_sources+=communicator/Communicator_none.cc
 | 
					  extra_sources+=communicator/Communicator_none.cc
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										
											BIN
										
									
								
								lib/Old/Endeavour.tgz
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								lib/Old/Endeavour.tgz
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 | 
					#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 | 
				
			||||||
 | 
					#define RawConfig(A,B) (A<<8|B)
 | 
				
			||||||
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 | 
					const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." },
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." , INSTRUCTIONS},
 | 
				
			||||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." },
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." , CACHE_REFERENCES},
 | 
				
			||||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." },
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
 | 
				
			||||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." },
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......"},
 | 
					    // 4
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS...."},
 | 
					#ifdef AVX512
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS....."},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS..."},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS.."},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS"},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS  },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS......."},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS  },
 | 
				
			||||||
  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS....."},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......"},
 | 
					    { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS...."},
 | 
					    // 11
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS..."},
 | 
					#else
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS."},
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS....",INSTRUCTIONS},
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......"},
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......",L1D_READ_ACCESS},
 | 
				
			||||||
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS...."}
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					    // 11
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS.......",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS.....",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					    //15
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......",INSTRUCTIONS},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS....",INSTRUCTIONS}
 | 
				
			||||||
 | 
					    //19
 | 
				
			||||||
  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 | 
					  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										112
									
								
								lib/PerfCount.h
									
									
									
									
									
								
							
							
						
						
									
										112
									
								
								lib/PerfCount.h
									
									
									
									
									
								
							@@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include <ctime>
 | 
					#include <ctime>
 | 
				
			||||||
#include <chrono>
 | 
					#include <chrono>
 | 
				
			||||||
#include <string.h>
 | 
					#include <string.h>
 | 
				
			||||||
 | 
					#include <unistd.h>
 | 
				
			||||||
#include <sys/ioctl.h>
 | 
					#include <sys/ioctl.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
@@ -43,8 +43,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#else
 | 
					#else
 | 
				
			||||||
#include <sys/syscall.h>
 | 
					#include <sys/syscall.h>
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
namespace Grid {
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 | 
					static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 | 
				
			||||||
@@ -58,6 +58,49 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef TIMERS_OFF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					inline uint64_t cyclecount(void){ 
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 | 
				
			||||||
 | 
					#define __SSC_STOP  __SSC_MARK(0x110)
 | 
				
			||||||
 | 
					#define __SSC_START __SSC_MARK(0x111)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define __SSC_MARK(mark) 
 | 
				
			||||||
 | 
					#define __SSC_STOP  
 | 
				
			||||||
 | 
					#define __SSC_START 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * cycle counters arch dependent
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef __bgq__
 | 
				
			||||||
 | 
					inline uint64_t cyclecount(void){ 
 | 
				
			||||||
 | 
					   uint64_t tmp;
 | 
				
			||||||
 | 
					   asm volatile ("mfspr %0,0x10C" : "=&r" (tmp)  );
 | 
				
			||||||
 | 
					   return tmp;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#elif defined __x86_64__
 | 
				
			||||||
 | 
					#include <x86intrin.h>
 | 
				
			||||||
 | 
					inline uint64_t cyclecount(void){ 
 | 
				
			||||||
 | 
					  return __rdtsc();
 | 
				
			||||||
 | 
					  //  unsigned int dummy;
 | 
				
			||||||
 | 
					  // return __rdtscp(&dummy);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					inline uint64_t cyclecount(void){ 
 | 
				
			||||||
 | 
					   return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class PerformanceCounter {
 | 
					class PerformanceCounter {
 | 
				
			||||||
private:
 | 
					private:
 | 
				
			||||||
@@ -67,6 +110,7 @@ private:
 | 
				
			|||||||
    uint32_t type;
 | 
					    uint32_t type;
 | 
				
			||||||
    uint64_t config;
 | 
					    uint64_t config;
 | 
				
			||||||
    const char *name;
 | 
					    const char *name;
 | 
				
			||||||
 | 
					    int normalisation;
 | 
				
			||||||
  } PerformanceCounterConfig; 
 | 
					  } PerformanceCounterConfig; 
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  static const PerformanceCounterConfig PerformanceCounterConfigs [];
 | 
					  static const PerformanceCounterConfig PerformanceCounterConfigs [];
 | 
				
			||||||
@@ -74,26 +118,12 @@ private:
 | 
				
			|||||||
public:
 | 
					public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  enum PerformanceCounterType {
 | 
					  enum PerformanceCounterType {
 | 
				
			||||||
    CPUCYCLES=0,
 | 
					    CACHE_REFERENCES=0,
 | 
				
			||||||
    INSTRUCTIONS,
 | 
					    CACHE_MISSES=1,
 | 
				
			||||||
    //    STALL_CYCLES,
 | 
					    CPUCYCLES=2,
 | 
				
			||||||
    CACHE_REFERENCES,
 | 
					    INSTRUCTIONS=3,
 | 
				
			||||||
    CACHE_MISSES,
 | 
					    L1D_READ_ACCESS=4,
 | 
				
			||||||
    L1D_READ_MISS,
 | 
					    PERFORMANCE_COUNTER_NUM_TYPES=19
 | 
				
			||||||
    L1D_READ_ACCESS,
 | 
					 | 
				
			||||||
    L1D_WRITE_MISS,
 | 
					 | 
				
			||||||
    L1D_WRITE_ACCESS,
 | 
					 | 
				
			||||||
    L1D_PREFETCH_MISS,
 | 
					 | 
				
			||||||
    L1D_PREFETCH_ACCESS,
 | 
					 | 
				
			||||||
    LL_READ_MISS,
 | 
					 | 
				
			||||||
    //    LL_READ_ACCESS,
 | 
					 | 
				
			||||||
    LL_WRITE_MISS,
 | 
					 | 
				
			||||||
    LL_WRITE_ACCESS,
 | 
					 | 
				
			||||||
    LL_PREFETCH_MISS,
 | 
					 | 
				
			||||||
    LL_PREFETCH_ACCESS,
 | 
					 | 
				
			||||||
    L1I_READ_MISS,
 | 
					 | 
				
			||||||
    L1I_READ_ACCESS,
 | 
					 | 
				
			||||||
    PERFORMANCE_COUNTER_NUM_TYPES
 | 
					 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
@@ -101,7 +131,9 @@ public:
 | 
				
			|||||||
  int PCT;
 | 
					  int PCT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  long long count;
 | 
					  long long count;
 | 
				
			||||||
 | 
					  long long cycles;
 | 
				
			||||||
  int fd;
 | 
					  int fd;
 | 
				
			||||||
 | 
					  int cyclefd;
 | 
				
			||||||
  unsigned long long elapsed;
 | 
					  unsigned long long elapsed;
 | 
				
			||||||
  uint64_t begin;
 | 
					  uint64_t begin;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -114,7 +146,9 @@ public:
 | 
				
			|||||||
    assert(_pct>=0);
 | 
					    assert(_pct>=0);
 | 
				
			||||||
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
 | 
					    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
 | 
				
			||||||
    fd=-1;
 | 
					    fd=-1;
 | 
				
			||||||
 | 
					    cyclefd=-1;
 | 
				
			||||||
    count=0;
 | 
					    count=0;
 | 
				
			||||||
 | 
					    cycles=0;
 | 
				
			||||||
    PCT =_pct;
 | 
					    PCT =_pct;
 | 
				
			||||||
    Open();
 | 
					    Open();
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
@@ -139,6 +173,15 @@ public:
 | 
				
			|||||||
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 | 
					      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 | 
				
			||||||
      perror("Error is");
 | 
					      perror("Error is");
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    int norm = PerformanceCounterConfigs[PCT].normalisation;
 | 
				
			||||||
 | 
					    pe.type  = PerformanceCounterConfigs[norm].type;
 | 
				
			||||||
 | 
					    pe.config= PerformanceCounterConfigs[norm].config;
 | 
				
			||||||
 | 
					    name = PerformanceCounterConfigs[norm].name;
 | 
				
			||||||
 | 
					    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
 | 
				
			||||||
 | 
					    if (cyclefd == -1) {
 | 
				
			||||||
 | 
					      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 | 
				
			||||||
 | 
					      perror("Error is");
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -146,10 +189,12 @@ public:
 | 
				
			|||||||
  {
 | 
					  {
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
    if ( fd!= -1) {
 | 
					    if ( fd!= -1) {
 | 
				
			||||||
      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
					      ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
				
			||||||
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
					      ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
				
			||||||
 | 
					      ::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
 | 
				
			||||||
 | 
					      ::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    begin  =__rdtsc();
 | 
					    begin  =cyclecount();
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    begin = 0;
 | 
					    begin = 0;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
@@ -157,12 +202,15 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  void Stop(void) {
 | 
					  void Stop(void) {
 | 
				
			||||||
    count=0;
 | 
					    count=0;
 | 
				
			||||||
 | 
					    cycles=0;
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
    if ( fd!= -1) {
 | 
					    if ( fd!= -1) {
 | 
				
			||||||
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
					      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
				
			||||||
 | 
					      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
 | 
				
			||||||
      ::read(fd, &count, sizeof(long long));
 | 
					      ::read(fd, &count, sizeof(long long));
 | 
				
			||||||
 | 
					      ::read(cyclefd, &cycles, sizeof(long long));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    elapsed = __rdtsc() - begin;
 | 
					    elapsed = cyclecount() - begin;
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    elapsed = 0;
 | 
					    elapsed = 0;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
@@ -170,16 +218,20 @@ public:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  void Report(void) {
 | 
					  void Report(void) {
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 | 
					    int N = PerformanceCounterConfigs[PCT].normalisation;
 | 
				
			||||||
 | 
					    const char * sn = PerformanceCounterConfigs[N].name ;
 | 
				
			||||||
 | 
					    const char * sc = PerformanceCounterConfigs[PCT].name;
 | 
				
			||||||
 | 
					      std::printf("tsc = %llu %s = %llu  %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, 
 | 
				
			||||||
 | 
							  sc, count, sc,sn, (double)count/(double)cycles);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    printf("%llu cycles \n", elapsed );
 | 
					    std::printf("%llu cycles \n", elapsed );
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ~PerformanceCounter()
 | 
					  ~PerformanceCounter()
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
#ifdef __linux__
 | 
					#ifdef __linux__
 | 
				
			||||||
    close(fd);
 | 
					    ::close(fd);    ::close(cyclefd);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -42,10 +42,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
 | 
					#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
 | 
				
			||||||
 | 
					#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 | 
				
			||||||
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 | 
					#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 | 
				
			||||||
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 | 
					#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 | 
				
			||||||
#define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
 | 
					#define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define RotateBit (0x100)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  typedef uint32_t Integer;
 | 
					  typedef uint32_t Integer;
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										1594
									
								
								lib/Stencil.h
									
									
									
									
									
								
							
							
						
						
									
										1594
									
								
								lib/Stencil.h
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										27
									
								
								lib/Timer.h
									
									
									
									
									
								
							
							
						
						
									
										27
									
								
								lib/Timer.h
									
									
									
									
									
								
							@@ -39,11 +39,18 @@ namespace Grid {
 | 
				
			|||||||
  // Dress the output; use std::chrono
 | 
					  // Dress the output; use std::chrono
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// C++11 time facilities better?
 | 
					// C++11 time facilities better?
 | 
				
			||||||
double usecond(void);
 | 
					inline double usecond(void) {
 | 
				
			||||||
 | 
					  struct timeval tv;
 | 
				
			||||||
 | 
					#ifdef TIMERS_ON
 | 
				
			||||||
 | 
					  gettimeofday(&tv,NULL);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef  std::chrono::system_clock          GridClock;
 | 
					typedef  std::chrono::system_clock          GridClock;
 | 
				
			||||||
typedef  std::chrono::time_point<GridClock> GridTimePoint;
 | 
					typedef  std::chrono::time_point<GridClock> GridTimePoint;
 | 
				
			||||||
typedef  std::chrono::milliseconds          GridTime;
 | 
					typedef  std::chrono::milliseconds          GridTime;
 | 
				
			||||||
 | 
					typedef  std::chrono::microseconds          GridUsecs;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 | 
					inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -55,29 +62,39 @@ class GridStopWatch {
 | 
				
			|||||||
private:
 | 
					private:
 | 
				
			||||||
  bool running;
 | 
					  bool running;
 | 
				
			||||||
  GridTimePoint start;
 | 
					  GridTimePoint start;
 | 
				
			||||||
  GridTime accumulator;
 | 
					  GridUsecs accumulator;
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  GridStopWatch () { 
 | 
					  GridStopWatch () { 
 | 
				
			||||||
    Reset();
 | 
					    Reset();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  void     Start(void) { 
 | 
					  void     Start(void) { 
 | 
				
			||||||
    assert(running == false);
 | 
					    assert(running == false);
 | 
				
			||||||
 | 
					#ifdef TIMERS_ON
 | 
				
			||||||
    start = GridClock::now(); 
 | 
					    start = GridClock::now(); 
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
    running = true;
 | 
					    running = true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  void     Stop(void)  { 
 | 
					  void     Stop(void)  { 
 | 
				
			||||||
    assert(running == true);
 | 
					    assert(running == true);
 | 
				
			||||||
    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start); 
 | 
					#ifdef TIMERS_ON
 | 
				
			||||||
 | 
					    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start); 
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
    running = false; 
 | 
					    running = false; 
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  void     Reset(void){
 | 
					  void     Reset(void){
 | 
				
			||||||
    running = false;
 | 
					    running = false;
 | 
				
			||||||
 | 
					#ifdef TIMERS_ON
 | 
				
			||||||
    start = GridClock::now();
 | 
					    start = GridClock::now();
 | 
				
			||||||
    accumulator = std::chrono::duration_cast<GridTime>(start-start); 
 | 
					#endif
 | 
				
			||||||
 | 
					    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  GridTime Elapsed(void) {
 | 
					  GridTime Elapsed(void) {
 | 
				
			||||||
    assert(running == false);
 | 
					    assert(running == false);
 | 
				
			||||||
    return accumulator;
 | 
					    return std::chrono::duration_cast<GridTime>( accumulator );
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  uint64_t useconds(void){
 | 
				
			||||||
 | 
					    assert(running == false);
 | 
				
			||||||
 | 
					    return (uint64_t) accumulator.count();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -147,6 +147,56 @@ namespace Grid {
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
      Orthogonalise();
 | 
					      Orthogonalise();
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      // Run a Lanczos with sloppy convergence
 | 
				
			||||||
 | 
						const int Nstop = nn;
 | 
				
			||||||
 | 
						const int Nk = nn+20;
 | 
				
			||||||
 | 
						const int Np = nn+20;
 | 
				
			||||||
 | 
						const int Nm = Nk+Np;
 | 
				
			||||||
 | 
						const int MaxIt= 10000;
 | 
				
			||||||
 | 
						RealD resid = 1.0e-3;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Chebyshev<FineField> Cheb(0.5,64.0,21);
 | 
				
			||||||
 | 
						ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
 | 
				
			||||||
 | 
						//	IRL.lock = 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						FineField noise(FineGrid); gaussian(RNG,noise);
 | 
				
			||||||
 | 
						FineField tmp(FineGrid); 
 | 
				
			||||||
 | 
						std::vector<RealD>     eval(Nm);
 | 
				
			||||||
 | 
						std::vector<FineField> evec(Nm,FineGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int Nconv;
 | 
				
			||||||
 | 
						IRL.calc(eval,evec,
 | 
				
			||||||
 | 
							 noise,
 | 
				
			||||||
 | 
							 Nconv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    	// pull back nn vectors
 | 
				
			||||||
 | 
						for(int b=0;b<nn;b++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  subspace[b]   = evec[b];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  hermop.Op(subspace[b],tmp); 
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  noise = tmp -  sqrt(eval[b])*subspace[b] ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  noise = tmp +  eval[b]*subspace[b] ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						Orthogonalise();
 | 
				
			||||||
 | 
						for(int b=0;b<nn;b++){
 | 
				
			||||||
 | 
						  std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
 | 
					    virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      RealD scale;
 | 
					      RealD scale;
 | 
				
			||||||
@@ -200,7 +250,7 @@ namespace Grid {
 | 
				
			|||||||
    ////////////////////
 | 
					    ////////////////////
 | 
				
			||||||
    Geometry         geom;
 | 
					    Geometry         geom;
 | 
				
			||||||
    GridBase *       _grid; 
 | 
					    GridBase *       _grid; 
 | 
				
			||||||
    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil; 
 | 
					    CartesianStencil<siteVector,siteVector> Stencil; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::vector<CoarseMatrix> A;
 | 
					    std::vector<CoarseMatrix> A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -222,6 +222,7 @@ namespace Grid {
 | 
				
			|||||||
      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
 | 
					      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
 | 
				
			||||||
      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
					      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
				
			||||||
	Field tmp(in._grid);
 | 
						Field tmp(in._grid);
 | 
				
			||||||
 | 
					//	std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	_Mat.Meooe(in,tmp);
 | 
						_Mat.Meooe(in,tmp);
 | 
				
			||||||
	_Mat.MooeeInv(tmp,out);
 | 
						_Mat.MooeeInv(tmp,out);
 | 
				
			||||||
@@ -251,10 +252,10 @@ namespace Grid {
 | 
				
			|||||||
      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
					      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
				
			||||||
	Field tmp(in._grid);
 | 
						Field tmp(in._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	_Mat.Meooe(in,tmp);
 | 
						_Mat.Meooe(in,out);
 | 
				
			||||||
	_Mat.MooeeInv(tmp,out);
 | 
						_Mat.MooeeInv(out,tmp);
 | 
				
			||||||
	_Mat.Meooe(out,tmp);
 | 
						_Mat.Meooe(tmp,out);
 | 
				
			||||||
	_Mat.MooeeInv(tmp,out);
 | 
						_Mat.MooeeInv(out,tmp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return axpy_norm(out,-1.0,tmp,in);
 | 
						return axpy_norm(out,-1.0,tmp,in);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -270,6 +271,35 @@ namespace Grid {
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<class Matrix,class Field>
 | 
				
			||||||
 | 
					      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
 | 
				
			||||||
 | 
					    protected:
 | 
				
			||||||
 | 
					      Matrix &_Mat;
 | 
				
			||||||
 | 
					    public:
 | 
				
			||||||
 | 
					      SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      virtual  RealD Mpc      (const Field &in, Field &out) {
 | 
				
			||||||
 | 
						Field tmp(in._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						_Mat.MooeeInv(in,out);
 | 
				
			||||||
 | 
						_Mat.Meooe(out,tmp);
 | 
				
			||||||
 | 
						_Mat.MooeeInv(tmp,out);
 | 
				
			||||||
 | 
						_Mat.Meooe(out,tmp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return axpy_norm(out,-1.0,tmp,in);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      virtual  RealD MpcDag   (const Field &in, Field &out){
 | 
				
			||||||
 | 
						Field tmp(in._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						_Mat.MeooeDag(in,out);
 | 
				
			||||||
 | 
						_Mat.MooeeInvDag(out,tmp);
 | 
				
			||||||
 | 
						_Mat.MeooeDag(tmp,out);
 | 
				
			||||||
 | 
						_Mat.MooeeInvDag(out,tmp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return axpy_norm(out,-1.0,tmp,in);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    /////////////////////////////////////////////////////////////
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
    // Base classes for functions of operators
 | 
					    // Base classes for functions of operators
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -58,13 +58,14 @@ namespace Grid {
 | 
				
			|||||||
      Field Mtmp(in._grid);
 | 
					      Field Mtmp(in._grid);
 | 
				
			||||||
      AtoN = in;
 | 
					      AtoN = in;
 | 
				
			||||||
      out = AtoN*Coeffs[0];
 | 
					      out = AtoN*Coeffs[0];
 | 
				
			||||||
      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
 | 
					//            std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
 | 
				
			||||||
      //      std::cout <<"0 " <<norm2(out)<<std::endl;
 | 
					//            std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
 | 
				
			||||||
      for(int n=1;n<Coeffs.size();n++){
 | 
					      for(int n=1;n<Coeffs.size();n++){
 | 
				
			||||||
	Mtmp = AtoN;
 | 
						Mtmp = AtoN;
 | 
				
			||||||
	Linop.HermOp(Mtmp,AtoN);
 | 
						Linop.HermOp(Mtmp,AtoN);
 | 
				
			||||||
	out=out+AtoN*Coeffs[n];
 | 
						out=out+AtoN*Coeffs[n];
 | 
				
			||||||
	//	std::cout << n<<" " <<norm2(out)<<std::endl;
 | 
					//            std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
 | 
				
			||||||
 | 
					//		std::cout << n<<" " <<norm2(out)<<std::endl;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
@@ -82,7 +83,8 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  public:
 | 
					  public:
 | 
				
			||||||
    void csv(std::ostream &out){
 | 
					    void csv(std::ostream &out){
 | 
				
			||||||
      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
 | 
						RealD diff = hi-lo;
 | 
				
			||||||
 | 
					      for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
 | 
				
			||||||
	RealD f = approx(x);
 | 
						RealD f = approx(x);
 | 
				
			||||||
	out<< x<<" "<<f<<std::endl;
 | 
						out<< x<<" "<<f<<std::endl;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -99,10 +101,24 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    Chebyshev(){};
 | 
					    Chebyshev(){};
 | 
				
			||||||
    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
 | 
					    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
 | 
				
			||||||
 | 
					    // Function-free constructor: forwards straight to the three-argument Init overload.
					    Chebyshev(RealD _lo,RealD _hi,int _order)
					    {
					      Init(_lo,_hi,_order);
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
 | 
					    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// CJ: the one we need for Lanczos
 | 
				
			||||||
 | 
					    // Initialise without sampling a target function: records the interval
					    // [_lo,_hi] and the order, then builds a coefficient vector of length
					    // `order` that is zero everywhere except for a 1 in the last slot.
					    //
					    //   _lo, _hi : interval bounds stored in lo / hi
					    //   _order   : number of coefficients; values below 2 terminate the
					    //              program via exit(-1)
					    void Init(RealD _lo,RealD _hi,int _order)
					    {
					      lo=_lo;
					      hi=_hi;
					      order=_order;

					      if(order < 2) exit(-1);

					      // A std::vector-style assign takes (count, value). The previous call,
					      // assign(0.,order), passed them swapped: it asked for zero copies of
					      // the value `order`, emptied the container, and left the write to
					      // Coeffs[order-1] below outside the container's size.
					      Coeffs.assign(order,0.);
					      Coeffs[order-1] = 1.;
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
 | 
					    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      lo=_lo;
 | 
					      lo=_lo;
 | 
				
			||||||
@@ -182,6 +198,8 @@ namespace Grid {
 | 
				
			|||||||
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
					    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      GridBase *grid=in._grid;
 | 
					      GridBase *grid=in._grid;
 | 
				
			||||||
 | 
					//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
 | 
				
			||||||
 | 
					//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int vol=grid->gSites();
 | 
					      int vol=grid->gSites();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,9 +16,13 @@
 | 
				
			|||||||
#define INCLUDED_ALG_REMEZ_H
 | 
					#define INCLUDED_ALG_REMEZ_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <stddef.h>
 | 
					#include <stddef.h>
 | 
				
			||||||
 | 
					#include <Config.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#include <algorithms/approx/bigfloat.h>
 | 
					#ifdef HAVE_GMP_H
 | 
				
			||||||
 | 
					#include <algorithms/approx/bigfloat.h>
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
#include <algorithms/approx/bigfloat_double.h>
 | 
					#include <algorithms/approx/bigfloat_double.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
 | 
					#define JMAX 10000 //Maximum number of iterations of Newton's approximation
 | 
				
			||||||
#define SUM_MAX 10 // Maximum number of terms in exponential
 | 
					#define SUM_MAX 10 // Maximum number of terms in exponential
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -84,7 +84,7 @@ public:
 | 
				
			|||||||
	return;
 | 
						return;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
 | 
					      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      GridStopWatch LinalgTimer;
 | 
					      GridStopWatch LinalgTimer;
 | 
				
			||||||
      GridStopWatch MatrixTimer;
 | 
					      GridStopWatch MatrixTimer;
 | 
				
			||||||
@@ -101,8 +101,8 @@ public:
 | 
				
			|||||||
	MatrixTimer.Stop();
 | 
						MatrixTimer.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	LinalgTimer.Start();
 | 
						LinalgTimer.Start();
 | 
				
			||||||
	RealD    qqck = norm2(mmp);
 | 
						//	RealD    qqck = norm2(mmp);
 | 
				
			||||||
	ComplexD dck  = innerProduct(p,mmp);
 | 
						//	ComplexD dck  = innerProduct(p,mmp);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
	a      = c/d;
 | 
						a      = c/d;
 | 
				
			||||||
	b_pred = a*(a*qq-d)/c;
 | 
						b_pred = a*(a*qq-d)/c;
 | 
				
			||||||
@@ -115,7 +115,7 @@ public:
 | 
				
			|||||||
	p  = p*b+r;
 | 
						p  = p*b+r;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	LinalgTimer.Stop();
 | 
						LinalgTimer.Stop();
 | 
				
			||||||
	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
						std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	// Stopping condition
 | 
						// Stopping condition
 | 
				
			||||||
	if ( cp <= rsq ) { 
 | 
						if ( cp <= rsq ) { 
 | 
				
			||||||
@@ -132,9 +132,9 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 | 
						  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 | 
				
			||||||
		   <<" computed residual "<<sqrt(cp/ssq)
 | 
							   <<" computed residual "<<sqrt(cp/ssq)
 | 
				
			||||||
		   <<" true residual     "<<true_residual
 | 
							   <<" true residual "    <<true_residual
 | 
				
			||||||
		   <<" target "<<Tolerance;
 | 
							   <<" target "<<Tolerance<<std::endl;
 | 
				
			||||||
	  std::cout<<" Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 | 
						  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 | 
				
			||||||
	  std::cout<<std::endl;
 | 
						  std::cout<<std::endl;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  assert(true_residual/Tolerance < 1000.0);
 | 
						  assert(true_residual/Tolerance < 1000.0);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -274,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  // ugly hack
 | 
					  // ugly hack
 | 
				
			||||||
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
 | 
					  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
 | 
				
			||||||
  assert(0);
 | 
					//  assert(0);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -38,32 +38,34 @@ template<class Field>
 | 
				
			|||||||
class SortEigen {
 | 
					class SortEigen {
 | 
				
			||||||
 private:
 | 
					 private:
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					//hacking for testing for now
 | 
				
			||||||
 | 
					 private:
 | 
				
			||||||
  static bool less_lmd(RealD left,RealD right){
 | 
					  static bool less_lmd(RealD left,RealD right){
 | 
				
			||||||
    return fabs(left) < fabs(right);
 | 
					    return left > right;
 | 
				
			||||||
  }  
 | 
					  }  
 | 
				
			||||||
  static bool less_pair(std::pair<RealD,Field>& left,
 | 
					  static bool less_pair(std::pair<RealD,Field const*>& left,
 | 
				
			||||||
		 std::pair<RealD,Field>& right){
 | 
					                        std::pair<RealD,Field const*>& right){
 | 
				
			||||||
    return fabs(left.first) < fabs(right.first);
 | 
					    return left.first > (right.first);
 | 
				
			||||||
  }  
 | 
					  }  
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 public:
 | 
					 public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  void push(DenseVector<RealD>& lmd,
 | 
					  void push(DenseVector<RealD>& lmd,
 | 
				
			||||||
	    DenseVector<Field>& evec,int N) {
 | 
					            DenseVector<Field>& evec,int N) {
 | 
				
			||||||
 | 
					    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
 | 
				
			||||||
 | 
					    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    DenseVector<std::pair<RealD, Field> > emod;
 | 
					    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
 | 
				
			||||||
    typename DenseVector<std::pair<RealD, Field> >::iterator it;
 | 
					    for(int i=0;i<lmd.size();++i)
 | 
				
			||||||
    
 | 
					      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
 | 
				
			||||||
    for(int i=0;i<lmd.size();++i){
 | 
					 | 
				
			||||||
      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
 | 
					    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    it=emod.begin();
 | 
					    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
 | 
				
			||||||
    for(int i=0;i<N;++i){
 | 
					    for(int i=0;i<N;++i){
 | 
				
			||||||
      lmd[i]=it->first;
 | 
					      lmd[i]=it->first;
 | 
				
			||||||
      evec[i]=it->second;
 | 
					      evec[i]=*(it->second);
 | 
				
			||||||
      ++it;
 | 
					      ++it;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -29,6 +29,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#ifndef GRID_IRL_H
 | 
					#ifndef GRID_IRL_H
 | 
				
			||||||
#define GRID_IRL_H
 | 
					#define GRID_IRL_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <string.h> //memset
 | 
				
			||||||
 | 
					#ifdef USE_LAPACK
 | 
				
			||||||
 | 
					#include <lapacke.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
#include <algorithms/iterative/DenseMatrix.h>
 | 
					#include <algorithms/iterative/DenseMatrix.h>
 | 
				
			||||||
#include <algorithms/iterative/EigenSort.h>
 | 
					#include <algorithms/iterative/EigenSort.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -49,6 +53,7 @@ public:
 | 
				
			|||||||
    int Niter;
 | 
					    int Niter;
 | 
				
			||||||
    int converged;
 | 
					    int converged;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int Nstop;   // Number of evecs checked for convergence
 | 
				
			||||||
    int Nk;      // Number of converged sought
 | 
					    int Nk;      // Number of converged sought
 | 
				
			||||||
    int Np;      // Np -- Number of spare vecs in kryloc space
 | 
					    int Np;      // Np -- Number of spare vecs in kryloc space
 | 
				
			||||||
    int Nm;      // Nm -- total number of vectors
 | 
					    int Nm;      // Nm -- total number of vectors
 | 
				
			||||||
@@ -57,6 +62,8 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    SortEigen<Field> _sort;
 | 
					    SortEigen<Field> _sort;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//    GridCartesian &_fgrid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    LinearOperatorBase<Field> &_Linop;
 | 
					    LinearOperatorBase<Field> &_Linop;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    OperatorFunction<Field>   &_poly;
 | 
					    OperatorFunction<Field>   &_poly;
 | 
				
			||||||
@@ -67,7 +74,27 @@ public:
 | 
				
			|||||||
    void init(void){};
 | 
					    void init(void){};
 | 
				
			||||||
    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
 | 
					    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 | 
					    // Constructor with an explicit Nstop. Stores the operator and polynomial by
					    // reference, copies the sizes and tolerances, and derives Np = Nm - Nk,
					    // which must be strictly positive.
					    ImplicitlyRestartedLanczos(
					        LinearOperatorBase<Field> &Linop,  // operator
					        OperatorFunction<Field>   &poly,   // polynomial
					        int   _Nstop,                      // number of evecs checked for convergence
					        int   _Nk,                         // number of converged vectors sought
					        int   _Nm,                         // total number of vectors
					        RealD _eresid,                     // residual tolerance, stored in eresid
					        int   _Niter)                      // maximum iterations
					      : _Linop(Linop),
					        _poly(poly),
					        Nstop(_Nstop),
					        Nk(_Nk),
					        Nm(_Nm),
					        eresid(_eresid),
					        Niter(_Niter)
					    {
					      // Spare vectors are whatever is left of Nm once the Nk sought ones are taken.
					      Np = Nm-Nk;
					      assert(Np>0);
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ImplicitlyRestartedLanczos(
 | 
				
			||||||
 | 
									LinearOperatorBase<Field> &Linop, // op
 | 
				
			||||||
			       OperatorFunction<Field> & poly,   // polynmial
 | 
								       OperatorFunction<Field> & poly,   // polynmial
 | 
				
			||||||
			       int _Nk, // sought vecs
 | 
								       int _Nk, // sought vecs
 | 
				
			||||||
			       int _Nm, // spare vecs
 | 
								       int _Nm, // spare vecs
 | 
				
			||||||
@@ -75,6 +102,7 @@ public:
 | 
				
			|||||||
			       int _Niter) : // Max iterations
 | 
								       int _Niter) : // Max iterations
 | 
				
			||||||
      _Linop(Linop),
 | 
					      _Linop(Linop),
 | 
				
			||||||
      _poly(poly),
 | 
					      _poly(poly),
 | 
				
			||||||
 | 
					      Nstop(_Nk),
 | 
				
			||||||
      Nk(_Nk),
 | 
					      Nk(_Nk),
 | 
				
			||||||
      Nm(_Nm),
 | 
					      Nm(_Nm),
 | 
				
			||||||
      eresid(_eresid),
 | 
					      eresid(_eresid),
 | 
				
			||||||
@@ -142,10 +170,11 @@ public:
 | 
				
			|||||||
      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 | 
					      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 | 
				
			||||||
                                 // 7. vk+1 := wk/βk+1
 | 
					                                 // 7. vk+1 := wk/βk+1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
 | 
				
			||||||
      const RealD tiny = 1.0e-20;
 | 
					      const RealD tiny = 1.0e-20;
 | 
				
			||||||
      if ( beta < tiny ) { 
 | 
					      if ( beta < tiny ) { 
 | 
				
			||||||
	std::cout << " beta is tiny "<<beta<<std::endl;
 | 
						std::cout << " beta is tiny "<<beta<<std::endl;
 | 
				
			||||||
      }
 | 
					     }
 | 
				
			||||||
      lmd[k] = alph;
 | 
					      lmd[k] = alph;
 | 
				
			||||||
      lme[k]  = beta;
 | 
					      lme[k]  = beta;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -219,15 +248,122 @@ public:
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef USE_LAPACK
 | 
				
			||||||
 | 
					    // Diagonalise the NN x NN symmetric tridiagonal matrix (NN = N1) whose
					    // diagonal is lmd[0..NN-1] and whose off-diagonal is lme[0..NN-2], using
					    // LAPACK dstegr.
					    //
					    // The index range 1..NN is split into contiguous slices of `interval`
					    // eigenpairs, one slice per processor of `grid`. Each processor computes
					    // only its own slice (range='I' with il..iu), moves the result to its
					    // global position in otherwise-zero buffers, and GlobalSumVector then
					    // combines the slices so every processor holds the full result.
					    //
					    //   lmd  : in  - diagonal; out - eigenvalues, written in reversed order
					    //   lme  : in  - off-diagonal (only the first NN-1 entries are read)
					    //   N1   : dimension NN of the tridiagonal problem
					    //   N2   : row stride used when writing eigenvectors into Qt
					    //   Qt   : out - eigenvector i is stored at Qt[(NN-1-i)*N2 + j], j = 0..NN-1
					    //   grid : supplies _Nprocessors, _processor and GlobalSumVector
					    //
					    // Scratch storage lives in DenseVector (used here through its sized
					    // constructor and operator[], as elsewhere in this file) rather than in
					    // variable-length stack arrays: those are not standard C++, and the two
					    // O(NN*NN) buffers can exhaust the stack for large NN.
					    void diagonalize_lapack(DenseVector<RealD>& lmd,
					                            DenseVector<RealD>& lme,
					                            int N1,
					                            int N2,
					                            DenseVector<RealD>& Qt,
					                            GridBase *grid){
					      int NN = N1;              // not const: LAPACK takes the dimension by pointer
					      if ( NN <= 0 ) return;    // nothing to diagonalise

					      DenseVector<double> evals_tmp(NN);
					      DenseVector<double> evec_tmp(NN*NN);   // eigenvector i occupies [i*NN, (i+1)*NN)
					      DenseVector<double> DD(NN);            // diagonal handed to LAPACK (overwritten by it)
					      DenseVector<double> EE(NN);            // off-diagonal handed to LAPACK (overwritten by it)

					      // Slices owned by other processors must be exactly zero for the global
					      // sum below to act as a gather.
					      memset(&evals_tmp[0],0,sizeof(double)*NN);
					      memset(&evec_tmp[0],0,sizeof(double)*NN*NN);

					      for (int i = 0; i < NN; i++) {
					        DD[i] = lmd[i];
					        EE[i] = (i+1 < NN) ? lme[i] : 0.0;   // last entry is not part of the matrix
					      }

					      int evals_found;
					      // Workspace sizes, kept as in the original: at least 18*NN doubles and
					      // 10*NN ints, which is what dstegr documents for jobz='V'.
					      int lwork  = ( (18*NN) > (1+4*NN+NN*NN) ? (18*NN) : (1+4*NN+NN*NN) );
					      int liwork = 3+NN*10;
					      DenseVector<int>    iwork(liwork);
					      DenseVector<double> work(lwork);
					      DenseVector<int>    isuppz(2*NN);
					      char jobz  = 'V'; // compute eigenvalues and eigenvectors
					      char range = 'I'; // compute only the il-th through iu-th eigenvalues
					      int info = 0;

					      int total = grid->_Nprocessors;
					      int node  = grid->_processor;
					      int interval = (NN/total)+1;
					      double vl = 0.0, vu = 0.0;   // unused by LAPACK when range='I'
					      int il = interval*node+1 , iu = interval*(node+1);   // 1-based, inclusive
					      if (iu > NN)  iu=NN;
					      double tol = 0.0;

					      if ( il <= NN ){
					        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
					        LAPACK_dstegr(&jobz, &range, &NN,
					            &DD[0], &EE[0],
					            &vl, &vu, &il, &iu,
					            &tol, // tolerance
					            &evals_found, &evals_tmp[0], &evec_tmp[0], &NN,
					            &isuppz[0],
					            &work[0], &lwork, &iwork[0], &liwork,
					            &info);

					        // The status was previously ignored; a failed solve would have been
					        // summed into the result as if it were valid.
					        if ( info != 0 ) printf("diagonalize_lapack: dstegr failed on node %d, info=%d\n",node,info);
					        assert(info == 0);

					        // LAPACK returns this slice packed at the front of the buffers.
					        // Move it to its global position il-1..iu-1, walking downwards so
					        // that no source entry is overwritten before it has been read,
					        // and zero the vacated source whenever the slice actually moved.
					        for (int i = iu-1; i>= il-1; i--){
					          const int from = i - (il-1);
					          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, from,evals_tmp[from]);
					          evals_tmp[i] = evals_tmp[from];
					          if (il>1) evals_tmp[from]=0.;
					          for (int j = 0; j< NN; j++){
					            evec_tmp[i*NN+j] = evec_tmp[from*NN+j];
					            if (il>1) evec_tmp[from*NN+j]=0.;
					          }
					        }
					      }

					      // Every processor takes part in the sums, including those with no slice.
					      grid->GlobalSumVector(&evals_tmp[0],NN);
					      grid->GlobalSumVector(&evec_tmp[0],NN*NN);

					      // cheating a bit. It is better to sort instead of just reversing it, but
					      // the documentation of the routine says evals are sorted in increasing
					      // order, while qr gives evals in decreasing order.
					      for(int i=0;i<NN;i++){
					        for(int j=0;j<NN;j++)
					          Qt[(NN-1-i)*N2+j]=evec_tmp[i*NN+j];
					        lmd[NN-1-i]=evals_tmp[i];
					      }
					    }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void diagonalize(DenseVector<RealD>& lmd,
 | 
					    void diagonalize(DenseVector<RealD>& lmd,
 | 
				
			||||||
		     DenseVector<RealD>& lme, 
 | 
							     DenseVector<RealD>& lme, 
 | 
				
			||||||
		     int Nm2,
 | 
							     int N2,
 | 
				
			||||||
		     int Nm,
 | 
							     int N1,
 | 
				
			||||||
		     DenseVector<RealD>& Qt)
 | 
							     DenseVector<RealD>& Qt,
 | 
				
			||||||
 | 
							     GridBase *grid)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      int Niter = 100*Nm;
 | 
					
 | 
				
			||||||
 | 
					#ifdef USE_LAPACK
 | 
				
			||||||
 | 
					    const int check_lapack=0; // just use lapack if 0, check against lapack if 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(!check_lapack)
 | 
				
			||||||
 | 
						return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						DenseVector <RealD> lmd2(N1);
 | 
				
			||||||
 | 
						DenseVector <RealD> lme2(N1);
 | 
				
			||||||
 | 
						DenseVector<RealD> Qt2(N1*N1);
 | 
				
			||||||
 | 
					         for(int k=0; k<N1; ++k){
 | 
				
			||||||
 | 
						    lmd2[k] = lmd[k];
 | 
				
			||||||
 | 
						    lme2[k] = lme[k];
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
					         for(int k=0; k<N1*N1; ++k)
 | 
				
			||||||
 | 
						Qt2[k] = Qt[k];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      int Niter = 100*N1;
 | 
				
			||||||
      int kmin = 1;
 | 
					      int kmin = 1;
 | 
				
			||||||
      int kmax = Nk;
 | 
					      int kmax = N2;
 | 
				
			||||||
      // (this should be more sophisticated)
 | 
					      // (this should be more sophisticated)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int iter=0; iter<Niter; ++iter){
 | 
					      for(int iter=0; iter<Niter; ++iter){
 | 
				
			||||||
@@ -239,7 +375,7 @@ public:
 | 
				
			|||||||
	// (Dsh: shift)
 | 
						// (Dsh: shift)
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	// transformation
 | 
						// transformation
 | 
				
			||||||
	qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
 | 
						qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	// Convergence criterion (redef of kmin and kamx)
 | 
						// Convergence criterion (redef of kmin and kamx)
 | 
				
			||||||
	for(int j=kmax-1; j>= kmin; --j){
 | 
						for(int j=kmax-1; j>= kmin; --j){
 | 
				
			||||||
@@ -250,6 +386,23 @@ public:
 | 
				
			|||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	Niter = iter;
 | 
						Niter = iter;
 | 
				
			||||||
 | 
					#ifdef USE_LAPACK
 | 
				
			||||||
 | 
					    if(check_lapack){
 | 
				
			||||||
 | 
						const double SMALL=1e-8;
 | 
				
			||||||
 | 
						diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
 | 
				
			||||||
 | 
						DenseVector <RealD> lmd3(N2);
 | 
				
			||||||
 | 
					         for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
 | 
				
			||||||
 | 
					        _sort.push(lmd3,N2);
 | 
				
			||||||
 | 
					        _sort.push(lmd2,N2);
 | 
				
			||||||
 | 
					         for(int k=0; k<N2; ++k){
 | 
				
			||||||
 | 
						    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
 | 
				
			||||||
 | 
					//	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
					         for(int k=0; k<N1*N1; ++k){
 | 
				
			||||||
 | 
					//	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
	return;
 | 
						return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      continued:
 | 
					      continued:
 | 
				
			||||||
@@ -265,6 +418,7 @@ public:
 | 
				
			|||||||
      abort();
 | 
					      abort();
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 1
 | 
				
			||||||
    static RealD normalise(Field& v) 
 | 
					    static RealD normalise(Field& v) 
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      RealD nn = norm2(v);
 | 
					      RealD nn = norm2(v);
 | 
				
			||||||
@@ -326,6 +480,7 @@ until convergence
 | 
				
			|||||||
      {
 | 
					      {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	GridBase *grid = evec[0]._grid;
 | 
						GridBase *grid = evec[0]._grid;
 | 
				
			||||||
 | 
						assert(grid == src._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 | 
						std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 | 
				
			||||||
	std::cout << " -- Nm = " << Nm << std::endl;
 | 
						std::cout << " -- Nm = " << Nm << std::endl;
 | 
				
			||||||
@@ -356,11 +511,21 @@ until convergence
 | 
				
			|||||||
	// (uniform vector) Why not src??
 | 
						// (uniform vector) Why not src??
 | 
				
			||||||
	//	evec[0] = 1.0;
 | 
						//	evec[0] = 1.0;
 | 
				
			||||||
	evec[0] = src;
 | 
						evec[0] = src;
 | 
				
			||||||
 | 
						std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
 | 
				
			||||||
 | 
					// << src._grid  << std::endl;
 | 
				
			||||||
	normalise(evec[0]);
 | 
						normalise(evec[0]);
 | 
				
			||||||
 | 
						std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
 | 
				
			||||||
 | 
					// << evec[0]._grid << std::endl;
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	// Initial Nk steps
 | 
						// Initial Nk steps
 | 
				
			||||||
	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
 | 
						for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
 | 
				
			||||||
 | 
					//	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
 | 
				
			||||||
 | 
					//	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 | 
				
			||||||
	RitzMatrix(evec,Nk);
 | 
						RitzMatrix(evec,Nk);
 | 
				
			||||||
 | 
						for(int k=0; k<Nk; ++k){
 | 
				
			||||||
 | 
					//	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
 | 
				
			||||||
 | 
					//	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Restarting loop begins
 | 
						// Restarting loop begins
 | 
				
			||||||
	for(int iter = 0; iter<Niter; ++iter){
 | 
						for(int iter = 0; iter<Niter; ++iter){
 | 
				
			||||||
@@ -382,20 +547,24 @@ until convergence
 | 
				
			|||||||
	    lme2[k] = lme[k+k1-1];
 | 
						    lme2[k] = lme[k+k1-1];
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	  setUnit_Qt(Nm,Qt);
 | 
						  setUnit_Qt(Nm,Qt);
 | 
				
			||||||
	  diagonalize(eval2,lme2,Nm,Nm,Qt);
 | 
						  diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  // sorting
 | 
						  // sorting
 | 
				
			||||||
	  _sort.push(eval2,Nm);
 | 
						  _sort.push(eval2,Nm);
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  // Implicitly shifted QR transformations
 | 
						  // Implicitly shifted QR transformations
 | 
				
			||||||
	  setUnit_Qt(Nm,Qt);
 | 
						  setUnit_Qt(Nm,Qt);
 | 
				
			||||||
	  for(int ip=k2; ip<Nm; ++ip) 
 | 
						  for(int ip=k2; ip<Nm; ++ip){ 
 | 
				
			||||||
 | 
						std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 | 
				
			||||||
	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
 | 
						    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 | 
						  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  for(int j=k1-1; j<k2+1; ++j){
 | 
						  for(int j=k1-1; j<k2+1; ++j){
 | 
				
			||||||
	    for(int k=0; k<Nm; ++k){
 | 
						    for(int k=0; k<Nm; ++k){
 | 
				
			||||||
 | 
						    B[j].checkerboard = evec[k].checkerboard;
 | 
				
			||||||
	      B[j] += Qt[k+Nm*j] * evec[k];
 | 
						      B[j] += Qt[k+Nm*j] * evec[k];
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
@@ -418,21 +587,25 @@ until convergence
 | 
				
			|||||||
	    lme2[k] = lme[k];
 | 
						    lme2[k] = lme[k];
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	  setUnit_Qt(Nm,Qt);
 | 
						  setUnit_Qt(Nm,Qt);
 | 
				
			||||||
	  diagonalize(eval2,lme2,Nk,Nm,Qt);
 | 
						  diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
 | 
						  for(int k = 0; k<Nk; ++k) B[k]=0.0;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  for(int j = 0; j<Nk; ++j){
 | 
						  for(int j = 0; j<Nk; ++j){
 | 
				
			||||||
	    for(int k = 0; k<Nk; ++k){
 | 
						    for(int k = 0; k<Nk; ++k){
 | 
				
			||||||
 | 
						    B[j].checkerboard = evec[k].checkerboard;
 | 
				
			||||||
	      B[j] += Qt[k+j*Nm] * evec[k];
 | 
						      B[j] += Qt[k+j*Nm] * evec[k];
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
 | 
					//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
 | 
					//	_sort.push(eval2,B,Nk);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  Nconv = 0;
 | 
						  Nconv = 0;
 | 
				
			||||||
	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 | 
						  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 | 
				
			||||||
	  for(int i=0; i<Nk; ++i){
 | 
						  for(int i=0; i<Nk; ++i){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    _poly(_Linop,B[i],v);
 | 
					//	    _poly(_Linop,B[i],v);
 | 
				
			||||||
 | 
						    _Linop.HermOp(B[i],v);
 | 
				
			||||||
	    
 | 
						    
 | 
				
			||||||
	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 | 
						    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 | 
				
			||||||
	    RealD vden = norm2(B[i]);
 | 
						    RealD vden = norm2(B[i]);
 | 
				
			||||||
@@ -440,11 +613,13 @@ until convergence
 | 
				
			|||||||
	    v -= eval2[i]*B[i];
 | 
						    v -= eval2[i]*B[i];
 | 
				
			||||||
	    RealD vv = norm2(v);
 | 
						    RealD vv = norm2(v);
 | 
				
			||||||
	    
 | 
						    
 | 
				
			||||||
 | 
						    std::cout.precision(13);
 | 
				
			||||||
	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
						    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
				
			||||||
	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 | 
						    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 | 
				
			||||||
	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
 | 
						    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
 | 
				
			||||||
	    
 | 
						    
 | 
				
			||||||
	    if(vv<eresid*eresid){
 | 
						// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
 | 
				
			||||||
 | 
						    if((vv<eresid*eresid) && (i == Nconv) ){
 | 
				
			||||||
	      Iconv[Nconv] = i;
 | 
						      Iconv[Nconv] = i;
 | 
				
			||||||
	      ++Nconv;
 | 
						      ++Nconv;
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
@@ -455,7 +630,7 @@ until convergence
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
 | 
						  std::cout<<" #modes converged: "<<Nconv<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if( Nconv>=Nk ){
 | 
						  if( Nconv>=Nstop ){
 | 
				
			||||||
	    goto converged;
 | 
						    goto converged;
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	} // end of iter loop
 | 
						} // end of iter loop
 | 
				
			||||||
@@ -464,21 +639,20 @@ until convergence
 | 
				
			|||||||
	abort();
 | 
						abort();
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
      converged:
 | 
					      converged:
 | 
				
			||||||
	// Sorting
 | 
					       // Sorting
 | 
				
			||||||
 | 
					       eval.resize(Nconv);
 | 
				
			||||||
 | 
					       evec.resize(Nconv,grid);
 | 
				
			||||||
 | 
					       for(int i=0; i<Nconv; ++i){
 | 
				
			||||||
 | 
					         eval[i] = eval2[Iconv[i]];
 | 
				
			||||||
 | 
					         evec[i] = B[Iconv[i]];
 | 
				
			||||||
 | 
					       }
 | 
				
			||||||
 | 
					      _sort.push(eval,evec,Nconv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	eval.clear();
 | 
					      std::cout << "\n Converged\n Summary :\n";
 | 
				
			||||||
	evec.clear();
 | 
					      std::cout << " -- Iterations  = "<< Nconv  << "\n";
 | 
				
			||||||
	for(int i=0; i<Nconv; ++i){
 | 
					      std::cout << " -- beta(k)     = "<< beta_k << "\n";
 | 
				
			||||||
	  eval.push_back(eval2[Iconv[i]]);
 | 
					      std::cout << " -- Nconv       = "<< Nconv  << "\n";
 | 
				
			||||||
	  evec.push_back(B[Iconv[i]]);
 | 
					     }
 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	_sort.push(eval,evec,Nconv);
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	std::cout << "\n Converged\n Summary :\n";
 | 
					 | 
				
			||||||
	std::cout << " -- Iterations  = "<< Nconv  << "\n";
 | 
					 | 
				
			||||||
	std::cout << " -- beta(k)     = "<< beta_k << "\n";
 | 
					 | 
				
			||||||
	std::cout << " -- Nconv       = "<< Nconv  << "\n";
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    /////////////////////////////////////////////////
 | 
					    /////////////////////////////////////////////////
 | 
				
			||||||
    // Adapted from Rudy's lanczos factor routine
 | 
					    // Adapted from Rudy's lanczos factor routine
 | 
				
			||||||
@@ -1025,6 +1199,7 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 };
 | 
					 };
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -47,6 +47,10 @@ namespace Grid {
 | 
				
			|||||||
    int mmax;
 | 
					    int mmax;
 | 
				
			||||||
    int nstep;
 | 
					    int nstep;
 | 
				
			||||||
    int steps;
 | 
					    int steps;
 | 
				
			||||||
 | 
					    GridStopWatch PrecTimer;
 | 
				
			||||||
 | 
					    GridStopWatch MatTimer;
 | 
				
			||||||
 | 
					    GridStopWatch LinalgTimer;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    LinearFunction<Field> &Preconditioner;
 | 
					    LinearFunction<Field> &Preconditioner;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
 | 
					   PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
 | 
				
			||||||
@@ -68,14 +72,24 @@ namespace Grid {
 | 
				
			|||||||
      
 | 
					      
 | 
				
			||||||
      Field r(src._grid);
 | 
					      Field r(src._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        PrecTimer.Reset();
 | 
				
			||||||
 | 
					         MatTimer.Reset();
 | 
				
			||||||
 | 
					      LinalgTimer.Reset();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      GridStopWatch SolverTimer;
 | 
				
			||||||
 | 
					      SolverTimer.Start();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      steps=0;
 | 
					      steps=0;
 | 
				
			||||||
      for(int k=0;k<MaxIterations;k++){
 | 
					      for(int k=0;k<MaxIterations;k++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	cp=GCRnStep(Linop,src,psi,rsq);
 | 
						cp=GCRnStep(Linop,src,psi,rsq);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 | 
						std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if(cp<rsq) {
 | 
						if(cp<rsq) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  SolverTimer.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  Linop.HermOp(psi,r);
 | 
						  Linop.HermOp(psi,r);
 | 
				
			||||||
	  axpy(r,-1.0,src,r);
 | 
						  axpy(r,-1.0,src,r);
 | 
				
			||||||
	  RealD tr = norm2(r);
 | 
						  RealD tr = norm2(r);
 | 
				
			||||||
@@ -83,6 +97,11 @@ namespace Grid {
 | 
				
			|||||||
		   << " computed residual "<<sqrt(cp/ssq)
 | 
							   << " computed residual "<<sqrt(cp/ssq)
 | 
				
			||||||
	           << " true residual "    <<sqrt(tr/ssq)
 | 
						           << " true residual "    <<sqrt(tr/ssq)
 | 
				
			||||||
	           << " target "           <<Tolerance <<std::endl;
 | 
						           << " target "           <<Tolerance <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
 | 
				
			||||||
 | 
						  std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 | 
				
			||||||
	  return;
 | 
						  return;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -90,6 +109,7 @@ namespace Grid {
 | 
				
			|||||||
      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
 | 
					      std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
 | 
				
			||||||
      assert(0);
 | 
					      assert(0);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
 | 
					    RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      RealD cp;
 | 
					      RealD cp;
 | 
				
			||||||
@@ -116,24 +136,25 @@ namespace Grid {
 | 
				
			|||||||
      // initial guess x0 is taken as nonzero.
 | 
					      // initial guess x0 is taken as nonzero.
 | 
				
			||||||
      // r0=src-A x0 = src
 | 
					      // r0=src-A x0 = src
 | 
				
			||||||
      //////////////////////////////////
 | 
					      //////////////////////////////////
 | 
				
			||||||
 | 
					      MatTimer.Start();
 | 
				
			||||||
      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
 | 
					      Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
 | 
				
			||||||
 | 
					      MatTimer.Stop();
 | 
				
			||||||
      r=src-Az;
 | 
					      r=src-Az;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      /////////////////////
 | 
					      /////////////////////
 | 
				
			||||||
      // p = Prec(r)
 | 
					      // p = Prec(r)
 | 
				
			||||||
      /////////////////////
 | 
					      /////////////////////
 | 
				
			||||||
 | 
					      PrecTimer.Start();
 | 
				
			||||||
      Preconditioner(r,z);
 | 
					      Preconditioner(r,z);
 | 
				
			||||||
 | 
					      PrecTimer.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl; 
 | 
					      MatTimer.Start();
 | 
				
			||||||
      std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl; 
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      Linop.HermOp(z,tmp); 
 | 
					      Linop.HermOp(z,tmp); 
 | 
				
			||||||
 | 
					      MatTimer.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl; 
 | 
					 | 
				
			||||||
      ttmp=tmp;
 | 
					      ttmp=tmp;
 | 
				
			||||||
      tmp=tmp-r;
 | 
					      tmp=tmp-r;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 | 
					 | 
				
			||||||
      /*
 | 
					      /*
 | 
				
			||||||
      std::cout<<GridLogMessage<<r<<std::endl;
 | 
					      std::cout<<GridLogMessage<<r<<std::endl;
 | 
				
			||||||
      std::cout<<GridLogMessage<<z<<std::endl;
 | 
					      std::cout<<GridLogMessage<<z<<std::endl;
 | 
				
			||||||
@@ -141,7 +162,9 @@ namespace Grid {
 | 
				
			|||||||
      std::cout<<GridLogMessage<<tmp<<std::endl;
 | 
					      std::cout<<GridLogMessage<<tmp<<std::endl;
 | 
				
			||||||
      */
 | 
					      */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      MatTimer.Start();
 | 
				
			||||||
      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
 | 
					      Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
 | 
				
			||||||
 | 
					      MatTimer.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      //p[0],q[0],qq[0] 
 | 
					      //p[0],q[0],qq[0] 
 | 
				
			||||||
      p[0]= z;
 | 
					      p[0]= z;
 | 
				
			||||||
@@ -165,18 +188,22 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	cp = axpy_norm(r,-a,q[peri_k],r);  
 | 
						cp = axpy_norm(r,-a,q[peri_k],r);  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl; 
 | 
					 | 
				
			||||||
	if((k==nstep-1)||(cp<rsq)){
 | 
						if((k==nstep-1)||(cp<rsq)){
 | 
				
			||||||
	  return cp;
 | 
						  return cp;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						PrecTimer.Start();
 | 
				
			||||||
	Preconditioner(r,z);// solve Az = r
 | 
						Preconditioner(r,z);// solve Az = r
 | 
				
			||||||
 | 
						PrecTimer.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						MatTimer.Start();
 | 
				
			||||||
	Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 | 
						Linop.HermOpAndNorm(z,Az,zAz,zAAz);
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	Linop.HermOp(z,tmp);
 | 
						Linop.HermOp(z,tmp);
 | 
				
			||||||
 | 
						MatTimer.Stop();
 | 
				
			||||||
        tmp=tmp-r;
 | 
					        tmp=tmp-r;
 | 
				
			||||||
	std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 | 
						std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	q[peri_kp]=Az;
 | 
						q[peri_kp]=Az;
 | 
				
			||||||
	p[peri_kp]=z;
 | 
						p[peri_kp]=z;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -102,6 +102,8 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      pickCheckerboard(Even,src_e,in);
 | 
					      pickCheckerboard(Even,src_e,in);
 | 
				
			||||||
      pickCheckerboard(Odd ,src_o,in);
 | 
					      pickCheckerboard(Odd ,src_o,in);
 | 
				
			||||||
 | 
					      pickCheckerboard(Even,sol_e,out);
 | 
				
			||||||
 | 
					      pickCheckerboard(Odd ,sol_o,out);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
      /////////////////////////////////////////////////////
 | 
					      /////////////////////////////////////////////////////
 | 
				
			||||||
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
 | 
					      // src_o = Mdag * (source_o - Moe MeeInv source_e)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -115,27 +115,11 @@ public:
 | 
				
			|||||||
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
 | 
					      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
 | 
				
			||||||
      return idx;
 | 
					      return idx;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
 | 
					 | 
				
			||||||
      int nd= dims.size();
 | 
					 | 
				
			||||||
      coor.resize(nd);
 | 
					 | 
				
			||||||
      for(int d=0;d<nd;d++){
 | 
					 | 
				
			||||||
	coor[d] = index % dims[d];
 | 
					 | 
				
			||||||
	index   = index / dims[d];
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
 | 
					    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
 | 
				
			||||||
      CoorFromIndex(coor,Oindex,_rdimensions);
 | 
					      Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
 | 
					 | 
				
			||||||
      int nd=dims.size();
 | 
					 | 
				
			||||||
      int stride=1;
 | 
					 | 
				
			||||||
      index=0;
 | 
					 | 
				
			||||||
      for(int d=0;d<nd;d++){
 | 
					 | 
				
			||||||
	index = index+stride*coor[d];
 | 
					 | 
				
			||||||
	stride=stride*dims[d];
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    //////////////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////////////
 | 
				
			||||||
    // SIMD lane addressing
 | 
					    // SIMD lane addressing
 | 
				
			||||||
    //////////////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -147,13 +131,32 @@ public:
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
 | 
					    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      CoorFromIndex(coor,lane,_simd_layout);
 | 
					      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    inline int PermuteDim(int dimension){
 | 
					    inline int PermuteDim(int dimension){
 | 
				
			||||||
      return _simd_layout[dimension]>1;
 | 
					      return _simd_layout[dimension]>1;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    inline int PermuteType(int dimension){
 | 
					    inline int PermuteType(int dimension){
 | 
				
			||||||
      int permute_type=0;
 | 
					      int permute_type=0;
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
					      // FIXME:
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
					      // Best way to encode this would be to present a mask 
 | 
				
			||||||
 | 
					      // for which simd dimensions are rotated, and the rotation
 | 
				
			||||||
 | 
					      // size. If there is only one simd dimension rotated, this is just 
 | 
				
			||||||
 | 
					      // a permute. 
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
					      // Cases: PermuteType == 1,2,4,8
 | 
				
			||||||
 | 
					      // Distance should be either 0,1,2..
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
					      if ( _simd_layout[dimension] > 2 ) { 
 | 
				
			||||||
 | 
						for(int d=0;d<_ndimension;d++){
 | 
				
			||||||
 | 
						  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						permute_type = RotateBit; // How to specify distance; this is not just direction.
 | 
				
			||||||
 | 
						return permute_type;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int d=_ndimension-1;d>dimension;d--){
 | 
					      for(int d=_ndimension-1;d>dimension;d--){
 | 
				
			||||||
	if (_simd_layout[d]>1 ) permute_type++;
 | 
						if (_simd_layout[d]>1 ) permute_type++;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -163,12 +166,12 @@ public:
 | 
				
			|||||||
    // Array sizing queries
 | 
					    // Array sizing queries
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    inline int iSites(void) { return _isites; };
 | 
					    inline int iSites(void) const { return _isites; };
 | 
				
			||||||
    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
 | 
					    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
 | 
				
			||||||
    inline int oSites(void) { return _osites; };
 | 
					    inline int oSites(void) const { return _osites; };
 | 
				
			||||||
    inline int lSites(void) { return _isites*_osites; }; 
 | 
					    inline int lSites(void) const { return _isites*_osites; }; 
 | 
				
			||||||
    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
 | 
					    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
 | 
				
			||||||
    inline int Nd    (void) { return _ndimension;};
 | 
					    inline int Nd    (void) const { return _ndimension;};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
 | 
					    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
 | 
				
			||||||
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
 | 
					    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
 | 
				
			||||||
@@ -179,7 +182,10 @@ public:
 | 
				
			|||||||
    // Global addressing
 | 
					    // Global addressing
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////
 | 
				
			||||||
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
 | 
					    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
 | 
				
			||||||
      CoorFromIndex(gcoor,gidx,_gdimensions);
 | 
					      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
 | 
				
			||||||
 | 
					      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
 | 
					    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
 | 
				
			||||||
      gidx=0;
 | 
					      gidx=0;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -170,9 +170,15 @@ public:
 | 
				
			|||||||
	// Use a reduced simd grid
 | 
						// Use a reduced simd grid
 | 
				
			||||||
	_simd_layout[d] = simd_layout[d];
 | 
						_simd_layout[d] = simd_layout[d];
 | 
				
			||||||
	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
 | 
						_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
 | 
				
			||||||
 | 
						assert(_rdimensions[d]>0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// all elements of a simd vector must have same checkerboard.
 | 
						// all elements of a simd vector must have same checkerboard.
 | 
				
			||||||
	if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0); 
 | 
						// If Ls vectorised, this must still be the case; e.g. dwf rb5d
 | 
				
			||||||
 | 
						if ( _simd_layout[d]>1 ) {
 | 
				
			||||||
 | 
						  if ( d != _checker_dim ) { 
 | 
				
			||||||
 | 
						    assert( (_rdimensions[d]&0x1) == 0 );
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	_osites *= _rdimensions[d];
 | 
						_osites *= _rdimensions[d];
 | 
				
			||||||
	_isites *= _simd_layout[d];
 | 
						_isites *= _simd_layout[d];
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#ifdef GRID_COMMS_MPI
 | 
					#ifdef GRID_COMMS_MPI
 | 
				
			||||||
#include <mpi.h>
 | 
					#include <mpi.h>
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef GRID_COMMS_SHMEM
 | 
				
			||||||
 | 
					#include <mpp/shmem.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
class CartesianCommunicator {
 | 
					class CartesianCommunicator {
 | 
				
			||||||
  public:    
 | 
					  public:    
 | 
				
			||||||
@@ -53,6 +56,8 @@ class CartesianCommunicator {
 | 
				
			|||||||
    typedef int CommsRequest_t;
 | 
					    typedef int CommsRequest_t;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static void Init(int *argc, char ***argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Constructor
 | 
					    // Constructor
 | 
				
			||||||
    CartesianCommunicator(const std::vector<int> &pdimensions_in);
 | 
					    CartesianCommunicator(const std::vector<int> &pdimensions_in);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -81,6 +86,7 @@ class CartesianCommunicator {
 | 
				
			|||||||
    void GlobalSumVector(RealD *,int N);
 | 
					    void GlobalSumVector(RealD *,int N);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void GlobalSum(uint32_t &);
 | 
					    void GlobalSum(uint32_t &);
 | 
				
			||||||
 | 
					    void GlobalSum(uint64_t &);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void GlobalSum(ComplexF &c)
 | 
					    void GlobalSum(ComplexF &c)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
@@ -115,12 +121,11 @@ class CartesianCommunicator {
 | 
				
			|||||||
			int recv_from_rank,
 | 
								int recv_from_rank,
 | 
				
			||||||
			int bytes);
 | 
								int bytes);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void RecvFrom(void *recv,
 | 
					    void SendRecvPacket(void *xmit,
 | 
				
			||||||
		  int recv_from_rank,
 | 
								void *recv,
 | 
				
			||||||
		  int bytes);
 | 
								int xmit_to_rank,
 | 
				
			||||||
    void SendTo(void *xmit,
 | 
								int recv_from_rank,
 | 
				
			||||||
		int xmit_to_rank,
 | 
								int bytes);
 | 
				
			||||||
		int bytes);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
					    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
				
			||||||
			 void *xmit,
 | 
								 void *xmit,
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -31,6 +31,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Should error check all MPI calls.
 | 
					  // Should error check all MPI calls.
 | 
				
			||||||
 | 
					void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
				
			||||||
 | 
					  int flag;
 | 
				
			||||||
 | 
					  MPI_Initialized(&flag); // needed to coexist with other libs apparently
 | 
				
			||||||
 | 
					  if ( !flag ) {
 | 
				
			||||||
 | 
					    MPI_Init(argc,argv);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int Rank(void) {
 | 
				
			||||||
 | 
					    int pe;
 | 
				
			||||||
 | 
					    MPI_Comm_rank(MPI_COMM_WORLD,&pe);
 | 
				
			||||||
 | 
					    return pe;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
					CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -59,6 +72,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
				
			|||||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
					  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
				
			||||||
  assert(ierr==0);
 | 
					  assert(ierr==0);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
				
			||||||
 | 
					  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 | 
				
			||||||
 | 
					  assert(ierr==0);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
					void CartesianCommunicator::GlobalSum(float &f){
 | 
				
			||||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
					  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
				
			||||||
  assert(ierr==0);
 | 
					  assert(ierr==0);
 | 
				
			||||||
@@ -108,21 +125,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
				
			|||||||
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
					  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
				
			||||||
  SendToRecvFromComplete(reqs);
 | 
					  SendToRecvFromComplete(reqs);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::RecvFrom(void *recv,
 | 
					
 | 
				
			||||||
				     int from,
 | 
					void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
				
			||||||
				     int bytes) 
 | 
										   void *recv,
 | 
				
			||||||
 | 
										   int sender,
 | 
				
			||||||
 | 
										   int receiver,
 | 
				
			||||||
 | 
										   int bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  MPI_Status stat;
 | 
					  MPI_Status stat;
 | 
				
			||||||
  int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
 | 
					  assert(sender != receiver);
 | 
				
			||||||
  assert(ierr==0);
 | 
					  int tag = sender;
 | 
				
			||||||
}
 | 
					  if ( _processor == sender ) {
 | 
				
			||||||
void CartesianCommunicator::SendTo(void *xmit,
 | 
					    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
 | 
				
			||||||
				   int dest,
 | 
					  }
 | 
				
			||||||
				   int bytes)
 | 
					  if ( _processor == receiver ) { 
 | 
				
			||||||
{
 | 
					    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
 | 
				
			||||||
  int rank = _processor; // used for tag; must know who it comes from
 | 
					  }
 | 
				
			||||||
  int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
 | 
					 | 
				
			||||||
  assert(ierr==0);
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Basic Halo comms primitive
 | 
					// Basic Halo comms primitive
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include "Grid.h"
 | 
					#include "Grid.h"
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// This backend starts no communication runtime, so Init is a no-op.
					// The arguments are accepted only to match the other backends' signature.
					void CartesianCommunicator::Init(int *argc, char ***argv)
					{
					  (void)argc;
					  (void)argv;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// This backend always reports rank 0.
					int Rank(void) { return 0; }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
					CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  _processors = processors;
 | 
					  _processors = processors;
 | 
				
			||||||
@@ -47,17 +53,14 @@ void CartesianCommunicator::GlobalSum(float &){}
 | 
				
			|||||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
 | 
					void CartesianCommunicator::GlobalSumVector(float *,int N){}
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(double &){}
 | 
					void CartesianCommunicator::GlobalSum(double &){}
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(uint32_t &){}
 | 
					void CartesianCommunicator::GlobalSum(uint32_t &){}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(uint64_t &){}
 | 
				
			||||||
void CartesianCommunicator::GlobalSumVector(double *,int N){}
 | 
					void CartesianCommunicator::GlobalSumVector(double *,int N){}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void CartesianCommunicator::RecvFrom(void *recv,
 | 
					void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
				
			||||||
				     int recv_from_rank,
 | 
										   void *recv,
 | 
				
			||||||
				     int bytes) 
 | 
										   int xmit_to_rank,
 | 
				
			||||||
{
 | 
										   int recv_from_rank,
 | 
				
			||||||
  assert(0);
 | 
										   int bytes)
 | 
				
			||||||
}
 | 
					 | 
				
			||||||
void CartesianCommunicator::SendTo(void *xmit,
 | 
					 | 
				
			||||||
				   int xmit_to_rank,
 | 
					 | 
				
			||||||
				   int bytes)
 | 
					 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  assert(0);
 | 
					  assert(0);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										334
									
								
								lib/communicator/Communicator_shmem.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										334
									
								
								lib/communicator/Communicator_shmem.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,334 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/communicator/Communicator_shmem.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include "Grid.h"
 | 
				
			||||||
 | 
					#include <mpp/shmem.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Should error check all SHMEM calls.
 | 
				
			||||||
 | 
					// SHMEM_VET(addr): hook placed in front of SHMEM transfers to vet a buffer
					// address. It expands to nothing here; SHMEM_VET_DEBUG below is the
					// checking variant.
					#define SHMEM_VET(addr)

					// SHMEM_VET_DEBUG(addr): if addr is not accessible through SHMEM on this PE,
					// print a diagnostic to stderr and dump a backtrace. Uses `_processor`, so
					// it must be expanded where that name is in scope. Wrapped in do/while(0)
					// so that `SHMEM_VET_DEBUG(p);` is a single statement.
					#define SHMEM_VET_DEBUG(addr) do {					\
					  if ( ! shmem_addr_accessible((addr),_processor) ) {			\
					    std::fprintf(stderr,"%d Inaccessible shmem address %p %s %s\n",_processor,(void *)(addr),__FUNCTION__,#addr); \
					    BACKTRACEFILE();							\
					  }									\
					} while(0)
 | 
				
			||||||
 | 
					int Rank(void) {
 | 
				
			||||||
 | 
					  return shmem_my_pe();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					typedef struct HandShake_t { 
 | 
				
			||||||
 | 
					  uint64_t seq_local;
 | 
				
			||||||
 | 
					  uint64_t seq_remote;
 | 
				
			||||||
 | 
					} HandShake;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static Vector< HandShake > XConnections;
 | 
				
			||||||
 | 
					static Vector< HandShake > RConnections;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Bring up SHMEM, size the per-PE handshake tables and zero every counter,
					// then synchronise all PEs with a barrier. argc/argv are not used here.
					void CartesianCommunicator::Init(int *argc, char ***argv)
					{
					  shmem_init();

					  const int npes = shmem_n_pes();
					  XConnections.resize(npes);
					  RConnections.resize(npes);

					  for(int peer=0; peer<npes; ++peer){
					    XConnections[peer].seq_local  = 0;
					    XConnections[peer].seq_remote = 0;
					    RConnections[peer].seq_local  = 0;
					    RConnections[peer].seq_remote = 0;
					  }

					  shmem_barrier_all();
					}
 | 
				
			||||||
 | 
					// Build the Cartesian process grid described by `processors` (one extent
					// per dimension). This PE's grid coordinate is derived from its SHMEM PE
					// number with Lexicographic::CoorFromIndex, and the product of the extents
					// must equal the number of PEs in the job.
					CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
					{
					  _ndimension = processors.size();
					  _processors = processors;
					  _processor_coor.resize(_ndimension);

					  _processor = shmem_my_pe();
					  Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);

					  _Nprocessors=1;
					  for(int i=0;i<_ndimension;i++){
					    _Nprocessors*=_processors[i];
					  }

					  // The requested grid has to account for every PE exactly once.
					  int Size = shmem_n_pes();
					  assert(Size==_Nprocessors);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
				
			||||||
 | 
					  static long long source ;
 | 
				
			||||||
 | 
					  static long long dest   ;
 | 
				
			||||||
 | 
					  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
				
			||||||
 | 
					  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  int nreduce=1;
 | 
				
			||||||
 | 
					  //  int pestart=0;
 | 
				
			||||||
 | 
					  //  int logStride=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  source = u;
 | 
				
			||||||
 | 
					  dest   = 0;
 | 
				
			||||||
 | 
					  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					  shmem_barrier_all(); // necessary?
 | 
				
			||||||
 | 
					  u = dest;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
				
			||||||
 | 
					  static long long source ;
 | 
				
			||||||
 | 
					  static long long dest   ;
 | 
				
			||||||
 | 
					  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
				
			||||||
 | 
					  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  int nreduce=1;
 | 
				
			||||||
 | 
					  //  int pestart=0;
 | 
				
			||||||
 | 
					  //  int logStride=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  source = u;
 | 
				
			||||||
 | 
					  dest   = 0;
 | 
				
			||||||
 | 
					  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					  shmem_barrier_all(); // necessary?
 | 
				
			||||||
 | 
					  u = dest;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(float &f){
 | 
				
			||||||
 | 
					  static float source ;
 | 
				
			||||||
 | 
					  static float dest   ;
 | 
				
			||||||
 | 
					  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
				
			||||||
 | 
					  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  source = f;
 | 
				
			||||||
 | 
					  dest   =0.0;
 | 
				
			||||||
 | 
					  shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					  f = dest;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static float source ;
 | 
				
			||||||
 | 
					  static float dest   = 0 ;
 | 
				
			||||||
 | 
					  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
				
			||||||
 | 
					  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( shmem_addr_accessible(f,_processor)  ){
 | 
				
			||||||
 | 
					    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					    return;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    dest   =0.0;
 | 
				
			||||||
 | 
					    source = f[i];
 | 
				
			||||||
 | 
					    shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					    f[i] = dest;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(double &d)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static double source;
 | 
				
			||||||
 | 
					  static double dest  ;
 | 
				
			||||||
 | 
					  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
				
			||||||
 | 
					  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  source = d;
 | 
				
			||||||
 | 
					  dest   = 0;
 | 
				
			||||||
 | 
					  shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					  d = dest;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static double source ;
 | 
				
			||||||
 | 
					  static double dest   ;
 | 
				
			||||||
 | 
					  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
 | 
				
			||||||
 | 
					  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( shmem_addr_accessible(d,_processor)  ){
 | 
				
			||||||
 | 
					    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					    return;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    source = d[i];
 | 
				
			||||||
 | 
					    dest   =0.0;
 | 
				
			||||||
 | 
					    shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
 | 
				
			||||||
 | 
					    d[i] = dest;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  std::vector<int> coor = _processor_coor;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(std::abs(shift) <_processors[dim]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
 | 
				
			||||||
 | 
					  Lexicographic::IndexFromCoor(coor,source,_processors);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
 | 
				
			||||||
 | 
					  Lexicographic::IndexFromCoor(coor,dest,_processors);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int rank;
 | 
				
			||||||
 | 
					  Lexicographic::IndexFromCoor(coor,rank,_processors);
 | 
				
			||||||
 | 
					  return rank;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Lexicographic::CoorFromIndex(coor,rank,_processors);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Basic Halo comms primitive
 | 
				
			||||||
 | 
					void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
				
			||||||
 | 
										   int dest,
 | 
				
			||||||
 | 
										   void *recv,
 | 
				
			||||||
 | 
										   int from,
 | 
				
			||||||
 | 
										   int bytes)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  SHMEM_VET(xmit);
 | 
				
			||||||
 | 
					  SHMEM_VET(recv);
 | 
				
			||||||
 | 
					  std::vector<CommsRequest_t> reqs(0);
 | 
				
			||||||
 | 
					  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
				
			||||||
 | 
					  SendToRecvFromComplete(reqs);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
				
			||||||
 | 
										   void *recv,
 | 
				
			||||||
 | 
										   int sender,
 | 
				
			||||||
 | 
										   int receiver,
 | 
				
			||||||
 | 
										   int bytes)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static uint64_t seq;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(recv!=xmit);
 | 
				
			||||||
 | 
					  volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
 | 
				
			||||||
 | 
					  volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( _processor == sender ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
 | 
				
			||||||
 | 
					    // Check he has posted a receive
 | 
				
			||||||
 | 
					    while(SendSeq->seq_remote == SendSeq->seq_local);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf("Sender receive %d posted\n",sender,receiver);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Advance our send count
 | 
				
			||||||
 | 
					    seq = ++(SendSeq->seq_local);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    // Send this packet 
 | 
				
			||||||
 | 
					    SHMEM_VET(recv);
 | 
				
			||||||
 | 
					    shmem_putmem(recv,xmit,bytes,receiver);
 | 
				
			||||||
 | 
					    shmem_fence();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf("Sender sent payload %d\n",seq);
 | 
				
			||||||
 | 
					    //Notify him we're done
 | 
				
			||||||
 | 
					    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
 | 
				
			||||||
 | 
					    shmem_fence();
 | 
				
			||||||
 | 
					    printf("Sender ringing door bell  %d\n",seq);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if ( _processor == receiver ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
 | 
				
			||||||
 | 
					    // Post a receive
 | 
				
			||||||
 | 
					    seq = ++(RecvSeq->seq_local);
 | 
				
			||||||
 | 
					    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf("Receiver Opening letter box %d\n",seq);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    // Now wait until he has advanced our reception counter
 | 
				
			||||||
 | 
					    while(RecvSeq->seq_remote != RecvSeq->seq_local);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf("Receiver Got the mail %d\n",seq);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Basic Halo comms primitive
 | 
				
			||||||
 | 
					void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
				
			||||||
 | 
											void *xmit,
 | 
				
			||||||
 | 
											int dest,
 | 
				
			||||||
 | 
											void *recv,
 | 
				
			||||||
 | 
											int from,
 | 
				
			||||||
 | 
											int bytes)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  SHMEM_VET(xmit);
 | 
				
			||||||
 | 
					  SHMEM_VET(recv);
 | 
				
			||||||
 | 
					  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
 | 
				
			||||||
 | 
					  shmem_putmem(recv,xmit,bytes,dest);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  //  shmem_quiet();      // I'm done
 | 
				
			||||||
 | 
					  shmem_barrier_all();// He's done too
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::Barrier(void)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  shmem_barrier_all();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					  static uint32_t word;
 | 
				
			||||||
 | 
					  uint32_t *array = (uint32_t *) data;
 | 
				
			||||||
 | 
					  assert( (bytes % 4)==0);
 | 
				
			||||||
 | 
					  int words = bytes/4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( shmem_addr_accessible(data,_processor)  ){
 | 
				
			||||||
 | 
					    shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
 | 
				
			||||||
 | 
					    return;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
 | 
					    word = array[w];
 | 
				
			||||||
 | 
					    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
 | 
				
			||||||
 | 
					    if ( shmem_my_pe() != root ) {
 | 
				
			||||||
 | 
					      array[w] = word;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    shmem_barrier_all();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
 | 
				
			||||||
 | 
					  static uint32_t word;
 | 
				
			||||||
 | 
					  uint32_t *array = (uint32_t *) data;
 | 
				
			||||||
 | 
					  assert( (bytes % 4)==0);
 | 
				
			||||||
 | 
					  int words = bytes/4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
 | 
					    word = array[w];
 | 
				
			||||||
 | 
					    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
 | 
				
			||||||
 | 
					    if ( shmem_my_pe() != root ) {
 | 
				
			||||||
 | 
					      array[w]= word;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    shmem_barrier_all();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -35,7 +35,7 @@ class SimpleCompressor {
 | 
				
			|||||||
public:
 | 
					public:
 | 
				
			||||||
  void Point(int) {};
 | 
					  void Point(int) {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
 | 
					  vobj operator() (const vobj &arg) {
 | 
				
			||||||
    return arg;
 | 
					    return arg;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
@@ -56,24 +56,24 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					  int stride=rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
  if ( cbmask == 0x3 ) { 
 | 
					  if ( cbmask == 0x3 ) { 
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
    for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
      for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
	int o  = n*rhs._grid->_slice_stride[dimension];
 | 
						int o  = n*stride;
 | 
				
			||||||
	int bo = n*rhs._grid->_slice_block[dimension];
 | 
						int bo = n*e2;
 | 
				
			||||||
	buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
						buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
     int bo=0;
 | 
					     int bo=0;
 | 
				
			||||||
     for(int n=0;n<e1;n++){
 | 
					     for(int n=0;n<e1;n++){
 | 
				
			||||||
       for(int b=0;b<e2;b++){
 | 
					       for(int b=0;b<e2;b++){
 | 
				
			||||||
	 int o  = n*rhs._grid->_slice_stride[dimension];
 | 
						 int o  = n*stride;
 | 
				
			||||||
	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
						 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
				
			||||||
	 if ( ocb &cbmask ) {
 | 
						 if ( ocb &cbmask ) {
 | 
				
			||||||
	   buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
						   buffer[off+bo++]=compress(rhs._odata[so+o+b]);
 | 
				
			||||||
	 }
 | 
						 }
 | 
				
			||||||
       }
 | 
					       }
 | 
				
			||||||
     }
 | 
					     }
 | 
				
			||||||
@@ -97,16 +97,16 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
  
 | 
					  int n1=rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
					  int n2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
  if ( cbmask ==0x3){
 | 
					  if ( cbmask ==0x3){
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
    for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
      for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	int o=n*rhs._grid->_slice_stride[dimension];
 | 
						int o      =   n*n1;
 | 
				
			||||||
	int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
						int offset = b+n*n2;
 | 
				
			||||||
 | 
						cobj temp =compress(rhs._odata[so+o+b]);
 | 
				
			||||||
	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
					 | 
				
			||||||
	extract<cobj>(temp,pointers,offset);
 | 
						extract<cobj>(temp,pointers,offset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -121,7 +121,7 @@ PARALLEL_NESTED_LOOP2
 | 
				
			|||||||
	int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
						int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ( ocb & cbmask ) {
 | 
						if ( ocb & cbmask ) {
 | 
				
			||||||
	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
						  cobj temp =compress(rhs._odata[so+o+b]);
 | 
				
			||||||
	  extract<cobj>(temp,pointers,offset);
 | 
						  extract<cobj>(temp,pointers,offset);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -243,13 +243,13 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
					  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					  int stride = rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
  if(cbmask == 0x3 ){
 | 
					  if(cbmask == 0x3 ){
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
    for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
      for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
        int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
					        int o =n*stride+b;
 | 
				
			||||||
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
					  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
				
			||||||
	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 | 
						vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -259,7 +259,7 @@ PARALLEL_NESTED_LOOP2
 | 
				
			|||||||
    for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
      for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
        int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
					        int o =n*stride+b;
 | 
				
			||||||
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
 | 
					        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
 | 
				
			||||||
        if ( ocb&cbmask ) {
 | 
					        if ( ocb&cbmask ) {
 | 
				
			||||||
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
					  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
				
			||||||
@@ -285,11 +285,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block [dimension];
 | 
					  int e2=rhs._grid->_slice_block [dimension];
 | 
				
			||||||
 | 
					  int stride = rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
  for(int n=0;n<e1;n++){
 | 
					  for(int n=0;n<e1;n++){
 | 
				
			||||||
  for(int b=0;b<e2;b++){
 | 
					  for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int o  =n*rhs._grid->_slice_stride[dimension];
 | 
					      int o  =n*stride;
 | 
				
			||||||
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 | 
					      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 | 
				
			||||||
      if ( ocb&cbmask ) {
 | 
					      if ( ocb&cbmask ) {
 | 
				
			||||||
	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
 | 
						permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
 | 
				
			||||||
@@ -323,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
				
			|||||||
  int rd = grid->_rdimensions[dimension];
 | 
					  int rd = grid->_rdimensions[dimension];
 | 
				
			||||||
  int ld = grid->_ldimensions[dimension];
 | 
					  int ld = grid->_ldimensions[dimension];
 | 
				
			||||||
  int gd = grid->_gdimensions[dimension];
 | 
					  int gd = grid->_gdimensions[dimension];
 | 
				
			||||||
 | 
					  int ly = grid->_simd_layout[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Map to always positive shift modulo global full dimension.
 | 
					  // Map to always positive shift modulo global full dimension.
 | 
				
			||||||
  shift = (shift+fd)%fd;
 | 
					  shift = (shift+fd)%fd;
 | 
				
			||||||
@@ -331,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
				
			|||||||
  // the permute type
 | 
					  // the permute type
 | 
				
			||||||
  int permute_dim =grid->PermuteDim(dimension);
 | 
					  int permute_dim =grid->PermuteDim(dimension);
 | 
				
			||||||
  int permute_type=grid->PermuteType(dimension);
 | 
					  int permute_type=grid->PermuteType(dimension);
 | 
				
			||||||
 | 
					  int permute_type_dist;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  for(int x=0;x<rd;x++){       
 | 
					  for(int x=0;x<rd;x++){       
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -342,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 | 
				
			|||||||
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
					    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
				
			||||||
    int sx     = (x+sshift)%rd;
 | 
					    int sx     = (x+sshift)%rd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // FIXME : This must change where we have a 
 | 
				
			||||||
 | 
					    // Rotate slice.
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    // Document how this works ; why didn't I do this when I first wrote it...
 | 
				
			||||||
 | 
					    // wrap is whether sshift > rd.
 | 
				
			||||||
 | 
					    //  num is sshift mod rd.
 | 
				
			||||||
 | 
					    // 
 | 
				
			||||||
    int permute_slice=0;
 | 
					    int permute_slice=0;
 | 
				
			||||||
    if(permute_dim){
 | 
					    if(permute_dim){
 | 
				
			||||||
      int wrap = sshift/rd;
 | 
					      int wrap = sshift/rd;
 | 
				
			||||||
      int  num = sshift%rd;
 | 
					      int  num = sshift%rd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if ( x< rd-num ) permute_slice=wrap;
 | 
					      if ( x< rd-num ) permute_slice=wrap;
 | 
				
			||||||
      else permute_slice = 1-wrap;
 | 
					      else permute_slice = (wrap+1)%ly;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      if ( (ly>2) && (permute_slice) ) {
 | 
				
			||||||
 | 
						assert(permute_type & RotateBit);
 | 
				
			||||||
 | 
						permute_type_dist = permute_type|permute_slice;
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
						permute_type_dist = permute_type;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
 | 
					    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
 | 
				
			||||||
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
 | 
					    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -191,8 +191,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 | 
				
			|||||||
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
 | 
					  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
 | 
				
			||||||
  int words = sizeof(vobj)/sizeof(vector_type);
 | 
					  int words = sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
					  std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
 | 
				
			||||||
  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
					  std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int bytes = buffer_size*sizeof(scalar_object);
 | 
					  int bytes = buffer_size*sizeof(scalar_object);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<scalar_object *>  pointers(Nsimd); // 
 | 
					  std::vector<scalar_object *>  pointers(Nsimd); // 
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -55,7 +55,13 @@ extern int GridCshiftPermuteMap[4][16];
 | 
				
			|||||||
// Basic expressions used in Expression Template
 | 
					// Basic expressions used in Expression Template
 | 
				
			||||||
////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class LatticeBase {};
 | 
					class LatticeBase
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					    virtual ~LatticeBase(void) = default;
 | 
				
			||||||
 | 
					    GridBase *_grid;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
class LatticeExpressionBase {};
 | 
					class LatticeExpressionBase {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
 | 
					template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
 | 
				
			||||||
@@ -88,8 +94,6 @@ template<class vobj>
 | 
				
			|||||||
class Lattice : public LatticeBase
 | 
					class Lattice : public LatticeBase
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
 | 
					 | 
				
			||||||
    GridBase *_grid;
 | 
					 | 
				
			||||||
    int checkerboard;
 | 
					    int checkerboard;
 | 
				
			||||||
    Vector<vobj> _odata;
 | 
					    Vector<vobj> _odata;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
@@ -177,8 +181,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  //GridFromExpression is tricky to do
 | 
					  //GridFromExpression is tricky to do
 | 
				
			||||||
  template<class Op,class T1>
 | 
					  template<class Op,class T1>
 | 
				
			||||||
    Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
 | 
					    Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
 | 
				
			||||||
 | 
					    _grid = nullptr;
 | 
				
			||||||
    GridFromExpression(_grid,expr);
 | 
					    GridFromExpression(_grid,expr);
 | 
				
			||||||
    assert(_grid!=nullptr);
 | 
					    assert(_grid!=nullptr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -199,7 +203,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  template<class Op,class T1, class T2>
 | 
					  template<class Op,class T1, class T2>
 | 
				
			||||||
  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
 | 
					  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
 | 
				
			||||||
 | 
					    _grid = nullptr;
 | 
				
			||||||
    GridFromExpression(_grid,expr);
 | 
					    GridFromExpression(_grid,expr);
 | 
				
			||||||
    assert(_grid!=nullptr);
 | 
					    assert(_grid!=nullptr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -220,7 +225,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  template<class Op,class T1, class T2, class T3>
 | 
					  template<class Op,class T1, class T2, class T3>
 | 
				
			||||||
  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
 | 
					  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
 | 
				
			||||||
 | 
					    _grid = nullptr;
 | 
				
			||||||
    GridFromExpression(_grid,expr);
 | 
					    GridFromExpression(_grid,expr);
 | 
				
			||||||
    assert(_grid!=nullptr);
 | 
					    assert(_grid!=nullptr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -240,7 +246,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    // Constructor requires "grid" passed.
 | 
					    // Constructor requires "grid" passed.
 | 
				
			||||||
    // what about a default grid?
 | 
					    // what about a default grid?
 | 
				
			||||||
    //////////////////////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////////////////////
 | 
				
			||||||
    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
 | 
					    Lattice(GridBase *grid) : _odata(grid->oSites()) {
 | 
				
			||||||
 | 
					        _grid = grid;
 | 
				
			||||||
    //        _odata.reserve(_grid->oSites());
 | 
					    //        _odata.reserve(_grid->oSites());
 | 
				
			||||||
    //        _odata.resize(_grid->oSites());
 | 
					    //        _odata.resize(_grid->oSites());
 | 
				
			||||||
    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
 | 
					    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
 | 
				
			||||||
@@ -248,6 +255,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
        checkerboard=0;
 | 
					        checkerboard=0;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    virtual ~Lattice(void) = default;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 | 
					    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int ss=0;ss<_grid->oSites();ss++){
 | 
					        for(int ss=0;ss<_grid->oSites();ss++){
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    // Peek a scalar object from the SIMD array
 | 
					    // Peek a scalar object from the SIMD array
 | 
				
			||||||
    //////////////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////////////
 | 
				
			||||||
    template<class vobj,class sobj>
 | 
					    template<class vobj,class sobj>
 | 
				
			||||||
    void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
 | 
					    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
      GridBase *grid=l._grid;
 | 
					      GridBase *grid=l._grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -152,7 +152,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
				
			|||||||
  assert(grid!=NULL);
 | 
					  assert(grid!=NULL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // FIXME
 | 
					  // FIXME
 | 
				
			||||||
  std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
					  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int    Nd = grid->_ndimension;
 | 
					  const int    Nd = grid->_ndimension;
 | 
				
			||||||
  const int Nsimd = grid->Nsimd();
 | 
					  const int Nsimd = grid->Nsimd();
 | 
				
			||||||
@@ -178,7 +178,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
				
			|||||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
					  // sum over reduced dimension planes, breaking out orthog dir
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
					  for(int ss=0;ss<grid->oSites();ss++){
 | 
				
			||||||
    GridBase::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
					    Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
				
			||||||
    int r = coor[orthogdim];
 | 
					    int r = coor[orthogdim];
 | 
				
			||||||
    lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
					    lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
				
			||||||
  }  
 | 
					  }  
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -75,7 +75,7 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    std::seed_seq src;
 | 
					    std::seed_seq src;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    fixedSeed(std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
 | 
					    fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    result_type operator () (void){
 | 
					    result_type operator () (void){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -119,9 +119,10 @@ namespace Grid {
 | 
				
			|||||||
    typedef uint32_t     RngStateType;
 | 
					    typedef uint32_t     RngStateType;
 | 
				
			||||||
    static const int     RngStateCount = std::mt19937::state_size;
 | 
					    static const int     RngStateCount = std::mt19937::state_size;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    std::vector<RngEngine>             _generators;
 | 
					    std::vector<RngEngine>                             _generators;
 | 
				
			||||||
    std::vector<std::uniform_real_distribution<RealD> > _uniform;
 | 
					    std::vector<std::uniform_real_distribution<RealD>> _uniform;
 | 
				
			||||||
    std::vector<std::normal_distribution<RealD> >       _gaussian;
 | 
					    std::vector<std::normal_distribution<RealD>>       _gaussian;
 | 
				
			||||||
 | 
					    std::vector<std::discrete_distribution<int32_t>>     _bernoulli;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void GetState(std::vector<RngStateType> & saved,int gen) {
 | 
					    void GetState(std::vector<RngStateType> & saved,int gen) {
 | 
				
			||||||
      saved.resize(RngStateCount);
 | 
					      saved.resize(RngStateCount);
 | 
				
			||||||
@@ -161,6 +162,7 @@ namespace Grid {
 | 
				
			|||||||
      _generators.resize(1);
 | 
					      _generators.resize(1);
 | 
				
			||||||
      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
 | 
					      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
 | 
				
			||||||
      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
 | 
					      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
 | 
				
			||||||
 | 
					      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
 | 
				
			||||||
      _seeded=0;
 | 
					      _seeded=0;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -242,7 +244,7 @@ namespace Grid {
 | 
				
			|||||||
      std::random_device rd;
 | 
					      std::random_device rd;
 | 
				
			||||||
      Seed(rd);
 | 
					      Seed(rd);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    void SeedFixedIntegers(std::vector<int> &seeds){
 | 
					    void SeedFixedIntegers(const std::vector<int> &seeds){
 | 
				
			||||||
      fixedSeed src(seeds);
 | 
					      fixedSeed src(seeds);
 | 
				
			||||||
      Seed(src);
 | 
					      Seed(src);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -266,6 +268,7 @@ namespace Grid {
 | 
				
			|||||||
      _generators.resize(_vol);
 | 
					      _generators.resize(_vol);
 | 
				
			||||||
      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
 | 
					      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
 | 
				
			||||||
      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
 | 
					      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
 | 
				
			||||||
 | 
					      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
 | 
				
			||||||
      _seeded=0;
 | 
					      _seeded=0;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -354,7 +357,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      std::random_device rd;
 | 
					      std::random_device rd;
 | 
				
			||||||
      Seed(rd);
 | 
					      Seed(rd);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    void SeedFixedIntegers(std::vector<int> &seeds){
 | 
					    void SeedFixedIntegers(const std::vector<int> &seeds){
 | 
				
			||||||
      fixedSeed src(seeds);
 | 
					      fixedSeed src(seeds);
 | 
				
			||||||
      Seed(src);
 | 
					      Seed(src);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -369,13 +372,21 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    rng.fill(l,rng._gaussian);
 | 
					    rng.fill(l,rng._gaussian);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					  // Draw into lattice `l` from the parallel generator's _bernoulli distributions.
					  // Those are std::discrete_distribution<int32_t>{1,1} entries, i.e. outcomes 0 and 1
					  // with equal weight.  All of the work is delegated to GridParallelRNG::fill.
					  template <class vobj>
					  inline void bernoulli(GridParallelRNG &rng, Lattice<vobj> &l)
					  {
					    rng.fill(l, rng._bernoulli);
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
 | 
					  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
 | 
				
			||||||
    rng.fill(l,rng._uniform);
 | 
					    rng.fill(l,rng._uniform);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
 | 
					  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
 | 
				
			||||||
    rng.fill(l,rng._gaussian);
 | 
					    rng.fill(l,rng._gaussian);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					  // Draw into the scalar object `l` from the serial generator's _bernoulli
					  // distribution (a std::discrete_distribution<int32_t>{1,1}, i.e. outcomes 0 and 1
					  // with equal weight).  All of the work is delegated to GridSerialRNG::fill.
					  template <class sobj>
					  inline void bernoulli(GridSerialRNG &rng, sobj &l)
					  {
					    rng.fill(l, rng._bernoulli);
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -115,9 +115,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
				
			|||||||
    int sc;
 | 
					    int sc;
 | 
				
			||||||
    std::vector<int> coor_c(_ndimension);
 | 
					    std::vector<int> coor_c(_ndimension);
 | 
				
			||||||
    std::vector<int> coor_f(_ndimension);
 | 
					    std::vector<int> coor_f(_ndimension);
 | 
				
			||||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
					    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
				
			||||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
					    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
				
			||||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
					    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i=0;i<nbasis;i++) {
 | 
					    for(int i=0;i<nbasis;i++) {
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
@@ -160,9 +160,9 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    std::vector<int> coor_c(_ndimension);
 | 
					    std::vector<int> coor_c(_ndimension);
 | 
				
			||||||
    std::vector<int> coor_f(_ndimension);
 | 
					    std::vector<int> coor_f(_ndimension);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
					    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
				
			||||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
					    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
				
			||||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
					    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // z = A x + y
 | 
					    // z = A x + y
 | 
				
			||||||
    fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
 | 
					    fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
 | 
				
			||||||
@@ -225,9 +225,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
				
			|||||||
    std::vector<int> coor_c(_ndimension);
 | 
					    std::vector<int> coor_c(_ndimension);
 | 
				
			||||||
    std::vector<int> coor_f(_ndimension);
 | 
					    std::vector<int> coor_f(_ndimension);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
					    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
				
			||||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
					    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
				
			||||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
					    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
 | 
					    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -311,9 +311,9 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
				
			|||||||
    std::vector<int> coor_c(_ndimension);
 | 
					    std::vector<int> coor_c(_ndimension);
 | 
				
			||||||
    std::vector<int> coor_f(_ndimension);
 | 
					    std::vector<int> coor_f(_ndimension);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
					    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
				
			||||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
					    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
				
			||||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
					    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i=0;i<nbasis;i++) {
 | 
					    for(int i=0;i<nbasis;i++) {
 | 
				
			||||||
      if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
 | 
					      if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
 | 
				
			||||||
@@ -325,6 +325,126 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 | 
				
			||||||
 | 
					// Simd layouts need not match since we use peek/poke Local
 | 
				
			||||||
 | 
					template<class vobj,class vvobj>
 | 
				
			||||||
 | 
					void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
 | 
					  typedef typename vvobj::scalar_object ssobj;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  sobj s;
 | 
				
			||||||
 | 
					  ssobj ss;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridBase *ig = in._grid;
 | 
				
			||||||
 | 
					  GridBase *og = out._grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int ni = ig->_ndimension;
 | 
				
			||||||
 | 
					  int no = og->_ndimension;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(ni == no);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int d=0;d<no;d++){
 | 
				
			||||||
 | 
					    assert(ig->_processors[d]  == og->_processors[d]);
 | 
				
			||||||
 | 
					    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
 | 
					  for(int idx=0;idx<ig->lSites();idx++){
 | 
				
			||||||
 | 
					    std::vector<int> lcoor(ni);
 | 
				
			||||||
 | 
					    ig->LocalIndexToLocalCoor(idx,lcoor);
 | 
				
			||||||
 | 
					    peekLocalSite(s,in,lcoor);
 | 
				
			||||||
 | 
					    ss=s;
 | 
				
			||||||
 | 
					    pokeLocalSite(ss,out,lcoor);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class vobj>
 | 
				
			||||||
 | 
					void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
 | 
					  sobj s;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridBase *lg = lowDim._grid;
 | 
				
			||||||
 | 
					  GridBase *hg = higherDim._grid;
 | 
				
			||||||
 | 
					  int nl = lg->_ndimension;
 | 
				
			||||||
 | 
					  int nh = hg->_ndimension;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(nl+1 == nh);
 | 
				
			||||||
 | 
					  assert(orthog<nh);
 | 
				
			||||||
 | 
					  assert(orthog>=0);
 | 
				
			||||||
 | 
					  assert(hg->_processors[orthog]==1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int dl; dl = 0;
 | 
				
			||||||
 | 
					  for(int d=0;d<nh;d++){
 | 
				
			||||||
 | 
					    if ( d != orthog) {
 | 
				
			||||||
 | 
					      assert(lg->_processors[dl]  == hg->_processors[d]);
 | 
				
			||||||
 | 
					      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
 | 
				
			||||||
 | 
					      dl++;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // the above should guarantee that the operations are local
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
 | 
					  for(int idx=0;idx<lg->lSites();idx++){
 | 
				
			||||||
 | 
					    std::vector<int> lcoor(nl);
 | 
				
			||||||
 | 
					    std::vector<int> hcoor(nh);
 | 
				
			||||||
 | 
					    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
				
			||||||
 | 
					    dl=0;
 | 
				
			||||||
 | 
					    hcoor[orthog] = slice;
 | 
				
			||||||
 | 
					    for(int d=0;d<nh;d++){
 | 
				
			||||||
 | 
					      if ( d!=orthog ) { 
 | 
				
			||||||
 | 
						hcoor[d]=lcoor[dl++];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    peekLocalSite(s,lowDim,lcoor);
 | 
				
			||||||
 | 
					    pokeLocalSite(s,higherDim,hcoor);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class vobj>
 | 
				
			||||||
 | 
					void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
 | 
					  sobj s;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridBase *lg = lowDim._grid;
 | 
				
			||||||
 | 
					  GridBase *hg = higherDim._grid;
 | 
				
			||||||
 | 
					  int nl = lg->_ndimension;
 | 
				
			||||||
 | 
					  int nh = hg->_ndimension;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(nl+1 == nh);
 | 
				
			||||||
 | 
					  assert(orthog<nh);
 | 
				
			||||||
 | 
					  assert(orthog>=0);
 | 
				
			||||||
 | 
					  assert(hg->_processors[orthog]==1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int dl; dl = 0;
 | 
				
			||||||
 | 
					  for(int d=0;d<nh;d++){
 | 
				
			||||||
 | 
					    if ( d != orthog) {
 | 
				
			||||||
 | 
					      assert(lg->_processors[dl]  == hg->_processors[d]);
 | 
				
			||||||
 | 
					      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
 | 
				
			||||||
 | 
					      dl++;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  // the above should guarantee that the operations are local
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
 | 
					  for(int idx=0;idx<lg->lSites();idx++){
 | 
				
			||||||
 | 
					    std::vector<int> lcoor(nl);
 | 
				
			||||||
 | 
					    std::vector<int> hcoor(nh);
 | 
				
			||||||
 | 
					    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
				
			||||||
 | 
					    dl=0;
 | 
				
			||||||
 | 
					    hcoor[orthog] = slice;
 | 
				
			||||||
 | 
					    for(int d=0;d<nh;d++){
 | 
				
			||||||
 | 
					      if ( d!=orthog ) { 
 | 
				
			||||||
 | 
						hcoor[d]=lcoor[dl++];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    peekLocalSite(s,higherDim,hcoor);
 | 
				
			||||||
 | 
					    pokeLocalSite(s,lowDim,lcoor);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class vobj>
 | 
					template<class vobj>
 | 
				
			||||||
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 | 
					void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -146,7 +146,7 @@ class BinaryIO {
 | 
				
			|||||||
    csum = 0;
 | 
					    csum = 0;
 | 
				
			||||||
    std::vector<int> lcoor;
 | 
					    std::vector<int> lcoor;
 | 
				
			||||||
    for(int l=0;l<grid->lSites();l++){
 | 
					    for(int l=0;l<grid->lSites();l++){
 | 
				
			||||||
      grid->CoorFromIndex(lcoor,l,grid->_ldimensions);
 | 
					      Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions);
 | 
				
			||||||
      peekLocalSite(siteObj,lat,lcoor);
 | 
					      peekLocalSite(siteObj,lat,lcoor);
 | 
				
			||||||
      munge(siteObj,fileObj,csum);
 | 
					      munge(siteObj,fileObj,csum);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -168,6 +168,7 @@ class BinaryIO {
 | 
				
			|||||||
    GridBase *grid = Umu._grid;
 | 
					    GridBase *grid = Umu._grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
 | 
					    std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
 | 
				
			||||||
 | 
					    GridStopWatch timer; timer.Start();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int ieee32big = (format == std::string("IEEE32BIG"));
 | 
					    int ieee32big = (format == std::string("IEEE32BIG"));
 | 
				
			||||||
    int ieee32    = (format == std::string("IEEE32"));
 | 
					    int ieee32    = (format == std::string("IEEE32"));
 | 
				
			||||||
@@ -182,6 +183,7 @@ class BinaryIO {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    Umu = zero;
 | 
					    Umu = zero;
 | 
				
			||||||
    uint32_t csum=0;
 | 
					    uint32_t csum=0;
 | 
				
			||||||
 | 
					    uint64_t bytes=0;
 | 
				
			||||||
    fobj file_object;
 | 
					    fobj file_object;
 | 
				
			||||||
    sobj munged;
 | 
					    sobj munged;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
@@ -194,7 +196,7 @@ class BinaryIO {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      if ( grid->IsBoss() ) {
 | 
					      if ( grid->IsBoss() ) {
 | 
				
			||||||
	fin.read((char *)&file_object,sizeof(file_object));
 | 
						fin.read((char *)&file_object,sizeof(file_object));
 | 
				
			||||||
	
 | 
						bytes += sizeof(file_object);
 | 
				
			||||||
	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 | 
						if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 | 
				
			||||||
	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
 | 
						if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
 | 
				
			||||||
	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
 | 
						if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
 | 
				
			||||||
@@ -205,6 +207,10 @@ class BinaryIO {
 | 
				
			|||||||
      // The boss who read the file has their value poked
 | 
					      // The boss who read the file has their value poked
 | 
				
			||||||
      pokeSite(munged,Umu,site);
 | 
					      pokeSite(munged,Umu,site);
 | 
				
			||||||
    }}}}
 | 
					    }}}}
 | 
				
			||||||
 | 
					    timer.Stop();
 | 
				
			||||||
 | 
					    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
				
			||||||
 | 
						     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return csum;
 | 
					    return csum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -224,13 +230,14 @@ class BinaryIO {
 | 
				
			|||||||
    // Serialise through node zero
 | 
					    // Serialise through node zero
 | 
				
			||||||
    //////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////
 | 
				
			||||||
    std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
 | 
					    std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
 | 
				
			||||||
 | 
					    GridStopWatch timer; timer.Start();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::ofstream fout;
 | 
					    std::ofstream fout;
 | 
				
			||||||
    if ( grid->IsBoss() ) {
 | 
					    if ( grid->IsBoss() ) {
 | 
				
			||||||
      fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
					      fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
				
			||||||
      fout.seekp(offset);
 | 
					      fout.seekp(offset);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    
 | 
					    uint64_t bytes=0;
 | 
				
			||||||
    uint32_t csum=0;
 | 
					    uint32_t csum=0;
 | 
				
			||||||
    fobj file_object;
 | 
					    fobj file_object;
 | 
				
			||||||
    sobj unmunged;
 | 
					    sobj unmunged;
 | 
				
			||||||
@@ -253,9 +260,14 @@ class BinaryIO {
 | 
				
			|||||||
	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 | 
						if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 | 
				
			||||||
	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 | 
						if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						// NB could gather an xstrip as an optimisation.
 | 
				
			||||||
	fout.write((char *)&file_object,sizeof(file_object));
 | 
						fout.write((char *)&file_object,sizeof(file_object));
 | 
				
			||||||
 | 
						bytes+=sizeof(file_object);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }}}}
 | 
					    }}}}
 | 
				
			||||||
 | 
					    timer.Stop();
 | 
				
			||||||
 | 
					    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
				
			||||||
 | 
						     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return csum;
 | 
					    return csum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -265,6 +277,7 @@ class BinaryIO {
 | 
				
			|||||||
    typedef typename GridSerialRNG::RngStateType RngStateType;
 | 
					    typedef typename GridSerialRNG::RngStateType RngStateType;
 | 
				
			||||||
    const int RngStateCount = GridSerialRNG::RngStateCount;
 | 
					    const int RngStateCount = GridSerialRNG::RngStateCount;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    GridBase *grid = parallel._grid;
 | 
					    GridBase *grid = parallel._grid;
 | 
				
			||||||
    int gsites = grid->_gsites;
 | 
					    int gsites = grid->_gsites;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -310,7 +323,7 @@ class BinaryIO {
 | 
				
			|||||||
      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
					      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
				
			||||||
      fout.write((char *)&saved[0],bytes);
 | 
					      fout.write((char *)&saved[0],bytes);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    grid->Broadcast(0,(void *)&csum,sizeof(csum));
 | 
				
			||||||
    return csum;
 | 
					    return csum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
 | 
					  static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
 | 
				
			||||||
@@ -360,6 +373,8 @@ class BinaryIO {
 | 
				
			|||||||
      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
					      Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    grid->Broadcast(0,(void *)&csum,sizeof(csum));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return csum;
 | 
					    return csum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -398,7 +413,7 @@ class BinaryIO {
 | 
				
			|||||||
    int IOnode = 1;
 | 
					    int IOnode = 1;
 | 
				
			||||||
    for(int d=0;d<grid->_ndimension;d++) {
 | 
					    for(int d=0;d<grid->_ndimension;d++) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if ( d==0 ) parallel[d] = 0;
 | 
					      if ( d == 0 ) parallel[d] = 0;
 | 
				
			||||||
      if (parallel[d]) {
 | 
					      if (parallel[d]) {
 | 
				
			||||||
	range[d] = grid->_ldimensions[d];
 | 
						range[d] = grid->_ldimensions[d];
 | 
				
			||||||
	start[d] = grid->_processor_coor[d]*range[d];
 | 
						start[d] = grid->_processor_coor[d]*range[d];
 | 
				
			||||||
@@ -426,6 +441,9 @@ class BinaryIO {
 | 
				
			|||||||
      std::cout << std::endl;
 | 
					      std::cout << std::endl;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GridStopWatch timer; timer.Start();
 | 
				
			||||||
 | 
					    uint64_t bytes=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int myrank = grid->ThisRank();
 | 
					    int myrank = grid->ThisRank();
 | 
				
			||||||
    int iorank = grid->RankFromProcessorCoor(ioproc);
 | 
					    int iorank = grid->RankFromProcessorCoor(ioproc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -439,9 +457,9 @@ class BinaryIO {
 | 
				
			|||||||
    // available (how short sighted is that?)
 | 
					    // available (how short sighted is that?)
 | 
				
			||||||
    //////////////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////////////
 | 
				
			||||||
    Umu = zero;
 | 
					    Umu = zero;
 | 
				
			||||||
    uint32_t csum=0;
 | 
					    static uint32_t csum=0;
 | 
				
			||||||
    fobj fileObj;
 | 
					    fobj fileObj;
 | 
				
			||||||
    sobj siteObj;
 | 
					    static sobj siteObj; // Static to place in symmetric region for SHMEM
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // need to implement these loops in Nd independent way with a lexico conversion
 | 
					      // need to implement these loops in Nd independent way with a lexico conversion
 | 
				
			||||||
    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
					    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
				
			||||||
@@ -451,7 +469,7 @@ class BinaryIO {
 | 
				
			|||||||
      std::vector<int> lsite(nd);
 | 
					      std::vector<int> lsite(nd);
 | 
				
			||||||
      std::vector<int> iosite(nd);
 | 
					      std::vector<int> iosite(nd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      grid->CoorFromIndex(tsite,tlex,range);
 | 
					      Lexicographic::CoorFromIndex(tsite,tlex,range);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int d=0;d<nd;d++){
 | 
					      for(int d=0;d<nd;d++){
 | 
				
			||||||
	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
						lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
				
			||||||
@@ -461,7 +479,7 @@ class BinaryIO {
 | 
				
			|||||||
      /////////////////////////
 | 
					      /////////////////////////
 | 
				
			||||||
      // Get the rank of owner of data
 | 
					      // Get the rank of owner of data
 | 
				
			||||||
      /////////////////////////
 | 
					      /////////////////////////
 | 
				
			||||||
	int rank, o_idx,i_idx, g_idx;
 | 
					      int rank, o_idx,i_idx, g_idx;
 | 
				
			||||||
      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
 | 
					      grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
 | 
				
			||||||
      grid->GlobalCoorToGlobalIndex(gsite,g_idx);
 | 
					      grid->GlobalCoorToGlobalIndex(gsite,g_idx);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
@@ -472,6 +490,7 @@ class BinaryIO {
 | 
				
			|||||||
	
 | 
						
 | 
				
			||||||
	fin.seekg(offset+g_idx*sizeof(fileObj));
 | 
						fin.seekg(offset+g_idx*sizeof(fileObj));
 | 
				
			||||||
	fin.read((char *)&fileObj,sizeof(fileObj));
 | 
						fin.read((char *)&fileObj,sizeof(fileObj));
 | 
				
			||||||
 | 
						bytes+=sizeof(fileObj);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
						if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
				
			||||||
	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
						if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
 | 
				
			||||||
@@ -480,22 +499,28 @@ class BinaryIO {
 | 
				
			|||||||
	
 | 
						
 | 
				
			||||||
	munge(fileObj,siteObj,csum);
 | 
						munge(fileObj,siteObj,csum);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ( rank != myrank ) {
 | 
					      }	
 | 
				
			||||||
	  grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
 | 
					 | 
				
			||||||
	} else { 
 | 
					 | 
				
			||||||
	  pokeLocalSite(siteObj,Umu,lsite);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      } else { 
 | 
					      // Possibly do transport through pt2pt 
 | 
				
			||||||
	if ( myrank == rank ) {
 | 
					      if ( rank != iorank ) { 
 | 
				
			||||||
	  grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
 | 
						if ( (myrank == rank) || (myrank==iorank) ) {
 | 
				
			||||||
	  pokeLocalSite(siteObj,Umu,lsite);
 | 
						  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					      // Poke at destination
 | 
				
			||||||
 | 
					      if ( myrank == rank ) {
 | 
				
			||||||
 | 
						  pokeLocalSite(siteObj,Umu,lsite);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
      grid->Barrier(); // necessary?
 | 
					      grid->Barrier(); // necessary?
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    grid->GlobalSum(csum);
 | 
					    grid->GlobalSum(csum);
 | 
				
			||||||
 | 
					    grid->GlobalSum(bytes);
 | 
				
			||||||
 | 
					    grid->Barrier();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    timer.Stop();
 | 
				
			||||||
 | 
					    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
				
			||||||
 | 
						     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    return csum;
 | 
					    return csum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -530,7 +555,7 @@ class BinaryIO {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    for(int d=0;d<grid->_ndimension;d++) {
 | 
					    for(int d=0;d<grid->_ndimension;d++) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if ( d==0 ) parallel[d] = 0;
 | 
					      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if (parallel[d]) {
 | 
					      if (parallel[d]) {
 | 
				
			||||||
	range[d] = grid->_ldimensions[d];
 | 
						range[d] = grid->_ldimensions[d];
 | 
				
			||||||
@@ -559,6 +584,9 @@ class BinaryIO {
 | 
				
			|||||||
      std::cout << std::endl;
 | 
					      std::cout << std::endl;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GridStopWatch timer; timer.Start();
 | 
				
			||||||
 | 
					    uint64_t bytes=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int myrank = grid->ThisRank();
 | 
					    int myrank = grid->ThisRank();
 | 
				
			||||||
    int iorank = grid->RankFromProcessorCoor(ioproc);
 | 
					    int iorank = grid->RankFromProcessorCoor(ioproc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -577,10 +605,10 @@ class BinaryIO {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    uint32_t csum=0;
 | 
					    uint32_t csum=0;
 | 
				
			||||||
    fobj fileObj;
 | 
					    fobj fileObj;
 | 
				
			||||||
    sobj siteObj;
 | 
					    static sobj siteObj; // static for SHMEM target; otherwise dynamic allocate with AlignedAllocator
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // should aggregate a whole chunk and then write.
 | 
				
			||||||
      // need to implement these loops in Nd independent way with a lexico conversion
 | 
					    // need to implement these loops in Nd independent way with a lexico conversion
 | 
				
			||||||
    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
					    for(int tlex=0;tlex<slice_vol;tlex++){
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
      std::vector<int> tsite(nd); // temporary mixed up site
 | 
					      std::vector<int> tsite(nd); // temporary mixed up site
 | 
				
			||||||
@@ -588,7 +616,7 @@ class BinaryIO {
 | 
				
			|||||||
      std::vector<int> lsite(nd);
 | 
					      std::vector<int> lsite(nd);
 | 
				
			||||||
      std::vector<int> iosite(nd);
 | 
					      std::vector<int> iosite(nd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      grid->CoorFromIndex(tsite,tlex,range);
 | 
					      Lexicographic::CoorFromIndex(tsite,tlex,range);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int d=0;d<nd;d++){
 | 
					      for(int d=0;d<nd;d++){
 | 
				
			||||||
	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
						lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
 | 
				
			||||||
@@ -606,13 +634,21 @@ class BinaryIO {
 | 
				
			|||||||
      ////////////////////////////////
 | 
					      ////////////////////////////////
 | 
				
			||||||
      // iorank writes from the seek
 | 
					      // iorank writes from the seek
 | 
				
			||||||
      ////////////////////////////////
 | 
					      ////////////////////////////////
 | 
				
			||||||
      if (myrank == iorank) {
 | 
					 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
	if ( rank != myrank ) {
 | 
					      // Owner of data peeks it
 | 
				
			||||||
	  grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
 | 
					      peekLocalSite(siteObj,Umu,lsite);
 | 
				
			||||||
	} else { 
 | 
					
 | 
				
			||||||
	  peekLocalSite(siteObj,Umu,lsite);
 | 
					      // Pair of nodes may need to do pt2pt send
 | 
				
			||||||
 | 
					      if ( rank != iorank ) { // comms is necessary
 | 
				
			||||||
 | 
						if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
 | 
				
			||||||
 | 
						  // Send to IOrank 
 | 
				
			||||||
 | 
						  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      grid->Barrier(); // necessary?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      if (myrank == iorank) {
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	munge(siteObj,fileObj,csum);
 | 
						munge(siteObj,fileObj,csum);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -623,17 +659,16 @@ class BinaryIO {
 | 
				
			|||||||
	
 | 
						
 | 
				
			||||||
	fout.seekp(offset+g_idx*sizeof(fileObj));
 | 
						fout.seekp(offset+g_idx*sizeof(fileObj));
 | 
				
			||||||
	fout.write((char *)&fileObj,sizeof(fileObj));
 | 
						fout.write((char *)&fileObj,sizeof(fileObj));
 | 
				
			||||||
 | 
						bytes+=sizeof(fileObj);
 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
	if ( myrank == rank ) {
 | 
					 | 
				
			||||||
	  peekLocalSite(siteObj,Umu,lsite);
 | 
					 | 
				
			||||||
	  grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
 | 
					 | 
				
			||||||
	} 
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      grid->Barrier(); // necessary// or every 16 packets to rate throttle??
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    grid->GlobalSum(csum);
 | 
					    grid->GlobalSum(csum);
 | 
				
			||||||
 | 
					    grid->GlobalSum(bytes);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    timer.Stop();
 | 
				
			||||||
 | 
					    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
 | 
				
			||||||
 | 
						     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return csum;
 | 
					    return csum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -213,37 +213,38 @@ class NerscIO : public BinaryIO {
 | 
				
			|||||||
  static inline void truncate(std::string file){
 | 
					  static inline void truncate(std::string file){
 | 
				
			||||||
    std::ofstream fout(file,std::ios::out);
 | 
					    std::ofstream fout(file,std::ios::out);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  #define dump_nersc_header(field, s)\
 | 
				
			||||||
 | 
					  s << "BEGIN_HEADER"      << std::endl;\
 | 
				
			||||||
 | 
					  s << "HDR_VERSION = "    << field.hdr_version    << std::endl;\
 | 
				
			||||||
 | 
					  s << "DATATYPE = "       << field.data_type      << std::endl;\
 | 
				
			||||||
 | 
					  s << "STORAGE_FORMAT = " << field.storage_format << std::endl;\
 | 
				
			||||||
 | 
					  for(int i=0;i<4;i++){\
 | 
				
			||||||
 | 
					    s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;\
 | 
				
			||||||
 | 
					  }\
 | 
				
			||||||
 | 
					  s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;\
 | 
				
			||||||
 | 
					  s << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl;\
 | 
				
			||||||
 | 
					  for(int i=0;i<4;i++){\
 | 
				
			||||||
 | 
					    s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;\
 | 
				
			||||||
 | 
					  }\
 | 
				
			||||||
 | 
					  \
 | 
				
			||||||
 | 
					  s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;\
 | 
				
			||||||
 | 
					  s << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;\
 | 
				
			||||||
 | 
					  s << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;\
 | 
				
			||||||
 | 
					  s << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;\
 | 
				
			||||||
 | 
					  s << "CREATOR = "         << field.creator          << std::endl;\
 | 
				
			||||||
 | 
					  s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;\
 | 
				
			||||||
 | 
					  s << "CREATION_DATE = "   << field.creation_date    << std::endl;\
 | 
				
			||||||
 | 
					  s << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;\
 | 
				
			||||||
 | 
					  s << "FLOATING_POINT = "  << field.floating_point   << std::endl;\
 | 
				
			||||||
 | 
					  s << "END_HEADER"         << std::endl;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
  static inline unsigned int writeHeader(NerscField &field,std::string file)
 | 
					  static inline unsigned int writeHeader(NerscField &field,std::string file)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    std::ofstream fout(file,std::ios::out|std::ios::in);
 | 
					    std::ofstream fout(file,std::ios::out|std::ios::in);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
    fout.seekp(0,std::ios::beg);
 | 
					    fout.seekp(0,std::ios::beg);
 | 
				
			||||||
    fout << "BEGIN_HEADER"      << std::endl;
 | 
					    dump_nersc_header(field, fout);
 | 
				
			||||||
    fout << "HDR_VERSION = "    << field.hdr_version    << std::endl;
 | 
					 | 
				
			||||||
    fout << "DATATYPE = "       << field.data_type      << std::endl;
 | 
					 | 
				
			||||||
    fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for(int i=0;i<4;i++){
 | 
					 | 
				
			||||||
      fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    // just to keep the space and write it later
 | 
					 | 
				
			||||||
    fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
 | 
					 | 
				
			||||||
    fout << "PLAQUETTE  = " << std::setprecision(10) << field.plaquette  << std::endl;
 | 
					 | 
				
			||||||
    for(int i=0;i<4;i++){
 | 
					 | 
				
			||||||
      fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    fout << "ENSEMBLE_ID = "     << field.ensemble_id      << std::endl;
 | 
					 | 
				
			||||||
    fout << "ENSEMBLE_LABEL = "  << field.ensemble_label   << std::endl;
 | 
					 | 
				
			||||||
    fout << "SEQUENCE_NUMBER = " << field.sequence_number  << std::endl;
 | 
					 | 
				
			||||||
    fout << "CREATOR = "         << field.creator          << std::endl;
 | 
					 | 
				
			||||||
    fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
 | 
					 | 
				
			||||||
    fout << "CREATION_DATE = "   << field.creation_date    << std::endl;
 | 
					 | 
				
			||||||
    fout << "ARCHIVE_DATE = "    << field.archive_date     << std::endl;
 | 
					 | 
				
			||||||
    fout << "FLOATING_POINT = "  << field.floating_point   << std::endl;
 | 
					 | 
				
			||||||
    fout << "END_HEADER"         << std::endl;
 | 
					 | 
				
			||||||
    field.data_start = fout.tellp();
 | 
					    field.data_start = fout.tellp();
 | 
				
			||||||
    return field.data_start;
 | 
					    return field.data_start;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -345,17 +346,17 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 | 
				
			|||||||
  if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
 | 
					  if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
 | 
				
			||||||
    if ( ieee32 || ieee32big ) {
 | 
					    if ( ieee32 || ieee32big ) {
 | 
				
			||||||
      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
					      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
				
			||||||
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
						csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 | 
				
			||||||
	(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
 | 
						(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    if ( ieee64 || ieee64big ) {
 | 
					    if ( ieee64 || ieee64big ) {
 | 
				
			||||||
      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
					      //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
				
			||||||
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
					      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 | 
				
			||||||
	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 | 
					      	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
 | 
					  } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
 | 
				
			||||||
    if ( ieee32 || ieee32big ) {
 | 
					    if ( ieee32 || ieee32big ) {
 | 
				
			||||||
      //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
					      //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
				
			||||||
      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
					      csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 | 
				
			||||||
	(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
 | 
						(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -372,6 +373,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
 | 
					  assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
 | 
				
			||||||
  assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
 | 
					  assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  assert(csum == header.checksum );
 | 
					  assert(csum == header.checksum );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
 | 
					  std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
 | 
				
			||||||
@@ -419,6 +421,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
 | 
				
			|||||||
    std::string file1 = file+"para";
 | 
					    std::string file1 = file+"para";
 | 
				
			||||||
    int offset1 = writeHeader(header,file1);
 | 
					    int offset1 = writeHeader(header,file1);
 | 
				
			||||||
    int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
 | 
					    int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
 | 
				
			||||||
 | 
					    //int csum1=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
 | 
					    std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
 | 
				
			||||||
@@ -429,11 +432,12 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    header.floating_point = std::string("IEEE64BIG");
 | 
					    header.floating_point = std::string("IEEE64BIG");
 | 
				
			||||||
    header.data_type      = std::string("4D_SU3_GAUGE_3X3");
 | 
					    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
 | 
				
			||||||
    NerscSimpleUnmunger<fobj3D,sobj> munge;
 | 
					    NerscSimpleUnmunger<fobj3D,sobj> munge;
 | 
				
			||||||
    BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
 | 
					    BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
 | 
				
			||||||
    offset = writeHeader(header,file);
 | 
					    offset = writeHeader(header,file);
 | 
				
			||||||
    csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 | 
					    //    csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 | 
				
			||||||
 | 
					    csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
 | 
					  std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
 | 
				
			||||||
@@ -507,6 +511,8 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel
 | 
				
			|||||||
  // munger is a function of <floating point, Real, data_type>
 | 
					  // munger is a function of <floating point, Real, data_type>
 | 
				
			||||||
  uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
 | 
					  uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cerr<<" Csum "<< csum << " "<< header.checksum <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  assert(csum == header.checksum );
 | 
					  assert(csum == header.checksum );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
 | 
					  std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -90,7 +90,7 @@ namespace QCD {
 | 
				
			|||||||
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
 | 
					    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
 | 
				
			||||||
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
 | 
					    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 | 
					    template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 | 
				
			||||||
    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 | 
					    template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Spin matrix
 | 
					    // Spin matrix
 | 
				
			||||||
@@ -383,7 +383,6 @@ namespace QCD {
 | 
				
			|||||||
    //////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////
 | 
				
			||||||
    // Poke scalars
 | 
					    // Poke scalars
 | 
				
			||||||
    //////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////
 | 
				
			||||||
 | 
					 | 
				
			||||||
    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
 | 
					    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      pokeIndex<SpinIndex>(lhs,rhs,i);
 | 
					      pokeIndex<SpinIndex>(lhs,rhs,i);
 | 
				
			||||||
@@ -407,6 +406,40 @@ namespace QCD {
 | 
				
			|||||||
      pokeIndex<LorentzIndex>(lhs,rhs,i);
 | 
					      pokeIndex<LorentzIndex>(lhs,rhs,i);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    //////////////////////////////////////////////
 | 
				
			||||||
 | 
					    // Fermion <-> propagator assignments
 | 
				
			||||||
 | 
					    //////////////////////////////////////////////
 | 
				
			||||||
 | 
					    template <class Prop, class Ferm>
 | 
				
			||||||
 | 
					    void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        for(int j = 0; j < Ns; ++j)
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            auto pjs = peekSpin(p, j, s);
 | 
				
			||||||
 | 
					            auto fj  = peekSpin(f, j);
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            for(int i = 0; i < Nc; ++i)
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                pokeColour(pjs, peekColour(fj, i), i, c);
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            pokeSpin(p, pjs, j, s);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    template <class Prop, class Ferm>
 | 
				
			||||||
 | 
					    void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        for(int j = 0; j < Ns; ++j)
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            auto pjs = peekSpin(p, j, s);
 | 
				
			||||||
 | 
					            auto fj  = peekSpin(f, j);
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            for(int i = 0; i < Nc; ++i)
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                pokeColour(fj, peekColour(pjs, i, c), i);
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            pokeSpin(f, fj, j);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    //////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////
 | 
				
			||||||
    // transpose array and scalar
 | 
					    // transpose array and scalar
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -109,10 +109,12 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#define FermOpTemplateInstantiate(A) \
 | 
					#define FermOpTemplateInstantiate(A) \
 | 
				
			||||||
  template class A<WilsonImplF>;		\
 | 
					  template class A<WilsonImplF>;		\
 | 
				
			||||||
  template class A<WilsonImplD>;  \
 | 
					  template class A<WilsonImplD>;		\
 | 
				
			||||||
  template class A<GparityWilsonImplF>;		\
 | 
					  template class A<GparityWilsonImplF>;		\
 | 
				
			||||||
  template class A<GparityWilsonImplD>;		
 | 
					  template class A<GparityWilsonImplD>;		
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define GparityFermOpTemplateInstantiate(A) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
////////////////////////////////////////////
 | 
					////////////////////////////////////////////
 | 
				
			||||||
// Fermion operators / actions
 | 
					// Fermion operators / actions
 | 
				
			||||||
////////////////////////////////////////////
 | 
					////////////////////////////////////////////
 | 
				
			||||||
@@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 | 
				
			|||||||
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
					typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
					typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 | 
				
			||||||
 | 
					typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 | 
				
			||||||
 | 
					typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
 | 
				
			||||||
 | 
					typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 | 
				
			||||||
 | 
					typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 | 
				
			||||||
 | 
					typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  }}
 | 
					  }}
 | 
				
			||||||
///////////////////////////////////////////////////////////////////////////////
 | 
					///////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 | 
					// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -527,6 +527,7 @@ namespace QCD {
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  FermOpTemplateInstantiate(CayleyFermion5D);
 | 
					  FermOpTemplateInstantiate(CayleyFermion5D);
 | 
				
			||||||
 | 
					  GparityFermOpTemplateInstantiate(CayleyFermion5D);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -130,7 +130,7 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
					      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
				
			||||||
      typedef WilsonImplParams ImplParams;
 | 
					      typedef WilsonImplParams ImplParams;
 | 
				
			||||||
      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 | 
					      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      ImplParams Params;
 | 
					      ImplParams Params;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -142,6 +142,10 @@ namespace Grid {
 | 
				
			|||||||
        mult(&phi(),&U(mu),&chi());
 | 
					        mult(&phi(),&U(mu),&chi());
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      template<class ref>
 | 
				
			||||||
 | 
					      inline void loadLinkElement(Simd & reg,ref &memory){
 | 
				
			||||||
 | 
						reg = memory;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
					      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
        conformable(Uds._grid,GaugeGrid);
 | 
					        conformable(Uds._grid,GaugeGrid);
 | 
				
			||||||
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ///////
 | 
				
			||||||
 | 
					    // Single flavour four spinors with colour index, 5d redblack
 | 
				
			||||||
 | 
					    ///////
 | 
				
			||||||
 | 
					    template<class S,int Nrepresentation=Nc>
 | 
				
			||||||
 | 
					    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
 | 
				
			||||||
 | 
					    public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      INHERIT_GIMPL_TYPES(Gimpl);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
				
			||||||
 | 
					      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
 | 
				
			||||||
 | 
					      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
 | 
				
			||||||
 | 
					      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
 | 
				
			||||||
 | 
					      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					      typedef iImplSpinor    <Simd>           SiteSpinor;
 | 
				
			||||||
 | 
					      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
 | 
				
			||||||
 | 
					      typedef Lattice<SiteSpinor>             FermionField;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Make the doubled gauge field a *scalar*
 | 
				
			||||||
 | 
					      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
 | 
				
			||||||
 | 
					      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
 | 
				
			||||||
 | 
					      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
				
			||||||
 | 
					      typedef WilsonImplParams ImplParams;
 | 
				
			||||||
 | 
					      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      ImplParams Params;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      bool overlapCommsCompute(void) { return false; };
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					      template<class ref>
 | 
				
			||||||
 | 
					      inline void loadLinkElement(Simd & reg,ref &memory){
 | 
				
			||||||
 | 
						vsplat(reg,memory);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
						SiteGaugeLink UU;
 | 
				
			||||||
 | 
						for(int i=0;i<Nrepresentation;i++){
 | 
				
			||||||
 | 
						  for(int j=0;j<Nrepresentation;j++){
 | 
				
			||||||
 | 
						    vsplat(UU()()(i,j),U(mu)()(i,j));
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					        mult(&phi(),&UU(),&chi());
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
						SiteScalarGaugeField  ScalarUmu;
 | 
				
			||||||
 | 
						SiteDoubledGaugeField ScalarUds;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        GaugeLinkField U   (Umu._grid);
 | 
				
			||||||
 | 
						GaugeField     Uadj(Umu._grid);
 | 
				
			||||||
 | 
					        for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					  	  U = PeekIndex<LorentzIndex>(Umu,mu);
 | 
				
			||||||
 | 
						  U = adj(Cshift(U,mu,-1));
 | 
				
			||||||
 | 
						  PokeIndex<LorentzIndex>(Uadj,U,mu);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
 | 
				
			||||||
 | 
						  std::vector<int> lcoor;
 | 
				
			||||||
 | 
						  GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  peekLocalSite(ScalarUmu,Umu,lcoor);
 | 
				
			||||||
 | 
						  for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  peekLocalSite(ScalarUmu,Uadj,lcoor);
 | 
				
			||||||
 | 
						  for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  pokeLocalSite(ScalarUds,Uds,lcoor);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 | 
				
			||||||
 | 
						assert(0);
 | 
				
			||||||
 | 
					      }   
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
 | 
				
			||||||
 | 
						assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
					    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -205,7 +303,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
					      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
					      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
				
			||||||
      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 | 
					      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      typedef GparityWilsonImplParams ImplParams;
 | 
					      typedef GparityWilsonImplParams ImplParams;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
	conformable(Uds._grid,GaugeGrid);
 | 
						conformable(Uds._grid,GaugeGrid);
 | 
				
			||||||
	conformable(Umu._grid,GaugeGrid);
 | 
						conformable(Umu._grid,GaugeGrid);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	GaugeLinkField Utmp(GaugeGrid);
 | 
						GaugeLinkField Utmp (GaugeGrid);
 | 
				
			||||||
	GaugeLinkField U(GaugeGrid);
 | 
						GaugeLinkField U    (GaugeGrid);
 | 
				
			||||||
	GaugeLinkField Uconj(GaugeGrid);
 | 
						GaugeLinkField Uconj(GaugeGrid);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	Lattice<iScalar<vInteger> > coor(GaugeGrid);
 | 
						Lattice<iScalar<vInteger> > coor(GaugeGrid);
 | 
				
			||||||
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 | 
					    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 | 
				
			||||||
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 | 
					    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
 | 
				
			||||||
 | 
					    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
 | 
				
			||||||
 | 
					    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
 | 
					    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
 | 
				
			||||||
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
 | 
					    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
 | 
				
			||||||
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
 | 
					    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -48,14 +48,16 @@ namespace Grid {
 | 
				
			|||||||
			GridCartesian         &FourDimGrid,
 | 
								GridCartesian         &FourDimGrid,
 | 
				
			||||||
			GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
								GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
				
			||||||
			RealD _mass,RealD _M5,
 | 
								RealD _mass,RealD _M5,
 | 
				
			||||||
			RealD scale) :
 | 
					//			RealD scale):
 | 
				
			||||||
 | 
								RealD scale,const ImplParams &p= ImplParams()) :
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
 | 
					      // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
 | 
				
			||||||
      MobiusFermion<Impl>(_Umu,
 | 
					      MobiusFermion<Impl>(_Umu,
 | 
				
			||||||
		    FiveDimGrid,
 | 
							    FiveDimGrid,
 | 
				
			||||||
		    FiveDimRedBlackGrid,
 | 
							    FiveDimRedBlackGrid,
 | 
				
			||||||
		    FourDimGrid,
 | 
							    FourDimGrid,
 | 
				
			||||||
		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
 | 
						FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
 | 
				
			||||||
 | 
					//		    FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -48,12 +48,7 @@ namespace QCD {
 | 
				
			|||||||
      mu=p;
 | 
					      mu=p;
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
 | 
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
 | 
				
			||||||
      return spinproject(in);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    SiteHalfSpinor spinproject(const SiteSpinor &in)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      SiteHalfSpinor ret;
 | 
					      SiteHalfSpinor ret;
 | 
				
			||||||
      int mudag=mu;
 | 
					      int mudag=mu;
 | 
				
			||||||
      if (!dag) {
 | 
					      if (!dag) {
 | 
				
			||||||
@@ -92,6 +87,173 @@ namespace QCD {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /////////////////////////
 | 
				
			||||||
 | 
					  // optimised versions
 | 
				
			||||||
 | 
					  /////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjXp to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonXpCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjXp(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjYp to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonYpCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjYp(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjZp to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonZpCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjZp(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjTp to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonTpCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjTp(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjXm to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonXmCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjXm(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjYm to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonYmCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjYm(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjZm to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonZmCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjZm(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					  // Stateless functor: applies spProjTm to a SiteSpinor and returns the
					  // resulting SiteHalfSpinor.
					  template<class SiteHalfSpinor,class SiteSpinor>
					  class WilsonTmCompressor {
					  public:
					    inline SiteHalfSpinor operator () (const SiteSpinor &in) {
					      SiteHalfSpinor projected;
					      spProjTm(projected,in);
					      return projected;
					    }
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Fast comms buffer manipulation which should inline right through (avoid direction
 | 
				
			||||||
 | 
					    // dependent logic that prevents inlining)
 | 
				
			||||||
 | 
					  template<class vobj,class cobj>
 | 
				
			||||||
 | 
					  class WilsonStencil : public CartesianStencil<vobj,cobj> {
 | 
				
			||||||
 | 
					  public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    WilsonStencil(GridBase *grid,
 | 
				
			||||||
 | 
							int npoints,
 | 
				
			||||||
 | 
							int checkerboard,
 | 
				
			||||||
 | 
							const std::vector<int> &directions,
 | 
				
			||||||
 | 
							const std::vector<int> &distances)  : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) 
 | 
				
			||||||
 | 
					      {    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template < class compressor>
 | 
				
			||||||
 | 
					    std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
 | 
				
			||||||
 | 
					      this->Mergers.resize(0); 
 | 
				
			||||||
 | 
					      this->Packets.resize(0);
 | 
				
			||||||
 | 
					      this->HaloGatherOpt(source,compress);
 | 
				
			||||||
 | 
					      return std::thread([&] { this->Communicate(); });
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template < class compressor>
 | 
				
			||||||
 | 
					    void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      auto thr = this->HaloExchangeOptBegin(source,compress);
 | 
				
			||||||
 | 
					      this->HaloExchangeOptComplete(thr);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    void HaloExchangeOptComplete(std::thread &thr) 
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
						this->CommsMerge(); // spins
 | 
				
			||||||
 | 
						this->jointime-=usecond();
 | 
				
			||||||
 | 
						thr.join();
 | 
				
			||||||
 | 
						this->jointime+=usecond();
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template < class compressor>
 | 
				
			||||||
 | 
					    void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
						// conformable(source._grid,_grid);
 | 
				
			||||||
 | 
						assert(source._grid==this->_grid);
 | 
				
			||||||
 | 
						this->halogtime-=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						assert (this->comm_buf.size() == this->_unified_buffer_size );
 | 
				
			||||||
 | 
						this->u_comm_offset=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int dag = compress.dag;
 | 
				
			||||||
 | 
						static std::vector<int> dirs(Nd*2);
 | 
				
			||||||
 | 
						for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
						  if ( dag ) {
 | 
				
			||||||
 | 
						    dirs[mu]  =mu;
 | 
				
			||||||
 | 
						    dirs[mu+4]=mu+Nd;
 | 
				
			||||||
 | 
						  } else { 
 | 
				
			||||||
 | 
						    dirs[mu]  =mu+Nd;
 | 
				
			||||||
 | 
						    dirs[mu+Nd]=mu;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonXpCompressor<cobj,vobj> XpCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,XpCompress,dirs[0]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonYpCompressor<cobj,vobj> YpCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,YpCompress,dirs[1]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonZpCompressor<cobj,vobj> ZpCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,ZpCompress,dirs[2]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonTpCompressor<cobj,vobj> TpCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,TpCompress,dirs[3]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonXmCompressor<cobj,vobj> XmCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,XmCompress,dirs[4]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonYmCompressor<cobj,vobj> YmCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,YmCompress,dirs[5]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonZmCompressor<cobj,vobj> ZmCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,ZmCompress,dirs[6]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WilsonTmCompressor<cobj,vobj> TmCompress;
 | 
				
			||||||
 | 
						this->HaloGatherDir(source,TmCompress,dirs[7]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						assert(this->u_comm_offset==this->_unified_buffer_size);
 | 
				
			||||||
 | 
						this->halogtime+=usecond();
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}} // namespace close
 | 
					}} // namespace close
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -64,7 +64,9 @@ namespace QCD {
 | 
				
			|||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
					  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
 | 
					    GaugeField HUmu(_Umu._grid);
 | 
				
			||||||
 | 
					    HUmu = _Umu*(-0.5);
 | 
				
			||||||
 | 
					    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
 | 
				
			||||||
    pickCheckerboard(Even,UmuEven,Umu);
 | 
					    pickCheckerboard(Even,UmuEven,Umu);
 | 
				
			||||||
    pickCheckerboard(Odd ,UmuOdd,Umu);
 | 
					    pickCheckerboard(Odd ,UmuOdd,Umu);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -286,121 +288,27 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
					  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
				
			||||||
					 const FermionField &in, FermionField &out,int dag) 
 | 
										 const FermionField &in, FermionField &out,int dag) 
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    if ( Impl::overlapCommsCompute () ) { 
 | 
					 | 
				
			||||||
      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
      DhopInternalCommsThenCompute(st,U,in,out,dag);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  template<class Impl>
 | 
					 | 
				
			||||||
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
					 | 
				
			||||||
							 const FermionField &in, FermionField &out,int dag) {
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Compressor compressor(dag);
 | 
					    Compressor compressor(dag);
 | 
				
			||||||
    st.HaloExchange(in,compressor);
 | 
					    st.HaloExchange(in,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    if ( dag == DaggerYes ) {
 | 
					    if ( dag == DaggerYes ) {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
 | 
						Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
 | 
						Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
  template<class Impl>
 | 
					 | 
				
			||||||
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
					 | 
				
			||||||
						     const FermionField &in, FermionField &out,int dag) {
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Compressor compressor(dag);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    auto handle = st.HaloExchangeBegin(in,compressor);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    bool local    = true;
 | 
					 | 
				
			||||||
    bool nonlocal = false;
 | 
					 | 
				
			||||||
    if ( dag == DaggerYes ) {
 | 
					 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    st.HaloExchangeComplete(handle);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    local    = false;
 | 
					 | 
				
			||||||
    nonlocal = true;
 | 
					 | 
				
			||||||
    if ( dag == DaggerYes ) {
 | 
					 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
        for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 
 | 
					 | 
				
			||||||
  FermOpTemplateInstantiate(WilsonFermion);
 | 
					  FermOpTemplateInstantiate(WilsonFermion);
 | 
				
			||||||
 | 
					  GparityFermOpTemplateInstantiate(WilsonFermion);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -114,12 +114,6 @@ namespace Grid {
 | 
				
			|||||||
      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
					      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
				
			||||||
			const FermionField &in, FermionField &out,int dag) ;
 | 
								const FermionField &in, FermionField &out,int dag) ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
					 | 
				
			||||||
				    const FermionField &in, FermionField &out,int dag) ;
 | 
					 | 
				
			||||||
      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
					 | 
				
			||||||
				    const FermionField &in, FermionField &out,int dag) ;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      // Constructor
 | 
					      // Constructor
 | 
				
			||||||
      WilsonFermion(GaugeField &_Umu,
 | 
					      WilsonFermion(GaugeField &_Umu,
 | 
				
			||||||
		    GridCartesian         &Fgrid,
 | 
							    GridCartesian         &Fgrid,
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,4 +1,4 @@
 | 
				
			|||||||
    /*************************************************************************************
 | 
					/*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -38,8 +38,6 @@ namespace QCD {
 | 
				
			|||||||
// S-direction is INNERMOST and takes no part in the parity.
 | 
					// S-direction is INNERMOST and takes no part in the parity.
 | 
				
			||||||
const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 | 
					const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 | 
				
			||||||
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
 | 
					const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
 | 
				
			||||||
int WilsonFermion5DStatic::HandOptDslash;
 | 
					 | 
				
			||||||
int WilsonFermion5DStatic::AsmOptDslash;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // 5d lattice for DWF.
 | 
					  // 5d lattice for DWF.
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
@@ -67,10 +65,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
				
			|||||||
  // some assertions
 | 
					  // some assertions
 | 
				
			||||||
  assert(FiveDimGrid._ndimension==5);
 | 
					  assert(FiveDimGrid._ndimension==5);
 | 
				
			||||||
  assert(FourDimGrid._ndimension==4);
 | 
					  assert(FourDimGrid._ndimension==4);
 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
					  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
				
			||||||
  assert(FourDimRedBlackGrid._ndimension==4);
 | 
					  assert(FourDimRedBlackGrid._ndimension==4);
 | 
				
			||||||
 | 
					 | 
				
			||||||
  assert(FiveDimRedBlackGrid._checker_dim==1);
 | 
					  assert(FiveDimRedBlackGrid._checker_dim==1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Dimension zero of the five-d is the Ls direction
 | 
					  // Dimension zero of the five-d is the Ls direction
 | 
				
			||||||
@@ -99,16 +95,74 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  // Allocate the required comms buffer
 | 
					  // Allocate the required comms buffer
 | 
				
			||||||
  ImportGauge(_Umu);
 | 
					  ImportGauge(_Umu);
 | 
				
			||||||
  alltime=0;
 | 
					 | 
				
			||||||
  commtime=0;
 | 
					 | 
				
			||||||
  jointime=0;
 | 
					 | 
				
			||||||
  dslashtime=0;
 | 
					 | 
				
			||||||
  dslash1time=0;
 | 
					 | 
				
			||||||
}  
 | 
					}  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Impl>
 | 
				
			||||||
 | 
					WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 | 
				
			||||||
 | 
									       GridCartesian         &FiveDimGrid,
 | 
				
			||||||
 | 
									       GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
				
			||||||
 | 
									       GridCartesian         &FourDimGrid,
 | 
				
			||||||
 | 
									       RealD _M5,const ImplParams &p) :
 | 
				
			||||||
 | 
					  Kernels(p),
 | 
				
			||||||
 | 
					  _FiveDimGrid        (&FiveDimGrid),
 | 
				
			||||||
 | 
					  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
 | 
				
			||||||
 | 
					  _FourDimGrid        (&FourDimGrid),
 | 
				
			||||||
 | 
					  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
 | 
				
			||||||
 | 
					  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
 | 
				
			||||||
 | 
					  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
 | 
				
			||||||
 | 
					  M5(_M5),
 | 
				
			||||||
 | 
					  Umu(_FourDimGrid),
 | 
				
			||||||
 | 
					  UmuEven(_FourDimGrid),
 | 
				
			||||||
 | 
					  UmuOdd (_FourDimGrid),
 | 
				
			||||||
 | 
					  Lebesgue(_FourDimGrid),
 | 
				
			||||||
 | 
					  LebesgueEvenOdd(_FourDimGrid)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int nsimd = Simd::Nsimd();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // some assertions
 | 
				
			||||||
 | 
					  assert(FiveDimGrid._ndimension==5);
 | 
				
			||||||
 | 
					  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
				
			||||||
 | 
					  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
 | 
				
			||||||
 | 
					  assert(FourDimGrid._ndimension==4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Dimension zero of the five-d is the Ls direction
 | 
				
			||||||
 | 
					  Ls=FiveDimGrid._fdimensions[0];
 | 
				
			||||||
 | 
					  assert(FiveDimGrid._processors[0]         ==1);
 | 
				
			||||||
 | 
					  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
 | 
				
			||||||
 | 
					  assert(FiveDimRedBlackGrid._processors[0] ==1);
 | 
				
			||||||
 | 
					  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Other dimensions must match the decomposition of the four-D fields 
 | 
				
			||||||
 | 
					  for(int d=0;d<4;d++){
 | 
				
			||||||
 | 
					    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
 | 
				
			||||||
 | 
					    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert(FourDimGrid._simd_layout[d]==1); // compare with ==; the previous '=' assigned 1 into the layout and made the check always pass
 | 
				
			||||||
 | 
					    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
 | 
				
			||||||
 | 
					    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
 | 
				
			||||||
 | 
					    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    GaugeField HUmu(_Umu._grid);
 | 
				
			||||||
 | 
					    HUmu = _Umu*(-0.5);
 | 
				
			||||||
 | 
					    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
 | 
				
			||||||
 | 
					    UmuEven=Umu;// Really want a reference.
 | 
				
			||||||
 | 
					    UmuOdd =Umu;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
					void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
 | 
					  GaugeField HUmu(_Umu._grid);
 | 
				
			||||||
 | 
					  HUmu = _Umu*(-0.5);
 | 
				
			||||||
 | 
					  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
 | 
				
			||||||
  pickCheckerboard(Even,UmuEven,Umu);
 | 
					  pickCheckerboard(Even,UmuEven,Umu);
 | 
				
			||||||
  pickCheckerboard(Odd ,UmuOdd,Umu);
 | 
					  pickCheckerboard(Odd ,UmuOdd,Umu);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -232,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					 | 
				
			||||||
void WilsonFermion5D<Impl>::Report(void)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Wilson5d      time "<<alltime <<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "HaloBegin     time "<<commtime <<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Dslash        time "<<dslashtime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Dslash1       time "<<dslash1time<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "HaloComplete  time "<<jointime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil all gather      time "<<Stencil.halogtime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil splice   gather time "<<Stencil.splicetime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil spin   simd   "<<Stencil.spintime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil MB/s          "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
 | 
					 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 | 
					void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 | 
				
			||||||
				  const FermionField &A,
 | 
									  const FermionField &A,
 | 
				
			||||||
@@ -277,280 +307,32 @@ template<class Impl>
 | 
				
			|||||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 | 
					void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 | 
				
			||||||
					 DoubledGaugeField & U,
 | 
										 DoubledGaugeField & U,
 | 
				
			||||||
					 const FermionField &in, FermionField &out,int dag)
 | 
										 const FermionField &in, FermionField &out,int dag)
 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  if ( Impl::overlapCommsCompute () ) { 
 | 
					 | 
				
			||||||
    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template<class Impl>
 | 
					 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
					 | 
				
			||||||
					 DoubledGaugeField & U,
 | 
					 | 
				
			||||||
					 const FermionField &in, FermionField &out,int dag)
 | 
					 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
  alltime-=usecond();
 | 
					 | 
				
			||||||
  Compressor compressor(dag);
 | 
					  Compressor compressor(dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
					  int LLs = in._grid->_rdimensions[0];
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  int threads = GridThread::GetThreads();
 | 
					  st.HaloExchange(in,compressor);
 | 
				
			||||||
  int HT      = GridThread::GetHyperThreads();
 | 
					 | 
				
			||||||
  int cores   = GridThread::GetCores();
 | 
					 | 
				
			||||||
  int nwork = U._grid->oSites();
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  commtime -=usecond();
 | 
					 | 
				
			||||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
					 | 
				
			||||||
  st.HaloExchangeComplete(handle);
 | 
					 | 
				
			||||||
  commtime +=usecond();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  jointime -=usecond();
 | 
					 | 
				
			||||||
  jointime +=usecond();
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
					  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
				
			||||||
  // Not loop ordering and data layout.
 | 
					 | 
				
			||||||
  // Designed to create 
 | 
					 | 
				
			||||||
  // - per thread reuse in L1 cache for U
 | 
					 | 
				
			||||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
					 | 
				
			||||||
  dslashtime -=usecond();
 | 
					 | 
				
			||||||
  if ( dag == DaggerYes ) {
 | 
					  if ( dag == DaggerYes ) {
 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					 | 
				
			||||||
#pragma omp parallel for schedule(static)
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
	{
 | 
						int sU=ss;
 | 
				
			||||||
	  int sd;
 | 
						int sF=LLs*sU;
 | 
				
			||||||
	  for(sd=0;sd<Ls;sd++){
 | 
						Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
				
			||||||
	    int sU=ss;
 | 
					 | 
				
			||||||
	    int sF = sd+Ls*sU;
 | 
					 | 
				
			||||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else {
 | 
					  } else {
 | 
				
			||||||
    if( this->AsmOptDslash ) {
 | 
					 | 
				
			||||||
      //      for(int i=0;i<1;i++){
 | 
					 | 
				
			||||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
					 | 
				
			||||||
      //	PerformanceCounter Counter(i);
 | 
					 | 
				
			||||||
      //	Counter.Start();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#pragma omp parallel for 
 | 
					 | 
				
			||||||
      for(int t=0;t<threads;t++){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int hyperthread = t%HT;
 | 
					 | 
				
			||||||
	int core        = t/HT;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
					 | 
				
			||||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	for(int ss=0;ss<sswork;ss++){
 | 
					 | 
				
			||||||
	  for(int s=soff;s<soff+swork;s++){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	    sU=ss+ ssoff;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
					 | 
				
			||||||
	      sU = lo.Reorder(sU);
 | 
					 | 
				
			||||||
	    }
 | 
					 | 
				
			||||||
	    sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      //      Counter.Stop();
 | 
					 | 
				
			||||||
      //      Counter.Report();
 | 
					 | 
				
			||||||
      //      }
 | 
					 | 
				
			||||||
    } else if( this->HandOptDslash ) {
 | 
					 | 
				
			||||||
      /*
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#pragma omp parallel for schedule(static)
 | 
					 | 
				
			||||||
      for(int t=0;t<threads;t++){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int hyperthread = t%HT;
 | 
					 | 
				
			||||||
	int core        = t/HT;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
					 | 
				
			||||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	for(int ss=0;ss<sswork;ss++){
 | 
					 | 
				
			||||||
	  sU=ss+ ssoff;
 | 
					 | 
				
			||||||
	  for(int s=soff;s<soff+swork;s++){
 | 
					 | 
				
			||||||
	    sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#pragma omp parallel for schedule(static)
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
	int sU=ss;
 | 
					      int sU=ss;
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					      int sF=LLs*sU;
 | 
				
			||||||
	  int sF = s+Ls*sU; 
 | 
					      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  dslashtime +=usecond();
 | 
					 | 
				
			||||||
  alltime+=usecond();
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
					 | 
				
			||||||
						     DoubledGaugeField & U,
 | 
					 | 
				
			||||||
						     const FermionField &in, FermionField &out,int dag)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					 | 
				
			||||||
  alltime-=usecond();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  int calls;
 | 
					 | 
				
			||||||
  int updates;
 | 
					 | 
				
			||||||
  Compressor compressor(dag);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  int threads = GridThread::GetThreads();
 | 
					 | 
				
			||||||
  int HT      = GridThread::GetHyperThreads();
 | 
					 | 
				
			||||||
  int cores   = GridThread::GetCores();
 | 
					 | 
				
			||||||
  int nwork = U._grid->oSites();
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  commtime -=usecond();
 | 
					 | 
				
			||||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
					 | 
				
			||||||
  commtime +=usecond();
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
					 | 
				
			||||||
  // Not loop ordering and data layout.
 | 
					 | 
				
			||||||
  // Designed to create 
 | 
					 | 
				
			||||||
  // - per thread reuse in L1 cache for U
 | 
					 | 
				
			||||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
					 | 
				
			||||||
  bool local    = true;
 | 
					 | 
				
			||||||
  bool nonlocal = false;
 | 
					 | 
				
			||||||
  dslashtime -=usecond();
 | 
					 | 
				
			||||||
  if ( dag == DaggerYes ) {
 | 
					 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	{
 | 
					 | 
				
			||||||
	  int sd;
 | 
					 | 
				
			||||||
	  for(sd=0;sd<Ls;sd++){
 | 
					 | 
				
			||||||
	    int sU=ss;
 | 
					 | 
				
			||||||
	    int sF = sd+Ls*sU;
 | 
					 | 
				
			||||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU; 
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  dslashtime +=usecond();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  jointime -=usecond();
 | 
					 | 
				
			||||||
  st.HaloExchangeComplete(handle);
 | 
					 | 
				
			||||||
  jointime +=usecond();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  local    = false;
 | 
					 | 
				
			||||||
  nonlocal = true;
 | 
					 | 
				
			||||||
  dslash1time -=usecond();
 | 
					 | 
				
			||||||
  if ( dag == DaggerYes ) {
 | 
					 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	{
 | 
					 | 
				
			||||||
	  int sd;
 | 
					 | 
				
			||||||
	  for(sd=0;sd<Ls;sd++){
 | 
					 | 
				
			||||||
	    int sU=ss;
 | 
					 | 
				
			||||||
	    int sF = sd+Ls*sU;
 | 
					 | 
				
			||||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
					 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } else { 
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					 | 
				
			||||||
	int sU=ss;
 | 
					 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
					 | 
				
			||||||
	  int sF = s+Ls*sU; 
 | 
					 | 
				
			||||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  dslash1time +=usecond();
 | 
					 | 
				
			||||||
  alltime+=usecond();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
					void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
				
			||||||
@@ -593,6 +375,9 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
FermOpTemplateInstantiate(WilsonFermion5D);
 | 
					FermOpTemplateInstantiate(WilsonFermion5D);
 | 
				
			||||||
 | 
					GparityFermOpTemplateInstantiate(WilsonFermion5D);
 | 
				
			||||||
 | 
					template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
 | 
				
			||||||
 | 
					template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,3 +1,4 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    /*************************************************************************************
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
@@ -48,8 +49,6 @@ namespace Grid {
 | 
				
			|||||||
    class WilsonFermion5DStatic { 
 | 
					    class WilsonFermion5DStatic { 
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
      // S-direction is INNERMOST and takes no part in the parity.
 | 
					      // S-direction is INNERMOST and takes no part in the parity.
 | 
				
			||||||
      static int AsmOptDslash; // these are a temporary hack
 | 
					 | 
				
			||||||
      static int HandOptDslash; // these are a temporary hack
 | 
					 | 
				
			||||||
      static const std::vector<int> directions;
 | 
					      static const std::vector<int> directions;
 | 
				
			||||||
      static const std::vector<int> displacements;
 | 
					      static const std::vector<int> displacements;
 | 
				
			||||||
      const int npoint = 8;
 | 
					      const int npoint = 8;
 | 
				
			||||||
@@ -61,11 +60,7 @@ namespace Grid {
 | 
				
			|||||||
    public:
 | 
					    public:
 | 
				
			||||||
     INHERIT_IMPL_TYPES(Impl);
 | 
					     INHERIT_IMPL_TYPES(Impl);
 | 
				
			||||||
     typedef WilsonKernels<Impl> Kernels;
 | 
					     typedef WilsonKernels<Impl> Kernels;
 | 
				
			||||||
     double alltime;
 | 
					
 | 
				
			||||||
     double jointime;
 | 
					 | 
				
			||||||
     double commtime;
 | 
					 | 
				
			||||||
     double dslashtime;
 | 
					 | 
				
			||||||
     double dslash1time;
 | 
					 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Implement the abstract base
 | 
					      // Implement the abstract base
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -86,6 +81,7 @@ namespace Grid {
 | 
				
			|||||||
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
					      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
				
			||||||
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
					      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
				
			||||||
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
 | 
					      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
 | 
				
			||||||
 | 
					      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // These can be overridden by fancy 5d chiral action
 | 
					      // These can be overridden by fancy 5d chiral action
 | 
				
			||||||
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
 | 
					      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
 | 
				
			||||||
@@ -120,19 +116,6 @@ namespace Grid {
 | 
				
			|||||||
			FermionField &out,
 | 
								FermionField &out,
 | 
				
			||||||
			int dag);
 | 
								int dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void DhopInternalCommsThenCompute(StencilImpl & st,
 | 
					 | 
				
			||||||
			LebesgueOrder &lo,
 | 
					 | 
				
			||||||
			DoubledGaugeField &U,
 | 
					 | 
				
			||||||
			const FermionField &in, 
 | 
					 | 
				
			||||||
			FermionField &out,
 | 
					 | 
				
			||||||
			int dag);
 | 
					 | 
				
			||||||
      void DhopInternalCommsOverlapCompute(StencilImpl & st,
 | 
					 | 
				
			||||||
			LebesgueOrder &lo,
 | 
					 | 
				
			||||||
			DoubledGaugeField &U,
 | 
					 | 
				
			||||||
			const FermionField &in, 
 | 
					 | 
				
			||||||
			FermionField &out,
 | 
					 | 
				
			||||||
			int dag);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      // Constructors
 | 
					      // Constructors
 | 
				
			||||||
      WilsonFermion5D(GaugeField &_Umu,
 | 
					      WilsonFermion5D(GaugeField &_Umu,
 | 
				
			||||||
		      GridCartesian         &FiveDimGrid,
 | 
							      GridCartesian         &FiveDimGrid,
 | 
				
			||||||
@@ -141,14 +124,21 @@ namespace Grid {
 | 
				
			|||||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
							      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
				
			||||||
		      double _M5,const ImplParams &p= ImplParams());
 | 
							      double _M5,const ImplParams &p= ImplParams());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Constructors
 | 
				
			||||||
 | 
					      WilsonFermion5D(int simd, 
 | 
				
			||||||
 | 
							      GaugeField &_Umu,
 | 
				
			||||||
 | 
							      GridCartesian         &FiveDimGrid,
 | 
				
			||||||
 | 
							      GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
				
			||||||
 | 
							      GridCartesian         &FourDimGrid,
 | 
				
			||||||
 | 
							      double _M5,const ImplParams &p= ImplParams());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // DoubleStore
 | 
					      // DoubleStore
 | 
				
			||||||
      void ImportGauge(const GaugeField &_Umu);
 | 
					      void ImportGauge(const GaugeField &_Umu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void Report(void);
 | 
					 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Data members require to support the functionality
 | 
					      // Data members require to support the functionality
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
    protected:
 | 
					    public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Add these to the support from Wilson
 | 
					      // Add these to the support from Wilson
 | 
				
			||||||
      GridBase *_FourDimGrid;
 | 
					      GridBase *_FourDimGrid;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -31,440 +31,410 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
namespace QCD {
 | 
					namespace QCD {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int WilsonKernelsStatic::HandOpt;
 | 
				
			||||||
 | 
					  int WilsonKernelsStatic::AsmOpt;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 | 
					WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Need controls to do interior, exterior, or both
 | 
					template<class Impl> 
 | 
				
			||||||
 | 
					void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
											  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
											  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  if ( AsmOpt ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int site=0;site<Ns;site++) {
 | 
				
			||||||
 | 
					      for(int s=0;s<Ls;s++) {
 | 
				
			||||||
 | 
						if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
 | 
				
			||||||
 | 
						else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
 | 
				
			||||||
 | 
						sF++;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      sU++;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
										   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  // No asm implementation yet.
 | 
				
			||||||
 | 
					  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
				
			||||||
 | 
					  //  else
 | 
				
			||||||
 | 
					  for(int site=0;site<Ns;site++) {
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++) {
 | 
				
			||||||
 | 
					      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
				
			||||||
 | 
					      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
				
			||||||
 | 
					      sF++;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    sU++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // Generic implementation; move to different file?
 | 
				
			||||||
 | 
					  ////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Impl> 
 | 
				
			||||||
 | 
					void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
										   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
										   int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  SiteHalfSpinor  tmp;    
 | 
					  SiteHalfSpinor  tmp;    
 | 
				
			||||||
  SiteHalfSpinor  chi;    
 | 
					  SiteHalfSpinor  chi;    
 | 
				
			||||||
 | 
					  SiteHalfSpinor *chi_p;
 | 
				
			||||||
  SiteHalfSpinor Uchi;
 | 
					  SiteHalfSpinor Uchi;
 | 
				
			||||||
  SiteSpinor result;
 | 
					  SiteSpinor result;
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  int ptype;
 | 
					  int ptype;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int num = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  result=zero;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Xp
 | 
					  // Xp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xp,sF);
 | 
					  SE=st.GetEntry(ptype,Xp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) { 
 | 
					  if (SE->_is_local ) { 
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjXp(tmp,in._odata[SE->_offset]);
 | 
					      spProjXp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjXp(chi,in._odata[SE->_offset]);
 | 
					      spProjXp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  spReconXp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
 | 
					 | 
				
			||||||
    accumReconXp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Yp
 | 
					  // Yp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,sF);
 | 
					  SE=st.GetEntry(ptype,Yp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) { 
 | 
					  if ( SE->_is_local ) { 
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjYp(tmp,in._odata[SE->_offset]);
 | 
					      spProjYp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjYp(chi,in._odata[SE->_offset]);
 | 
					      spProjYp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconYp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
 | 
					 | 
				
			||||||
    accumReconYp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Zp
 | 
					  // Zp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Zp,sF);
 | 
					  SE=st.GetEntry(ptype,Zp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) { 
 | 
					  if ( SE->_is_local ) { 
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjZp(tmp,in._odata[SE->_offset]);
 | 
					      spProjZp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjZp(chi,in._odata[SE->_offset]);
 | 
					      spProjZp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconZp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
 | 
					 | 
				
			||||||
    accumReconZp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Tp
 | 
					  // Tp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Tp,sF);
 | 
					  SE=st.GetEntry(ptype,Tp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjTp(tmp,in._odata[SE->_offset]);
 | 
					      spProjTp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjTp(chi,in._odata[SE->_offset]);
 | 
					      spProjTp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconTp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
 | 
					 | 
				
			||||||
    accumReconTp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Xm
 | 
					  // Xm
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,sF);
 | 
					  SE=st.GetEntry(ptype,Xm,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjXm(tmp,in._odata[SE->_offset]);
 | 
					      spProjXm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjXm(chi,in._odata[SE->_offset]);
 | 
					      spProjXm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconXm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
					 | 
				
			||||||
    accumReconXm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Ym
 | 
					  // Ym
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
					  SE=st.GetEntry(ptype,Ym,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjYm(tmp,in._odata[SE->_offset]);
 | 
					      spProjYm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjYm(chi,in._odata[SE->_offset]);
 | 
					      spProjYm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconYm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
 | 
					 | 
				
			||||||
    accumReconYm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Zm
 | 
					  // Zm
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Zm,sF);
 | 
					  SE=st.GetEntry(ptype,Zm,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjZm(tmp,in._odata[SE->_offset]);
 | 
					      spProjZm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjZm(chi,in._odata[SE->_offset]);
 | 
					      spProjZm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconZm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
 | 
					 | 
				
			||||||
    accumReconZm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Tm
 | 
					  // Tm
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Tm,sF);
 | 
					  SE=st.GetEntry(ptype,Tm,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjTm(tmp,in._odata[SE->_offset]);
 | 
					      spProjTm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      spProjTm(chi,in._odata[SE->_offset]);
 | 
					      spProjTm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconTm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					  vstream(out._odata[sF],result);
 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
 | 
					 | 
				
			||||||
    accumReconTm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    vstream(out._odata[sF],result*(-0.5));
 | 
					 | 
				
			||||||
  } else if ( num ) { 
 | 
					 | 
				
			||||||
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Need controls to do interior, exterior, or both
 | 
					  // Need controls to do interior, exterior, or both
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					   int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
											  int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  SiteHalfSpinor  tmp;    
 | 
					  SiteHalfSpinor  tmp;    
 | 
				
			||||||
  SiteHalfSpinor  chi;    
 | 
					  SiteHalfSpinor  chi;    
 | 
				
			||||||
 | 
					  SiteHalfSpinor *chi_p;    
 | 
				
			||||||
  SiteHalfSpinor Uchi;
 | 
					  SiteHalfSpinor Uchi;
 | 
				
			||||||
  SiteSpinor result;
 | 
					  SiteSpinor result;
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  int ptype;
 | 
					  int ptype;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int num = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  result=zero;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Xp
 | 
					  // Xp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,sF);
 | 
					  SE=st.GetEntry(ptype,Xm,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) { 
 | 
					  if ( SE->_is_local ) { 
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjXp(tmp,in._odata[SE->_offset]);
 | 
					      spProjXp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjXp(chi,in._odata[SE->_offset]);
 | 
					      spProjXp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  spReconXp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
					 | 
				
			||||||
    accumReconXp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Yp
 | 
					  // Yp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
					  SE=st.GetEntry(ptype,Ym,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) { 
 | 
					  if ( SE->_is_local ) { 
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjYp(tmp,in._odata[SE->_offset]);
 | 
					      spProjYp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjYp(chi,in._odata[SE->_offset]);
 | 
					      spProjYp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconYp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
 | 
					 | 
				
			||||||
    accumReconYp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Zp
 | 
					  // Zp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Zm,sF);
 | 
					  SE=st.GetEntry(ptype,Zm,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) { 
 | 
					  if ( SE->_is_local ) { 
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjZp(tmp,in._odata[SE->_offset]);
 | 
					      spProjZp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjZp(chi,in._odata[SE->_offset]);
 | 
					      spProjZp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) { 
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconZp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
 | 
					 | 
				
			||||||
    accumReconZp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Tp
 | 
					  // Tp
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Tm,sF);
 | 
					  SE=st.GetEntry(ptype,Tm,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjTp(tmp,in._odata[SE->_offset]);
 | 
					      spProjTp(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjTp(chi,in._odata[SE->_offset]);
 | 
					      spProjTp(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconTp(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
 | 
					 | 
				
			||||||
    accumReconTp(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Xm
 | 
					  // Xm
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xp,sF);
 | 
					  SE=st.GetEntry(ptype,Xp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjXm(tmp,in._odata[SE->_offset]);
 | 
					      spProjXm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjXm(chi,in._odata[SE->_offset]);
 | 
					      spProjXm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconXm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
 | 
					 | 
				
			||||||
    accumReconXm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Ym
 | 
					  // Ym
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,sF);
 | 
					  SE=st.GetEntry(ptype,Yp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjYm(tmp,in._odata[SE->_offset]);
 | 
					      spProjYm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjYm(chi,in._odata[SE->_offset]);
 | 
					      spProjYm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconYm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
 | 
					 | 
				
			||||||
    accumReconYm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Zm
 | 
					  // Zm
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Zp,sF);
 | 
					  SE=st.GetEntry(ptype,Zp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjZm(tmp,in._odata[SE->_offset]);
 | 
					      spProjZm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      spProjZm(chi,in._odata[SE->_offset]);
 | 
					      spProjZm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconZm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
 | 
					 | 
				
			||||||
    accumReconZm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  // Tm
 | 
					  // Tm
 | 
				
			||||||
  ///////////////////////////
 | 
					  ///////////////////////////
 | 
				
			||||||
  SE=st.GetEntry(ptype,Tp,sF);
 | 
					  SE=st.GetEntry(ptype,Tp,sF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (local && SE->_is_local ) {
 | 
					  if ( SE->_is_local ) {
 | 
				
			||||||
 | 
					    chi_p = &chi;
 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      spProjTm(tmp,in._odata[SE->_offset]);
 | 
					      spProjTm(tmp,in._odata[SE->_offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      spProjTm(chi,in._odata[SE->_offset]);
 | 
					      spProjTm(chi,in._odata[SE->_offset]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    chi_p=&buf[SE->_offset];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( nonlocal && (!SE->_is_local) ) {
 | 
					  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
 | 
				
			||||||
    chi=buf[SE->_offset];
 | 
					  accumReconTm(result,Uchi);
 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
 | 
					  vstream(out._odata[sF],result);
 | 
				
			||||||
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
 | 
					 | 
				
			||||||
    accumReconTm(result,Uchi);
 | 
					 | 
				
			||||||
    num++;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    vstream(out._odata[sF],result*(-0.5));
 | 
					 | 
				
			||||||
  } else if ( num ) { 
 | 
					 | 
				
			||||||
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
@@ -593,19 +563,13 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			|||||||
    spReconTm(result,Uchi);
 | 
					    spReconTm(result,Uchi);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  vstream(out._odata[sF],result*(-0.5));
 | 
					  vstream(out._odata[sF],result);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
 | 
					 | 
				
			||||||
template<class Impl> 
 | 
					 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					 | 
				
			||||||
					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
					 | 
				
			||||||
					      int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  FermOpTemplateInstantiate(WilsonKernels);
 | 
					  FermOpTemplateInstantiate(WilsonKernels);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template class WilsonKernels<DomainWallRedBlack5dImplF>;		
 | 
				
			||||||
 | 
					template class WilsonKernels<DomainWallRedBlack5dImplD>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -38,37 +38,56 @@ namespace Grid {
 | 
				
			|||||||
    // Helper routines that implement Wilson stencil for a single site.
 | 
					    // Helper routines that implement Wilson stencil for a single site.
 | 
				
			||||||
    // Common to both the WilsonFermion and WilsonFermion5D
 | 
					    // Common to both the WilsonFermion and WilsonFermion5D
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					    class WilsonKernelsStatic { 
 | 
				
			||||||
 | 
					    public:
 | 
				
			||||||
 | 
					      // S-direction is INNERMOST and takes no part in the parity.
 | 
				
			||||||
 | 
					      static int AsmOpt;  // these are a temporary hack
 | 
				
			||||||
 | 
					      static int HandOpt; // these are a temporary hack
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    template<class Impl> class WilsonKernels : public FermionOperator<Impl> { 
 | 
					    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     INHERIT_IMPL_TYPES(Impl);
 | 
					     INHERIT_IMPL_TYPES(Impl);
 | 
				
			||||||
     typedef FermionOperator<Impl> Base;
 | 
					     typedef FermionOperator<Impl> Base;
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			   int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
								   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
 | 
								      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 | 
								  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    private:
 | 
				
			||||||
 | 
					     // Specialised variants
 | 
				
			||||||
 | 
					     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
								   int sF,int sU, const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
								      int sF,int sU,const FermionField &in,FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
								      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					
 | 
				
			||||||
 | 
					     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
								      int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
				 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
									 int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					    public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     WilsonKernels(const ImplParams &p= ImplParams());
 | 
					     WilsonKernels(const ImplParams &p= ImplParams());
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -2,6 +2,8 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
 | 
					    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Copyright (C) 2015
 | 
					    Copyright (C) 2015
 | 
				
			||||||
@@ -26,320 +28,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
    *************************************************************************************/
 | 
					    *************************************************************************************/
 | 
				
			||||||
    /*  END LEGAL */
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <Grid.h>
 | 
					#include <Grid.h>
 | 
				
			||||||
#if defined(AVX512) || defined (IMCI)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <simd/Avx512Asm.h>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#undef VLOAD
 | 
					 | 
				
			||||||
#undef VSTORE
 | 
					 | 
				
			||||||
#undef VMUL
 | 
					 | 
				
			||||||
#undef VMADD
 | 
					 | 
				
			||||||
#undef ZEND
 | 
					 | 
				
			||||||
#undef ZLOAD
 | 
					 | 
				
			||||||
#undef ZMUL
 | 
					 | 
				
			||||||
#undef ZMADD
 | 
					 | 
				
			||||||
#undef VZERO
 | 
					 | 
				
			||||||
#undef VTIMESI
 | 
					 | 
				
			||||||
#undef VTIMESMINUSI
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VZERO(A)                  VZEROf(A)
 | 
					 | 
				
			||||||
#define VMOV(A,B)                 VMOVf(A,B)
 | 
					 | 
				
			||||||
#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
					 | 
				
			||||||
#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VADD(A,B,C)               VADDf(A,B,C)
 | 
					 | 
				
			||||||
#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
					 | 
				
			||||||
#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
 | 
					 | 
				
			||||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VPERM0(A,B)               VPERM0f(A,B)
 | 
					 | 
				
			||||||
#define VPERM1(A,B)               VPERM1f(A,B)
 | 
					 | 
				
			||||||
#define VPERM2(A,B)               VPERM2f(A,B)
 | 
					 | 
				
			||||||
#define VPERM3(A,B)               VPERM3f(A,B)
 | 
					 | 
				
			||||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define ZEND1(A,B,C)               ZEND1f(A,B,C)
 | 
					 | 
				
			||||||
#define ZEND2(A,B,C)               ZEND2f(A,B,C)
 | 
					 | 
				
			||||||
#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 | 
					 | 
				
			||||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
					 | 
				
			||||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
					 | 
				
			||||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
 | 
					 | 
				
			||||||
#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
					 | 
				
			||||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
namespace QCD {
 | 
					namespace QCD {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ///////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // Default to no assembler implementation
 | 
				
			||||||
 | 
					  ///////////////////////////////////////////////////////////
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
 | 
										       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  uint64_t  now;
 | 
					  assert(0);
 | 
				
			||||||
  uint64_t first ;
 | 
					 | 
				
			||||||
  int offset,local,perm, ptype;
 | 
					 | 
				
			||||||
  const SiteHalfSpinor *pbuf = & buf[0];
 | 
					 | 
				
			||||||
  const SiteSpinor   *plocal = & in._odata[0];
 | 
					 | 
				
			||||||
  void *pf;
 | 
					 | 
				
			||||||
  int osites = in._grid->oSites();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  StencilEntry *SE;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  //#define STAMP(i) timers[i] = __rdtsc() ; 
 | 
					 | 
				
			||||||
#define STAMP(i) //timers[i] = __rdtsc() ; 
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  MASK_REGS;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  first = __rdtsc();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if 0
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  LOAD64(%r9,pf);
 | 
					 | 
				
			||||||
  __asm__( 
 | 
					 | 
				
			||||||
	  VPREFETCH(0,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(1,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(2,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(3,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(4,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(5,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(6,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(7,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(8,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(9,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(10,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(11,%r9) );
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Xm
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    XM_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFXM(Xm,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  XM_RECON;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Ym
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    YM_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFYM(Ym,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  YM_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Zm
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    ZM_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFZM(Zm,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  ZM_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Tm
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    TM_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFTM(Tm,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  TM_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Tp
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    TP_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFTP(Tp,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  TP_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Zp
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    ZP_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFZP(Zp,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  ZP_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    YP_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFYP(Yp,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  YP_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Xp
 | 
					 | 
				
			||||||
  perm   = SE->_permute;
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  local  = SE->_is_local;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
  //  PREFETCH_R(A);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( local ) {
 | 
					 | 
				
			||||||
    XP_PROJMEM(&plocal[offset]);
 | 
					 | 
				
			||||||
    if ( perm) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  } else { 
 | 
					 | 
				
			||||||
    LOAD_CHI(&pbuf[offset]);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MULT_2SPIN_DIR_PFXP(Xp,pf);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  XP_RECON_ACCUM;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 debug:
 | 
					 | 
				
			||||||
  SAVE_RESULT(&out._odata[ss]);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template class WilsonKernels<WilsonImplF>;		
 | 
					#if defined(AVX512) 
 | 
				
			||||||
  template class WilsonKernels<WilsonImplD>; 
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ///////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // If we are AVX512 specialise the single precision routine
 | 
				
			||||||
 | 
					  ///////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <simd/Intel512wilson.h>
 | 
				
			||||||
 | 
					#include <simd/Intel512single.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static Vector<vComplexF> signs;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int setupSigns(void ){
 | 
				
			||||||
 | 
					  Vector<vComplexF> bother(2);
 | 
				
			||||||
 | 
					  signs = bother;
 | 
				
			||||||
 | 
					  vrsign(signs[0]);
 | 
				
			||||||
 | 
					  visign(signs[1]);
 | 
				
			||||||
 | 
					  return 1;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					static int signInit = setupSigns();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MAYBEPERM(A,perm) if (perm) { A ; }
 | 
				
			||||||
 | 
					#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<>
 | 
				
			||||||
 | 
					void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
											     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
											     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
				
			||||||
 | 
					#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VMOVIDUP
 | 
				
			||||||
 | 
					#undef VMOVRDUP
 | 
				
			||||||
 | 
					#undef MAYBEPERM
 | 
				
			||||||
 | 
					#undef MULT_2SPIN
 | 
				
			||||||
 | 
					#define MAYBEPERM(A,B) 
 | 
				
			||||||
 | 
					#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 | 
				
			||||||
 | 
					#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 | 
				
			||||||
 | 
					#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 | 
				
			||||||
 | 
					template<>
 | 
				
			||||||
 | 
					void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
													   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
													   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
				
			||||||
 | 
					#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
												      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
				
			||||||
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
												       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
				
			||||||
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
												       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
				
			||||||
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
												       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
				
			||||||
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
												       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
				
			||||||
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
												       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										164
									
								
								lib/qcd/action/fermion/WilsonKernelsAsmBody.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										164
									
								
								lib/qcd/action/fermion/WilsonKernelsAsmBody.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,164 @@
 | 
				
			|||||||
 | 
					{
 | 
				
			||||||
 | 
					  int locala,perma, ptypea;
 | 
				
			||||||
 | 
					  int localb,permb, ptypeb;
 | 
				
			||||||
 | 
					  uint64_t basea, baseb;
 | 
				
			||||||
 | 
					  uint64_t basex;
 | 
				
			||||||
 | 
					  const uint64_t plocal =(uint64_t) & in._odata[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  vComplexF isigns[2] = { signs[0], signs[1] };
 | 
				
			||||||
 | 
					  vComplexF *isigns = &signs[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int site=0;site<Ns;site++) {
 | 
				
			||||||
 | 
					  for(int s=0;s<Ls;s++) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Xp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  int ent=ss*8;// 2*Ndim
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  basex = basea;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);
 | 
				
			||||||
 | 
					    XM_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXP(Xp,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);
 | 
				
			||||||
 | 
					  XM_RECON;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Yp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    YM_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYP(Yp,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Zp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    ZM_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZP(Zp,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Tp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    TM_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTP(Tp,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Xm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    XP_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXM(Xm,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  XP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Ym
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    YP_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYM(Ym,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Zm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    ZP_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZM(Zm,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Tm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = (uint64_t)&out._odata[ss];
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    TP_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTM(Tm,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basex);
 | 
				
			||||||
 | 
					  SAVE_RESULT(&out._odata[ss]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  ss++;
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					  sU++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
    Chi_11 = ref()(1)(1);\
 | 
					    Chi_11 = ref()(1)(1);\
 | 
				
			||||||
    Chi_12 = ref()(1)(2);
 | 
					    Chi_12 = ref()(1)(2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// To splat or not to splat depends on the implementation
 | 
				
			||||||
#define MULT_2SPIN(A)\
 | 
					#define MULT_2SPIN(A)\
 | 
				
			||||||
   auto & ref(U._odata[sU](A));	\
 | 
					   auto & ref(U._odata[sU](A));	\
 | 
				
			||||||
    U_00 = ref()(0,0);\
 | 
					   Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
				
			||||||
    U_10 = ref()(1,0);\
 | 
					   Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
				
			||||||
    U_20 = ref()(2,0);\
 | 
					   Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
				
			||||||
    U_01 = ref()(0,1);\
 | 
					   Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
				
			||||||
    U_11 = ref()(1,1);				\
 | 
					   Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
				
			||||||
    U_21 = ref()(2,1);\
 | 
					   Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
				
			||||||
    UChi_00 = U_00*Chi_00;\
 | 
					    UChi_00 = U_00*Chi_00;\
 | 
				
			||||||
    UChi_10 = U_00*Chi_10;\
 | 
					    UChi_10 = U_00*Chi_10;\
 | 
				
			||||||
    UChi_01 = U_10*Chi_00;\
 | 
					    UChi_01 = U_10*Chi_00;\
 | 
				
			||||||
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
    UChi_11+= U_11*Chi_11;\
 | 
					    UChi_11+= U_11*Chi_11;\
 | 
				
			||||||
    UChi_02+= U_21*Chi_01;\
 | 
					    UChi_02+= U_21*Chi_01;\
 | 
				
			||||||
    UChi_12+= U_21*Chi_11;\
 | 
					    UChi_12+= U_21*Chi_11;\
 | 
				
			||||||
    U_00 = ref()(0,2);\
 | 
					    Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
				
			||||||
    U_10 = ref()(1,2);\
 | 
					    Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
				
			||||||
    U_20 = ref()(2,2);\
 | 
					    Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
				
			||||||
    UChi_00+= U_00*Chi_02;\
 | 
					    UChi_00+= U_00*Chi_02;\
 | 
				
			||||||
    UChi_10+= U_00*Chi_12;\
 | 
					    UChi_10+= U_00*Chi_12;\
 | 
				
			||||||
    UChi_01+= U_10*Chi_02;\
 | 
					    UChi_01+= U_10*Chi_02;\
 | 
				
			||||||
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
    UChi_02+= U_20*Chi_02;\
 | 
					    UChi_02+= U_20*Chi_02;\
 | 
				
			||||||
    UChi_12+= U_20*Chi_12;
 | 
					    UChi_12+= U_20*Chi_12;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define PERMUTE_DIR(dir)			\
 | 
					#define PERMUTE_DIR(dir)			\
 | 
				
			||||||
      permute##dir(Chi_00,Chi_00);\
 | 
					      permute##dir(Chi_00,Chi_00);\
 | 
				
			||||||
      permute##dir(Chi_01,Chi_01);\
 | 
					      permute##dir(Chi_01,Chi_01);\
 | 
				
			||||||
@@ -309,546 +311,10 @@ namespace Grid {
 | 
				
			|||||||
namespace QCD {
 | 
					namespace QCD {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					 | 
				
			||||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					 | 
				
			||||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
					 | 
				
			||||||
						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
					 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_01;
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_02;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_10;
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_11;
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_12;   // 14 left
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_01;
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_02;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_10;
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_11;
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_12;  // 8 left
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
					 | 
				
			||||||
  REGISTER Simd U_10;
 | 
					 | 
				
			||||||
  REGISTER Simd U_20;  
 | 
					 | 
				
			||||||
  REGISTER Simd U_01;
 | 
					 | 
				
			||||||
  REGISTER Simd U_11;
 | 
					 | 
				
			||||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define Chimu_00 Chi_00
 | 
					 | 
				
			||||||
#define Chimu_01 Chi_01
 | 
					 | 
				
			||||||
#define Chimu_02 Chi_02
 | 
					 | 
				
			||||||
#define Chimu_10 Chi_10
 | 
					 | 
				
			||||||
#define Chimu_11 Chi_11
 | 
					 | 
				
			||||||
#define Chimu_12 Chi_12
 | 
					 | 
				
			||||||
#define Chimu_20 UChi_00
 | 
					 | 
				
			||||||
#define Chimu_21 UChi_01
 | 
					 | 
				
			||||||
#define Chimu_22 UChi_02
 | 
					 | 
				
			||||||
#define Chimu_30 UChi_10
 | 
					 | 
				
			||||||
#define Chimu_31 UChi_11
 | 
					 | 
				
			||||||
#define Chimu_32 UChi_12
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  StencilEntry *SE;
 | 
					 | 
				
			||||||
  int offset, ptype;
 | 
					 | 
				
			||||||
  int num = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Xp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    XP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Xp);
 | 
					 | 
				
			||||||
    XP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Yp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    YP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Yp);
 | 
					 | 
				
			||||||
    YP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Zp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    ZP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }  
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Zp);
 | 
					 | 
				
			||||||
    ZP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Tp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    TP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Tp);
 | 
					 | 
				
			||||||
    TP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  // Xm
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    XM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Xm);
 | 
					 | 
				
			||||||
    XM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  // Ym
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    YM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Ym);
 | 
					 | 
				
			||||||
    YM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Zm
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    ZM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Zm);
 | 
					 | 
				
			||||||
    ZM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Tm
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    TM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Tm);
 | 
					 | 
				
			||||||
    TM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  SiteSpinor & ref (out._odata[ss]);
 | 
					 | 
				
			||||||
  if ( Local ) {
 | 
					 | 
				
			||||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(0),result_10*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(1),result_11*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(2),result_12*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(0),result_20*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(1),result_21*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(2),result_22*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					 | 
				
			||||||
    return 1;
 | 
					 | 
				
			||||||
  } else if ( num ) { 
 | 
					 | 
				
			||||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
					 | 
				
			||||||
    return 1;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  return 0;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Hand-written per-site Dhop kernel of WilsonKernels for site `ss`, with the
// stencil legs split into "local" and "non-local" contributions.
//
// For each of the eight stencil directions (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm) the
// stencil entry is fetched from `st`; a leg is processed when
//   - Local    is set and the entry is flagged _is_local, or
//   - Nonlocal is set and the entry is not flagged _is_local.
// Every processed leg bumps `num`; presumably the *_RECON_ACCUM macros add the
// leg's contribution into the twelve result_* variables (macro bodies are not
// visible here -- confirm).
//
// Note that in this function the "p" directions use the *M_PROJ and
// *M_RECON_ACCUM macros while the "m" directions use the *P_ ones.
//
// Output (written to out._odata[ss]):
//   - Local set:              the site is overwritten with result * (-0.5).
//   - Local clear, num != 0:  result * (-0.5) is added to the existing value.
//   - otherwise:              the site is left untouched.
//
// Returns 1 when the output site was written, 0 when it was not.
//
// NOTE(review): U, buf, sU, in and the local `offset` are not referenced
// directly in this body; presumably they are consumed inside the LOAD_CHIMU,
// LOAD_CHI, MULT_2SPIN and PERMUTE_DIR macros, which are defined outside this
// view -- confirm against the macro definitions.
template<class Impl>
 | 
					 | 
				
			||||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					 | 
				
			||||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
					 | 
				
			||||||
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
					 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Twelve result variables, each zeroed before any leg is processed.
  REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
 | 
					 | 
				
			||||||
  REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_01;
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_02;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_10;
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_11;
 | 
					 | 
				
			||||||
  REGISTER Simd Chi_12;   // 14 left
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_01;
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_02;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_10;
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_11;
 | 
					 | 
				
			||||||
  REGISTER Simd UChi_12;  // 8 left
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  REGISTER Simd U_00;  // two rows of U matrix
 | 
					 | 
				
			||||||
  REGISTER Simd U_10;
 | 
					 | 
				
			||||||
  REGISTER Simd U_20;  
 | 
					 | 
				
			||||||
  REGISTER Simd U_01;
 | 
					 | 
				
			||||||
  REGISTER Simd U_11;
 | 
					 | 
				
			||||||
  REGISTER Simd U_21;  // 2 reg left.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// The Chimu_* names are aliases onto the Chi_* and UChi_* variables declared
// above, so no additional storage is declared for them.
#define Chimu_00 Chi_00
 | 
					 | 
				
			||||||
#define Chimu_01 Chi_01
 | 
					 | 
				
			||||||
#define Chimu_02 Chi_02
 | 
					 | 
				
			||||||
#define Chimu_10 Chi_10
 | 
					 | 
				
			||||||
#define Chimu_11 Chi_11
 | 
					 | 
				
			||||||
#define Chimu_12 Chi_12
 | 
					 | 
				
			||||||
#define Chimu_20 UChi_00
 | 
					 | 
				
			||||||
#define Chimu_21 UChi_01
 | 
					 | 
				
			||||||
#define Chimu_22 UChi_02
 | 
					 | 
				
			||||||
#define Chimu_30 UChi_10
 | 
					 | 
				
			||||||
#define Chimu_31 UChi_11
 | 
					 | 
				
			||||||
#define Chimu_32 UChi_12
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  StencilEntry *SE;
 | 
					 | 
				
			||||||
  int offset, ptype;
 | 
					 | 
				
			||||||
  // Number of stencil legs processed for this site; decides the return value
  // on the non-local path.
  int num = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Xp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    XM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Xp);
 | 
					 | 
				
			||||||
    XM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Yp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    YM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Yp);
 | 
					 | 
				
			||||||
    YM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Zp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Zp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    ZM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }  
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Zp);
 | 
					 | 
				
			||||||
    ZM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Tp
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Tp,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    TM_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Tp);
 | 
					 | 
				
			||||||
    TM_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  // Xm
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    XP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Xm);
 | 
					 | 
				
			||||||
    XP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  // Ym
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    YP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Ym);
 | 
					 | 
				
			||||||
    YP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Zm
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Zm,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    ZP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Zm);
 | 
					 | 
				
			||||||
    ZP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Tm
 | 
					 | 
				
			||||||
  SE=st.GetEntry(ptype,Tm,ss);
 | 
					 | 
				
			||||||
  offset = SE->_offset;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if (Local && SE->_is_local ) { 
 | 
					 | 
				
			||||||
    LOAD_CHIMU;
 | 
					 | 
				
			||||||
    TP_PROJ;
 | 
					 | 
				
			||||||
    if ( SE->_permute ) {
 | 
					 | 
				
			||||||
      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					 | 
				
			||||||
    LOAD_CHI;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
 | 
					 | 
				
			||||||
    MULT_2SPIN(Tm);
 | 
					 | 
				
			||||||
    TP_RECON_ACCUM;
 | 
					 | 
				
			||||||
    num++;  
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Write-back: with Local set the site is overwritten unconditionally (this
  // branch wins even if Nonlocal is also set); otherwise the result is added
  // to the existing site value, and only if at least one leg was processed.
  SiteSpinor & ref (out._odata[ss]);
 | 
					 | 
				
			||||||
  if ( Local ) {
 | 
					 | 
				
			||||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(0),result_10*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(1),result_11*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(2),result_12*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(0),result_20*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(1),result_21*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(2),result_22*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					 | 
				
			||||||
    return 1;
 | 
					 | 
				
			||||||
  } else if ( num ) { 
 | 
					 | 
				
			||||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
					 | 
				
			||||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
					 | 
				
			||||||
    return 1;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  // Non-local pass with no matching legs: nothing was written.
  return 0;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  /*
 | 
					 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
										       int ss,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
@@ -1073,89 +539,346 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    SiteSpinor & ref (out._odata[ss]);
 | 
					    SiteSpinor & ref (out._odata[ss]);
 | 
				
			||||||
    vstream(ref()(0)(0),result_00*(-0.5));
 | 
					    vstream(ref()(0)(0),result_00);
 | 
				
			||||||
    vstream(ref()(0)(1),result_01*(-0.5));
 | 
					    vstream(ref()(0)(1),result_01);
 | 
				
			||||||
    vstream(ref()(0)(2),result_02*(-0.5));
 | 
					    vstream(ref()(0)(2),result_02);
 | 
				
			||||||
    vstream(ref()(1)(0),result_10*(-0.5));
 | 
					    vstream(ref()(1)(0),result_10);
 | 
				
			||||||
    vstream(ref()(1)(1),result_11*(-0.5));
 | 
					    vstream(ref()(1)(1),result_11);
 | 
				
			||||||
    vstream(ref()(1)(2),result_12*(-0.5));
 | 
					    vstream(ref()(1)(2),result_12);
 | 
				
			||||||
    vstream(ref()(2)(0),result_20*(-0.5));
 | 
					    vstream(ref()(2)(0),result_20);
 | 
				
			||||||
    vstream(ref()(2)(1),result_21*(-0.5));
 | 
					    vstream(ref()(2)(1),result_21);
 | 
				
			||||||
    vstream(ref()(2)(2),result_22*(-0.5));
 | 
					    vstream(ref()(2)(2),result_22);
 | 
				
			||||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
					    vstream(ref()(3)(0),result_30);
 | 
				
			||||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
					    vstream(ref()(3)(1),result_31);
 | 
				
			||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),result_32);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
*/
 | 
					
 | 
				
			||||||
 | 
					template<class Impl>
 | 
				
			||||||
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
										       int ss,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  //  std::cout << "Hand op Dhop "<<std::endl;
 | 
				
			||||||
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd result_00; // 12 regs on knc
 | 
				
			||||||
 | 
					  REGISTER Simd result_01;
 | 
				
			||||||
 | 
					  REGISTER Simd result_02;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  REGISTER Simd result_10;
 | 
				
			||||||
 | 
					  REGISTER Simd result_11;
 | 
				
			||||||
 | 
					  REGISTER Simd result_12;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd result_20;
 | 
				
			||||||
 | 
					  REGISTER Simd result_21;
 | 
				
			||||||
 | 
					  REGISTER Simd result_22;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd result_30;
 | 
				
			||||||
 | 
					  REGISTER Simd result_31;
 | 
				
			||||||
 | 
					  REGISTER Simd result_32; // 20 left
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_00;    // two spinor; 6 regs
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_01;
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_02;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_10;
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_11;
 | 
				
			||||||
 | 
					  REGISTER Simd Chi_12;   // 14 left
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_00;  // two spinor; 6 regs
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_01;
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_02;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_10;
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_11;
 | 
				
			||||||
 | 
					  REGISTER Simd UChi_12;  // 8 left
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  REGISTER Simd U_00;  // two rows of U matrix
 | 
				
			||||||
 | 
					  REGISTER Simd U_10;
 | 
				
			||||||
 | 
					  REGISTER Simd U_20;  
 | 
				
			||||||
 | 
					  REGISTER Simd U_01;
 | 
				
			||||||
 | 
					  REGISTER Simd U_11;
 | 
				
			||||||
 | 
					  REGISTER Simd U_21;  // 2 reg left.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Chimu_00 Chi_00
 | 
				
			||||||
 | 
					#define Chimu_01 Chi_01
 | 
				
			||||||
 | 
					#define Chimu_02 Chi_02
 | 
				
			||||||
 | 
					#define Chimu_10 Chi_10
 | 
				
			||||||
 | 
					#define Chimu_11 Chi_11
 | 
				
			||||||
 | 
					#define Chimu_12 Chi_12
 | 
				
			||||||
 | 
					#define Chimu_20 UChi_00
 | 
				
			||||||
 | 
					#define Chimu_21 UChi_01
 | 
				
			||||||
 | 
					#define Chimu_22 UChi_02
 | 
				
			||||||
 | 
					#define Chimu_30 UChi_10
 | 
				
			||||||
 | 
					#define Chimu_31 UChi_11
 | 
				
			||||||
 | 
					#define Chimu_32 UChi_12
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  StencilEntry *SE;
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Xp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    XP_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Xp);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  XP_RECON;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Yp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Yp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    YP_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Yp);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Zp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Zp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    ZP_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Zp);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Tp
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Tp,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    TP_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Tp);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Xm
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xm,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    XM_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Xm);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  XM_RECON_ACCUM;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Ym
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Ym,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    YM_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Ym);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Zm
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Zm,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    ZM_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Zm);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Tm
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Tm,ss);
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    LOAD_CHIMU;
 | 
				
			||||||
 | 
					    TM_PROJ;
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN(Tm);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    SiteSpinor & ref (out._odata[ss]);
 | 
				
			||||||
 | 
					    vstream(ref()(0)(0),result_00);
 | 
				
			||||||
 | 
					    vstream(ref()(0)(1),result_01);
 | 
				
			||||||
 | 
					    vstream(ref()(0)(2),result_02);
 | 
				
			||||||
 | 
					    vstream(ref()(1)(0),result_10);
 | 
				
			||||||
 | 
					    vstream(ref()(1)(1),result_11);
 | 
				
			||||||
 | 
					    vstream(ref()(1)(2),result_12);
 | 
				
			||||||
 | 
					    vstream(ref()(2)(0),result_20);
 | 
				
			||||||
 | 
					    vstream(ref()(2)(1),result_21);
 | 
				
			||||||
 | 
					    vstream(ref()(2)(2),result_22);
 | 
				
			||||||
 | 
					    vstream(ref()(3)(0),result_30);
 | 
				
			||||||
 | 
					    vstream(ref()(3)(1),result_31);
 | 
				
			||||||
 | 
					    vstream(ref()(3)(2),result_32);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
  // Specialise Gparity to simple implementation
 | 
					  // Specialise Gparity to simple implementation
 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
												     int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
 | 
					  assert(0);
 | 
				
			||||||
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
 | 
					 | 
				
			||||||
  return 0;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
													int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
					  assert(0);
 | 
				
			||||||
  return 0;
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							     int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
												     int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
					  assert(0);
 | 
				
			||||||
  return 0;
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
													int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
					  assert(0);
 | 
				
			||||||
  return 0;
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					////////////// Wilson ; uses this implementation /////////////////////
 | 
				
			||||||
 | 
					// Need Nc=3 though //
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
 | 
												       int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
 | 
												       int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
 | 
													  int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								  int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
 | 
													  int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								      int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
									 int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 | 
				
			|||||||
#define INHERIT_GIMPL_TYPES(GImpl) \
 | 
					#define INHERIT_GIMPL_TYPES(GImpl) \
 | 
				
			||||||
    typedef typename GImpl::Simd                           Simd;\
 | 
					    typedef typename GImpl::Simd                           Simd;\
 | 
				
			||||||
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
 | 
					    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
 | 
				
			||||||
    typedef typename GImpl::GaugeField               GaugeField;	
 | 
					    typedef typename GImpl::GaugeField               GaugeField;\
 | 
				
			||||||
 | 
					    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
 | 
				
			||||||
 | 
					    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // 
 | 
					    // 
 | 
				
			||||||
    template<class S,int Nrepresentation=Nc>
 | 
					    template<class S,int Nrepresentation=Nc>
 | 
				
			||||||
@@ -62,9 +64,9 @@ template<class Gimpl> class WilsonLoops;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    // Move this elsewhere?
 | 
					    // Move this elsewhere?
 | 
				
			||||||
    static inline void AddGaugeLink(GaugeField& U, GaugeLinkField& W, int mu){  // U[mu] += W 
 | 
					    static inline void AddGaugeLink(GaugeField& U, GaugeLinkField& W, int mu){  // U[mu] += W 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					    PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(auto ss=0;ss<U._grid->oSites();ss++){
 | 
					      for(auto ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
	U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
 | 
						         U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
 | 
				
			||||||
        }  
 | 
					        }  
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -92,13 +92,13 @@ public:
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    // Create integrator, including the smearing policy
 | 
					    // Create integrator, including the smearing policy
 | 
				
			||||||
    // Smearing policy
 | 
					    // Smearing policy
 | 
				
			||||||
    std::cout << GridLogMessage << " Creating the Stout class\n";
 | 
					    std::cout << GridLogDebug << " Creating the Stout class\n";
 | 
				
			||||||
    double rho = 0.1; // smearing parameter
 | 
					    double rho = 0.1; // smearing parameter, now hardcoded
 | 
				
			||||||
    int Nsmear = 1;   // number of smearing levels
 | 
					    int Nsmear = 1;   // number of smearing levels
 | 
				
			||||||
    Smear_Stout<Gimpl> Stout(rho);
 | 
					    Smear_Stout<Gimpl> Stout(rho);
 | 
				
			||||||
    std::cout << GridLogMessage << " Creating the SmearedConfiguration class\n";
 | 
					    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
 | 
				
			||||||
    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
 | 
					    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
 | 
				
			||||||
    std::cout << GridLogMessage << " done\n";
 | 
					    std::cout << GridLogDebug << " done\n";
 | 
				
			||||||
    //////////////
 | 
					    //////////////
 | 
				
			||||||
    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
 | 
					    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
 | 
				
			||||||
    IntegratorParameters MDpar(20);
 | 
					    IntegratorParameters MDpar(20);
 | 
				
			||||||
@@ -116,27 +116,27 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    if ( StartType == HotStart ) {
 | 
					    if ( StartType == HotStart ) {
 | 
				
			||||||
      // Hot start
 | 
					      // Hot start
 | 
				
			||||||
      HMCpar.NoMetropolisUntil =0;
 | 
					      HMCpar.NoMetropolisUntil =10;
 | 
				
			||||||
      HMCpar.MetropolisTest = true;
 | 
					      HMCpar.MetropolisTest = true;
 | 
				
			||||||
      sRNG.SeedFixedIntegers(SerSeed);
 | 
					      sRNG.SeedFixedIntegers(SerSeed);
 | 
				
			||||||
      pRNG.SeedFixedIntegers(ParSeed);
 | 
					      pRNG.SeedFixedIntegers(ParSeed);
 | 
				
			||||||
      SU3::HotConfiguration(pRNG, U);
 | 
					      SU3::HotConfiguration(pRNG, U);
 | 
				
			||||||
    } else if ( StartType == ColdStart ) { 
 | 
					    } else if ( StartType == ColdStart ) { 
 | 
				
			||||||
      // Cold start
 | 
					      // Cold start
 | 
				
			||||||
      HMCpar.NoMetropolisUntil =0;
 | 
					      HMCpar.NoMetropolisUntil =10;
 | 
				
			||||||
      HMCpar.MetropolisTest = true;
 | 
					      HMCpar.MetropolisTest = true;
 | 
				
			||||||
      sRNG.SeedFixedIntegers(SerSeed);
 | 
					      sRNG.SeedFixedIntegers(SerSeed);
 | 
				
			||||||
      pRNG.SeedFixedIntegers(ParSeed);
 | 
					      pRNG.SeedFixedIntegers(ParSeed);
 | 
				
			||||||
      SU3::ColdConfiguration(pRNG, U);
 | 
					      SU3::ColdConfiguration(pRNG, U);
 | 
				
			||||||
    } else if ( StartType == TepidStart ) {       
 | 
					    } else if ( StartType == TepidStart ) {       
 | 
				
			||||||
      // Tepid start
 | 
					      // Tepid start
 | 
				
			||||||
      HMCpar.NoMetropolisUntil =0;
 | 
					      HMCpar.NoMetropolisUntil =10;
 | 
				
			||||||
      HMCpar.MetropolisTest = true;
 | 
					      HMCpar.MetropolisTest = true;
 | 
				
			||||||
      sRNG.SeedFixedIntegers(SerSeed);
 | 
					      sRNG.SeedFixedIntegers(SerSeed);
 | 
				
			||||||
      pRNG.SeedFixedIntegers(ParSeed);
 | 
					      pRNG.SeedFixedIntegers(ParSeed);
 | 
				
			||||||
      SU3::TepidConfiguration(pRNG, U);
 | 
					      SU3::TepidConfiguration(pRNG, U);
 | 
				
			||||||
    } else if ( StartType == CheckpointStart ) { 
 | 
					    } else if ( StartType == CheckpointStart ) { 
 | 
				
			||||||
      HMCpar.NoMetropolisUntil =0;
 | 
					      HMCpar.NoMetropolisUntil =10;
 | 
				
			||||||
      HMCpar.MetropolisTest = true;
 | 
					      HMCpar.MetropolisTest = true;
 | 
				
			||||||
      // CheckpointRestart
 | 
					      // CheckpointRestart
 | 
				
			||||||
      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
 | 
					      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -61,6 +61,31 @@ namespace Grid {
 | 
				
			|||||||
      "         "
 | 
					      "         "
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    SpinMatrix makeGammaProd(const unsigned int i)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      SpinMatrix g;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      g = 1.;
 | 
				
			||||||
 | 
					      if (i & 0x1)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        g = g*Gamma(Gamma::GammaMatrix::GammaX);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      if (i & 0x2)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        g = g*Gamma(Gamma::GammaMatrix::GammaY);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      if (i & 0x4)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        g = g*Gamma(Gamma::GammaMatrix::GammaZ);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      if (i & 0x8)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        g = g*Gamma(Gamma::GammaMatrix::GammaT);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      return g;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    //    void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
 | 
					    //    void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
 | 
				
			||||||
    //      vHalfSpinColourVector hspin;
 | 
					    //      vHalfSpinColourVector hspin;
 | 
				
			||||||
    //      spProjXp(hspin,in);
 | 
					    //      spProjXp(hspin,in);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -83,6 +83,9 @@ namespace QCD {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					    // Make gamma products (Chroma convention)
 | 
				
			||||||
 | 
					    SpinMatrix makeGammaProd(const unsigned int i);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    /* Gx
 | 
					    /* Gx
 | 
				
			||||||
     *  0 0  0  i    
 | 
					     *  0 0  0  i    
 | 
				
			||||||
     *  0 0  i  0    
 | 
					     *  0 0  i  0    
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
 | 
				
			|||||||
    LatticeMatrix Umu(out._grid);
 | 
					    LatticeMatrix Umu(out._grid);
 | 
				
			||||||
    for(int mu=0;mu<Nd;mu++){
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
      LieRandomize(pRNG,Umu,0.01);
 | 
					      LieRandomize(pRNG,Umu,0.01);
 | 
				
			||||||
      pokeLorentz(out,Umu,mu);
 | 
					      PokeIndex<LorentzIndex>(out,Umu,mu);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
 | 
					  static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
 | 
				
			||||||
    LatticeMatrix Umu(out._grid);
 | 
					    LatticeMatrix Umu(out._grid);
 | 
				
			||||||
    Umu=1.0;
 | 
					    Umu=1.0;
 | 
				
			||||||
    for(int mu=0;mu<Nd;mu++){
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
      pokeLorentz(out,Umu,mu);
 | 
					      PokeIndex<LorentzIndex>(out,Umu,mu);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  return new GridRedBlackCartesian(FourDimGrid); 
 | 
					  return new GridRedBlackCartesian(FourDimGrid); 
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  std::vector<int> simd(4,1);
 | 
				
			||||||
 | 
					  return makeFourDimGrid(latt,simd,mpi);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
					GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  int N4=FourDimGrid->_ndimension;
 | 
					  int N4=FourDimGrid->_ndimension;
 | 
				
			||||||
@@ -58,6 +62,7 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
 | 
				
			|||||||
  return new GridCartesian(latt5,simd5,mpi5); 
 | 
					  return new GridCartesian(latt5,simd5,mpi5); 
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
					GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  int N4=FourDimGrid->_ndimension;
 | 
					  int N4=FourDimGrid->_ndimension;
 | 
				
			||||||
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
 | 
				
			|||||||
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 | 
					  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N4=FourDimGrid->_ndimension;
 | 
				
			||||||
 | 
					  int nsimd = FourDimGrid->Nsimd();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> latt5(1,Ls);
 | 
				
			||||||
 | 
					  std::vector<int> simd5(1,nsimd);
 | 
				
			||||||
 | 
					  std::vector<int>  mpi5(1,1);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  for(int d=0;d<N4;d++){
 | 
				
			||||||
 | 
					    latt5.push_back(FourDimGrid->_fdimensions[d]);
 | 
				
			||||||
 | 
					    simd5.push_back(1);
 | 
				
			||||||
 | 
					     mpi5.push_back(FourDimGrid->_processors[d]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return new GridCartesian(latt5,simd5,mpi5); 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N4=FourDimGrid->_ndimension;
 | 
				
			||||||
 | 
					  int nsimd = FourDimGrid->Nsimd();
 | 
				
			||||||
 | 
					  int cbd=0;
 | 
				
			||||||
 | 
					  std::vector<int> latt5(1,Ls);
 | 
				
			||||||
 | 
					  std::vector<int> simd5(1,nsimd);
 | 
				
			||||||
 | 
					  std::vector<int>  mpi5(1,1);
 | 
				
			||||||
 | 
					  std::vector<int>   cb5(1,1);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					  for(int d=0;d<N4;d++){
 | 
				
			||||||
 | 
					    latt5.push_back(FourDimGrid->_fdimensions[d]);
 | 
				
			||||||
 | 
					    simd5.push_back(1);
 | 
				
			||||||
 | 
					     mpi5.push_back(FourDimGrid->_processors[d]);
 | 
				
			||||||
 | 
					      cb5.push_back(1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,9 +35,14 @@ class SpaceTimeGrid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
 | 
					  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
 | 
				
			||||||
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
 | 
					  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
					  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
				
			||||||
  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
					  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
 | 
				
			||||||
 | 
					  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
 | 
				
			||||||
 | 
					  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -52,9 +52,9 @@ namespace Grid {
 | 
				
			|||||||
	// or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
 | 
						// or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
 | 
				
			||||||
	// resolution throughout the usage in this file, and rather defeats the purpose of deriving
 | 
						// resolution throughout the usage in this file, and rather defeats the purpose of deriving
 | 
				
			||||||
	// from Gimpl.
 | 
						// from Gimpl.
 | 
				
			||||||
	plaq= Gimpl::CovShiftBackward(U[mu],mu,
 | 
						plaq = Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
				      Gimpl::CovShiftBackward(U[nu],nu,
 | 
							   Gimpl::CovShiftBackward(U[nu],nu,
 | 
				
			||||||
							      Gimpl::CovShiftForward (U[mu],mu,U[nu])));
 | 
							   Gimpl::CovShiftForward (U[mu],mu,U[nu])));
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
      // trace of directed plaquette oriented in mu,nu plane
 | 
					      // trace of directed plaquette oriented in mu,nu plane
 | 
				
			||||||
@@ -100,16 +100,16 @@ namespace Grid {
 | 
				
			|||||||
      //////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
      // average over all x,y,z,t and over all planes of plaquette
 | 
					      // average over all x,y,z,t and over all planes of plaquette
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
      static RealD avgPlaquette(const GaugeLorentz &Umu){
 | 
						static RealD avgPlaquette(const GaugeLorentz &Umu){
 | 
				
			||||||
 | 
							RealD sumplaq = sumPlaquette(Umu);
 | 
				
			||||||
 | 
							double vol = Umu._grid->gSites();
 | 
				
			||||||
 | 
							double faces = (1.0*Nd*(Nd-1))/2.0;
 | 
				
			||||||
 | 
							return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
				
			||||||
 | 
					   	}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	RealD sumplaq = sumPlaquette(Umu);
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
	
 | 
					      // average over traced single links
 | 
				
			||||||
	double vol = Umu._grid->gSites();
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	double faces = (1.0*Nd*(Nd-1))/2.0;
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      static RealD linkTrace(const GaugeLorentz &Umu){
 | 
					      static RealD linkTrace(const GaugeLorentz &Umu){
 | 
				
			||||||
	std::vector<GaugeMat> U(4,Umu._grid);
 | 
						std::vector<GaugeMat> U(4,Umu._grid);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
@@ -126,47 +126,6 @@ namespace Grid {
 | 
				
			|||||||
	
 | 
						
 | 
				
			||||||
	return p.real()/vol/4.0/3.0;
 | 
						return p.real()/vol/4.0/3.0;
 | 
				
			||||||
      };
 | 
					      };
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      // the sum over all staples on each site
 | 
					 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	GridBase *grid = Umu._grid;
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	std::vector<GaugeMat> U(4,grid);
 | 
					 | 
				
			||||||
	for(int d=0;d<Nd;d++){
 | 
					 | 
				
			||||||
	  U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	staple = zero;
 | 
					 | 
				
			||||||
		
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	for(int nu=0;nu<Nd;nu++){
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  if(nu != mu) {
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    // mu
 | 
					 | 
				
			||||||
	    // ^
 | 
					 | 
				
			||||||
	    // |__>  nu
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    //    __ 
 | 
					 | 
				
			||||||
	    //      |
 | 
					 | 
				
			||||||
	    //    __|
 | 
					 | 
				
			||||||
	    //
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    staple+=Gimpl::ShiftStaple(Gimpl::CovShiftForward (U[nu],nu, 
 | 
					 | 
				
			||||||
							       Gimpl::CovShiftBackward(U[mu],mu,
 | 
					 | 
				
			||||||
										       Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    //  __ 
 | 
					 | 
				
			||||||
	    // |   
 | 
					 | 
				
			||||||
	    // |__ 
 | 
					 | 
				
			||||||
	    //
 | 
					 | 
				
			||||||
	    //
 | 
					 | 
				
			||||||
	    staple+=Gimpl::ShiftStaple(Gimpl::CovShiftBackward(U[nu],nu,		  		  
 | 
					 | 
				
			||||||
							       Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
      // the sum over all staples on each site in direction mu,nu
 | 
					      // the sum over all staples on each site in direction mu,nu
 | 
				
			||||||
@@ -210,6 +169,51 @@ namespace Grid {
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// the sum over all staples on each site
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GridBase *grid = Umu._grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::vector<GaugeMat> U(Nd,grid);
 | 
				
			||||||
 | 
					    for(int d=0;d<Nd;d++){
 | 
				
			||||||
 | 
					      U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    staple = zero;
 | 
				
			||||||
 | 
					    GaugeMat tmp(grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    for(int nu=0;nu<Nd;nu++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      if(nu != mu) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // mu
 | 
				
			||||||
 | 
					      // ^
 | 
				
			||||||
 | 
					      // |__>  nu
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      //    __ 
 | 
				
			||||||
 | 
					      //      |
 | 
				
			||||||
 | 
					      //    __|
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						staple+=Gimpl::ShiftStaple(
 | 
				
			||||||
 | 
						        Gimpl::CovShiftForward (U[nu],nu, 
 | 
				
			||||||
 | 
							Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
 | 
							Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      //  __ 
 | 
				
			||||||
 | 
					      // |   
 | 
				
			||||||
 | 
					      // |__ 
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
					      //
 | 
				
			||||||
 | 
						staple+=Gimpl::ShiftStaple(  
 | 
				
			||||||
 | 
					                Gimpl::CovShiftBackward(U[nu],nu,		  		  
 | 
				
			||||||
 | 
							Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////
 | 
				
			||||||
      // the sum over all staples on each site in direction mu,nu, upper part
 | 
					      // the sum over all staples on each site in direction mu,nu, upper part
 | 
				
			||||||
@@ -247,246 +251,246 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //////////////////////////////////////////////////////
 | 
				
			||||||
      //////////////////////////////////////////////////////
 | 
					  // Similar to above for rectangle is required
 | 
				
			||||||
      // Similar to above for rectangle is required
 | 
					  //////////////////////////////////////////////////////
 | 
				
			||||||
      //////////////////////////////////////////////////////
 | 
					  static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
				
			||||||
      static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
					  {
 | 
				
			||||||
      {
 | 
					    rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
 | 
				
			||||||
	rect =  Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
 | 
						adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
 | 
				
			||||||
	  adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
 | 
					    rect = rect + 
 | 
				
			||||||
	rect = rect + 
 | 
					 | 
				
			||||||
          Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
 | 
					          Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
 | 
				
			||||||
	  adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
 | 
					      adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    GaugeMat sp(U[0]._grid);
 | 
				
			||||||
 | 
					    dirRectangle(sp,U,mu,nu);
 | 
				
			||||||
 | 
					    rect=trace(sp);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    LatticeComplex siteRect(U[0]._grid);
 | 
				
			||||||
 | 
					    Rect=zero;
 | 
				
			||||||
 | 
					    for(int mu=1;mu<Nd;mu++){
 | 
				
			||||||
 | 
					      for(int nu=0;nu<mu;nu++){
 | 
				
			||||||
 | 
						traceDirRectangle(siteRect,U,mu,nu);
 | 
				
			||||||
 | 
						Rect = Rect + siteRect;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
 | 
					    }
 | 
				
			||||||
      {
 | 
					  }
 | 
				
			||||||
	GaugeMat sp(U[0]._grid);
 | 
					 | 
				
			||||||
	dirRectangle(sp,U,mu,nu);
 | 
					 | 
				
			||||||
	rect=trace(sp);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
 | 
					 | 
				
			||||||
      {
 | 
					 | 
				
			||||||
	LatticeComplex siteRect(U[0]._grid);
 | 
					 | 
				
			||||||
	Rect=zero;
 | 
					 | 
				
			||||||
	for(int mu=1;mu<Nd;mu++){
 | 
					 | 
				
			||||||
	  for(int nu=0;nu<mu;nu++){
 | 
					 | 
				
			||||||
	    traceDirRectangle(siteRect,U,mu,nu);
 | 
					 | 
				
			||||||
	    Rect = Rect + siteRect;
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      // sum over all x,y,z,t and over all planes of plaquette
 | 
					 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      static RealD sumRectangle(const GaugeLorentz &Umu){
 | 
					 | 
				
			||||||
	std::vector<GaugeMat> U(4,Umu._grid);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for(int mu=0;mu<Nd;mu++){
 | 
					 //////////////////////////////////////////////////
 | 
				
			||||||
	  U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
					  // sum over all x,y,z,t and over all planes of plaquette
 | 
				
			||||||
	}
 | 
					  //////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  static RealD sumRectangle(const GaugeLorentz &Umu){
 | 
				
			||||||
 | 
					    std::vector<GaugeMat> U(Nd,Umu._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	LatticeComplex Rect(Umu._grid);
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					      U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	siteRectangle(Rect,U);
 | 
					    LatticeComplex Rect(Umu._grid);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
	TComplex Tp = sum(Rect);
 | 
					    siteRectangle(Rect,U);
 | 
				
			||||||
	Complex p  = TensorRemove(Tp);
 | 
					 | 
				
			||||||
	return p.real();
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      // average over all x,y,z,t and over all planes of plaquette
 | 
					 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      static RealD avgRectangle(const GaugeLorentz &Umu){
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
	RealD sumrect = sumRectangle(Umu);
 | 
					    TComplex Tp = sum(Rect);
 | 
				
			||||||
 | 
					    Complex p  = TensorRemove(Tp);
 | 
				
			||||||
 | 
					    return p.real();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  //////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // average over all x,y,z,t and over all planes of plaquette
 | 
				
			||||||
 | 
					  //////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  static RealD avgRectangle(const GaugeLorentz &Umu){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	double vol = Umu._grid->gSites();
 | 
					    RealD sumrect = sumRectangle(Umu);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
	double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
 | 
					    double vol = Umu._grid->gSites();
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
	return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
					    double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					    return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
 | 
				
			||||||
      // the sum over all staples on each site
 | 
					  }
 | 
				
			||||||
      //////////////////////////////////////////////////
 | 
					 | 
				
			||||||
      static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
 | 
					 | 
				
			||||||
	U2 = U * Cshift(U,mu,1);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      ////////////////////////////////////////////////////////////////////////////
 | 
					  //////////////////////////////////////////////////
 | 
				
			||||||
      // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
 | 
					  // the sum over all staples on each site
 | 
				
			||||||
      // but need to track two deep where cross boundary and apply a conjugation).
 | 
					  //////////////////////////////////////////////////
 | 
				
			||||||
      // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
 | 
					  static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
 | 
				
			||||||
      ////////////////////////////////////////////////////////////////////////////
 | 
					    U2 = U * Cshift(U,mu,1);
 | 
				
			||||||
      static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Stap = zero;
 | 
					  ////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // Hop by two optimisation strategy does not work nicely with Gparity. (could do,
 | 
				
			||||||
 | 
					  // but need to track two deep where cross boundary and apply a conjugation).
 | 
				
			||||||
 | 
					  // Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
 | 
				
			||||||
 | 
					  ////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	GridBase *grid = U[0]._grid;
 | 
					    Stap = zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	GaugeMat Staple2x1 (grid);
 | 
					    GridBase *grid = U[0]._grid;
 | 
				
			||||||
	GaugeMat tmp (grid);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for(int nu=0;nu<Nd;nu++){
 | 
					    GaugeMat Staple2x1 (grid);
 | 
				
			||||||
	  if ( nu!=mu) {
 | 
					    GaugeMat tmp (grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    // Up staple    ___ ___ 
 | 
					    for(int nu=0;nu<Nd;nu++){
 | 
				
			||||||
	    //             |       |
 | 
					      if ( nu!=mu) {
 | 
				
			||||||
	    tmp = Cshift(adj(U[nu]),nu,-1); 
 | 
					 | 
				
			||||||
	    tmp = adj(U2[mu])*tmp;
 | 
					 | 
				
			||||||
	    tmp = Cshift(tmp,mu,-2);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
 | 
						// Up staple    ___ ___ 
 | 
				
			||||||
 | 
						//             |       |
 | 
				
			||||||
 | 
						tmp = Cshift(adj(U[nu]),nu,-1); 
 | 
				
			||||||
 | 
						tmp = adj(U2[mu])*tmp;
 | 
				
			||||||
 | 
						tmp = Cshift(tmp,mu,-2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    // Down staple
 | 
						// Down staple
 | 
				
			||||||
	    //             |___ ___|
 | 
						//             |___ ___|
 | 
				
			||||||
	    //
 | 
						//
 | 
				
			||||||
	    tmp = adj(U2[mu])*U[nu];
 | 
						tmp = adj(U2[mu])*U[nu];
 | 
				
			||||||
	    Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
 | 
						Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //              ___ ___
 | 
						//              ___ ___
 | 
				
			||||||
	    //             |    ___|
 | 
						//             |    ___|
 | 
				
			||||||
	    //             |___ ___|
 | 
						//             |___ ___|
 | 
				
			||||||
	    //
 | 
						//
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
 | 
						Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //              ___ ___
 | 
						//              ___ ___
 | 
				
			||||||
	    //             |___    |
 | 
						//             |___    |
 | 
				
			||||||
	    //             |___ ___|
 | 
						//             |___ ___|
 | 
				
			||||||
	    //
 | 
						//
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //	tmp= Staple2x1* Cshift(U[mu],mu,-2);
 | 
						//	tmp= Staple2x1* Cshift(U[mu],mu,-2);
 | 
				
			||||||
	    //	Stap+= Cshift(tmp,mu,1) ;
 | 
						//	Stap+= Cshift(tmp,mu,1) ;
 | 
				
			||||||
	    Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
 | 
						Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1); ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //       --    
 | 
						//       --    
 | 
				
			||||||
	    //      |  |              
 | 
						//      |  |              
 | 
				
			||||||
	    //          
 | 
						//          
 | 
				
			||||||
	    //      |  | 
 | 
						//      |  | 
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	    tmp = Cshift(adj(U2[nu]),nu,-2);
 | 
						tmp = Cshift(adj(U2[nu]),nu,-2);
 | 
				
			||||||
	    tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
 | 
						tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
 | 
				
			||||||
	    tmp = U2[nu]*Cshift(tmp,nu,2);
 | 
						tmp = U2[nu]*Cshift(tmp,nu,2);
 | 
				
			||||||
	    Stap+= Cshift(tmp, mu, 1);
 | 
						Stap+= Cshift(tmp, mu, 1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //      |  |              
 | 
						//      |  |              
 | 
				
			||||||
	    //          
 | 
						//          
 | 
				
			||||||
	    //      |  | 
 | 
						//      |  | 
 | 
				
			||||||
	    //       -- 
 | 
						//       -- 
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	    tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
 | 
						tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
 | 
				
			||||||
	    tmp = adj(U2[nu])*tmp;
 | 
						tmp = adj(U2[nu])*tmp;
 | 
				
			||||||
	    tmp = Cshift(tmp,nu,-2);
 | 
						tmp = Cshift(tmp,nu,-2);
 | 
				
			||||||
	    Stap+=Cshift(tmp, mu, 1);
 | 
						Stap+=Cshift(tmp, mu, 1);
 | 
				
			||||||
	  }}
 | 
					    }}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
 | 
					  static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
 | 
				
			||||||
      {
 | 
					  {
 | 
				
			||||||
	RectStapleUnoptimised(Stap,Umu,mu);
 | 
					    RectStapleUnoptimised(Stap,Umu,mu);
 | 
				
			||||||
      }
 | 
					  }
 | 
				
			||||||
      static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
 | 
					  static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
 | 
				
			||||||
			     std::vector<GaugeMat> &U2,
 | 
								 std::vector<GaugeMat> &U2,
 | 
				
			||||||
			     std::vector<GaugeMat> &U, int mu)
 | 
								 std::vector<GaugeMat> &U, int mu)
 | 
				
			||||||
      {
 | 
					  {
 | 
				
			||||||
	if ( Gimpl::isPeriodicGaugeField() ){ 
 | 
					    if ( Gimpl::isPeriodicGaugeField() ){ 
 | 
				
			||||||
	  RectStapleOptimised(Stap,U2,U,mu);
 | 
					      RectStapleOptimised(Stap,U2,U,mu);
 | 
				
			||||||
	} else {
 | 
					    } else {
 | 
				
			||||||
	  RectStapleUnoptimised(Stap,Umu,mu);
 | 
					      RectStapleUnoptimised(Stap,Umu,mu);
 | 
				
			||||||
	}
 | 
					    }
 | 
				
			||||||
      }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
 | 
					  static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
 | 
				
			||||||
	GridBase *grid = Umu._grid;
 | 
					    GridBase *grid = Umu._grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::vector<GaugeMat> U(4,grid);
 | 
					    std::vector<GaugeMat> U(Nd,grid);
 | 
				
			||||||
	for(int d=0;d<Nd;d++){
 | 
					    for(int d=0;d<Nd;d++){
 | 
				
			||||||
	  U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
					      U[d] = PeekIndex<LorentzIndex>(Umu,d);
 | 
				
			||||||
	}
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Stap=zero;
 | 
					    Stap=zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for(int nu=0;nu<Nd;nu++){
 | 
					    for(int nu=0;nu<Nd;nu++){
 | 
				
			||||||
	  if ( nu!=mu) {
 | 
					      if ( nu!=mu) {
 | 
				
			||||||
	    //           __ ___ 
 | 
					    //           __ ___ 
 | 
				
			||||||
	    //          |    __ |
 | 
					    //          |    __ |
 | 
				
			||||||
	    //
 | 
					    //
 | 
				
			||||||
	    Stap+= Gimpl::ShiftStaple(
 | 
					    Stap+= Gimpl::ShiftStaple(
 | 
				
			||||||
				      Gimpl::CovShiftForward (U[mu],mu,
 | 
							  Gimpl::CovShiftForward (U[mu],mu,
 | 
				
			||||||
							      Gimpl::CovShiftForward (U[nu],nu,
 | 
							  Gimpl::CovShiftForward (U[nu],nu,
 | 
				
			||||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
							  Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
													      Gimpl::CovShiftBackward(U[mu],mu,
 | 
					                  Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
																      Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
							  Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //              __ 
 | 
					    //              __ 
 | 
				
			||||||
	    //          |__ __ |
 | 
					    //          |__ __ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    Stap+= Gimpl::ShiftStaple(
 | 
					    Stap+= Gimpl::ShiftStaple(
 | 
				
			||||||
				      Gimpl::CovShiftForward (U[mu],mu,
 | 
					                  Gimpl::CovShiftForward (U[mu],mu,
 | 
				
			||||||
							      Gimpl::CovShiftBackward(U[nu],nu,
 | 
							  Gimpl::CovShiftBackward(U[nu],nu,
 | 
				
			||||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
							  Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
													      Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
 | 
					                  Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //           __ 
 | 
					    //           __ 
 | 
				
			||||||
	    //          |__ __ |
 | 
					    //          |__ __ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    Stap+= Gimpl::ShiftStaple(
 | 
					    Stap+= Gimpl::ShiftStaple(
 | 
				
			||||||
				      Gimpl::CovShiftBackward(U[nu],nu,
 | 
							  Gimpl::CovShiftBackward(U[nu],nu,
 | 
				
			||||||
							      Gimpl::CovShiftBackward(U[mu],mu,
 | 
							  Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
							  Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
													      Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
 | 
							  Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //           __ ___ 
 | 
					    //           __ ___ 
 | 
				
			||||||
	    //          |__    |
 | 
					    //          |__    |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    Stap+= Gimpl::ShiftStaple(
 | 
					    Stap+= Gimpl::ShiftStaple(
 | 
				
			||||||
				      Gimpl::CovShiftForward (U[nu],nu,
 | 
							   Gimpl::CovShiftForward (U[nu],nu,
 | 
				
			||||||
							      Gimpl::CovShiftBackward(U[mu],mu,
 | 
						           Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
					                   Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
													      Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
 | 
					                   Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //       --    
 | 
					     //       --    
 | 
				
			||||||
	    //      |  |              
 | 
					     //      |  |              
 | 
				
			||||||
	    //          
 | 
					     //          
 | 
				
			||||||
	    //      |  | 
 | 
					     //      |  | 
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
	    Stap+= Gimpl::ShiftStaple(
 | 
					    Stap+= Gimpl::ShiftStaple(
 | 
				
			||||||
				      Gimpl::CovShiftForward(U[nu],nu,
 | 
							   Gimpl::CovShiftForward(U[nu],nu,
 | 
				
			||||||
							     Gimpl::CovShiftForward(U[nu],nu,
 | 
							   Gimpl::CovShiftForward(U[nu],nu,
 | 
				
			||||||
										    Gimpl::CovShiftBackward(U[mu],mu,
 | 
					                   Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
													    Gimpl::CovShiftBackward(U[nu],nu,
 | 
					                   Gimpl::CovShiftBackward(U[nu],nu,
 | 
				
			||||||
																    Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
							   Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    //      |  |              
 | 
					     //      |  |              
 | 
				
			||||||
	    //          
 | 
					     //          
 | 
				
			||||||
	    //      |  | 
 | 
					     //      |  | 
 | 
				
			||||||
	    //       -- 
 | 
					     //       -- 
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
	    Stap+= Gimpl::ShiftStaple(
 | 
					    Stap+= Gimpl::ShiftStaple(
 | 
				
			||||||
				      Gimpl::CovShiftBackward(U[nu],nu,
 | 
							   Gimpl::CovShiftBackward(U[nu],nu,
 | 
				
			||||||
							      Gimpl::CovShiftBackward(U[nu],nu,
 | 
							   Gimpl::CovShiftBackward(U[nu],nu,
 | 
				
			||||||
										      Gimpl::CovShiftBackward(U[mu],mu,
 | 
					                   Gimpl::CovShiftBackward(U[mu],mu,
 | 
				
			||||||
													      Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
 | 
					                   Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
 | 
				
			||||||
	  }}
 | 
					    }}
 | 
				
			||||||
      }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    };
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 | 
					 typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 | 
				
			||||||
    typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 | 
					 typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 | 
				
			||||||
    typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 | 
					 typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 | 
				
			||||||
    typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
 | 
					 typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  }}
 | 
					}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
@@ -32,6 +32,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include <type_traits>
 | 
					#include <type_traits>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					  // Helper: parse whitespace-separated values out of a string.
					  //
					  // Tokens are extracted from s one at a time with operator>> and appended
					  // to the result in order. Parsing stops at end of input or at the first
					  // token that cannot be converted to T, so an empty string, an
					  // all-whitespace string, or trailing whitespace never appends a spurious
					  // element.
					  template <typename T>
					  std::vector<T> strToVec(const std::string s)
					  {
					    std::istringstream sstr(s);
					    T                  buf;
					    std::vector<T>     v;
					    
					    // Test the extraction itself rather than eof(): eof() only becomes true
					    // after a read has already failed, which would push a stale value.
					    while (sstr >> buf)
					    {
					      v.push_back(buf);
					    }
					    
					    return v;
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Stream insertion for std::vector.
					  //
					  // Writes "[", then every element followed by a single space, then "]".
					  // For a non-empty vector a backspace character ("\b") is emitted before
					  // the closing bracket so that the trailing space is overwritten when the
					  // output is shown on a terminal.
					  template < class T >
					  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
					  {
					    os << "[";
					    for (auto it = v.begin(); it != v.end(); ++it)
					    {
					      os << *it << " ";
					    }
					    if (!v.empty())
					    {
					      os << "\b";
					    }
					    os << "]";
					    
					    return os;
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  class Serializable {};
 | 
					  class Serializable {};
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
@@ -138,23 +172,6 @@ namespace Grid {
 | 
				
			|||||||
    r.read(s, output);
 | 
					    r.read(s, output);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  template < class T >
 | 
					 | 
				
			||||||
  inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    os << "[";
 | 
					 | 
				
			||||||
    for (auto &x: v)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      os << x << " ";
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    if (v.size() > 0)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      os << "\b";
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    os << "]";
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    return os;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  // Writer template implementation ////////////////////////////////////////////
 | 
					  // Writer template implementation ////////////////////////////////////////////
 | 
				
			||||||
  template <typename T>
 | 
					  template <typename T>
 | 
				
			||||||
  Writer<T>::Writer(void)
 | 
					  Writer<T>::Writer(void)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -120,7 +120,7 @@ THE SOFTWARE.
 | 
				
			|||||||
  \
 | 
					  \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  template <typename T>\
 | 
					  template <typename T>\
 | 
				
			||||||
  static void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
 | 
					  static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
 | 
				
			||||||
    push(WR,s);\
 | 
					    push(WR,s);\
 | 
				
			||||||
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
 | 
					    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
 | 
				
			||||||
    pop(WR);\
 | 
					    pop(WR);\
 | 
				
			||||||
@@ -128,14 +128,14 @@ THE SOFTWARE.
 | 
				
			|||||||
  \
 | 
					  \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  template <typename T>\
 | 
					  template <typename T>\
 | 
				
			||||||
  static void read(Reader<T> &RD,const std::string &s, cname &obj){	\
 | 
					  static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
 | 
				
			||||||
    push(RD,s);\
 | 
					    push(RD,s);\
 | 
				
			||||||
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
 | 
					    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
 | 
				
			||||||
    pop(RD);\
 | 
					    pop(RD);\
 | 
				
			||||||
  } \
 | 
					  } \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
 | 
					  friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
 | 
				
			||||||
    os<<"class "<<#cname<<" {"<<std::endl;\
 | 
					    os<<"class "<<#cname<<" {"<<std::endl;\
 | 
				
			||||||
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
 | 
					    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
 | 
				
			||||||
      os<<"}";								\
 | 
					      os<<"}";								\
 | 
				
			||||||
@@ -165,7 +165,7 @@ namespace Grid {
 | 
				
			|||||||
  class EnumIO<name> {\
 | 
					  class EnumIO<name> {\
 | 
				
			||||||
    public:\
 | 
					    public:\
 | 
				
			||||||
      template <typename T>\
 | 
					      template <typename T>\
 | 
				
			||||||
      static void write(Writer<T> &WR,const std::string &s, const name &obj){ \
 | 
					      static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
 | 
				
			||||||
        switch (obj) {\
 | 
					        switch (obj) {\
 | 
				
			||||||
          GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
 | 
					          GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
 | 
				
			||||||
          default: Grid::write(WR,s,#undefname); break;\
 | 
					          default: Grid::write(WR,s,#undefname); break;\
 | 
				
			||||||
@@ -173,7 +173,7 @@ namespace Grid {
 | 
				
			|||||||
      }\
 | 
					      }\
 | 
				
			||||||
      \
 | 
					      \
 | 
				
			||||||
      template <typename T>\
 | 
					      template <typename T>\
 | 
				
			||||||
      static void read(Reader<T> &RD,const std::string &s, name &obj){ \
 | 
					      static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
 | 
				
			||||||
        std::string buf;\
 | 
					        std::string buf;\
 | 
				
			||||||
        Grid::read(RD, s, buf);\
 | 
					        Grid::read(RD, s, buf);\
 | 
				
			||||||
        if (buf == #undefname) {obj = name::undefname;}\
 | 
					        if (buf == #undefname) {obj = name::undefname;}\
 | 
				
			||||||
@@ -182,7 +182,7 @@ namespace Grid {
 | 
				
			|||||||
      }\
 | 
					      }\
 | 
				
			||||||
  };\
 | 
					  };\
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  std::ostream & operator << (std::ostream &os, const name &obj ) { \
 | 
					  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
 | 
				
			||||||
    switch (obj) {\
 | 
					    switch (obj) {\
 | 
				
			||||||
        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
 | 
					        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
 | 
				
			||||||
        default: os << #undefname; break;\
 | 
					        default: os << #undefname; break;\
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -80,6 +80,20 @@ void XmlReader::pop(void)
 | 
				
			|||||||
  node_ = node_.parent();
 | 
					  node_ = node_.parent();
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					bool XmlReader::nextElement(const std::string &s)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  if (node_.next_sibling(s.c_str()))
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    node_ = node_.next_sibling(s.c_str());
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    return true;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  else
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    return false;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template <>
 | 
					template <>
 | 
				
			||||||
void XmlReader::readDefault(const string &s, string &output)
 | 
					void XmlReader::readDefault(const string &s, string &output)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -68,6 +68,7 @@ namespace Grid
 | 
				
			|||||||
    virtual ~XmlReader(void) = default;
 | 
					    virtual ~XmlReader(void) = default;
 | 
				
			||||||
    void push(const std::string &s);
 | 
					    void push(const std::string &s);
 | 
				
			||||||
    void pop(void);
 | 
					    void pop(void);
 | 
				
			||||||
 | 
					    bool nextElement(const std::string &s);
 | 
				
			||||||
    template <typename U>
 | 
					    template <typename U>
 | 
				
			||||||
    void readDefault(const std::string &s, U &output);
 | 
					    void readDefault(const std::string &s, U &output);
 | 
				
			||||||
    template <typename U>
 | 
					    template <typename U>
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										1139
									
								
								lib/simd/Avx512Asm.h
									
									
									
									
									
								
							
							
						
						
									
										1139
									
								
								lib/simd/Avx512Asm.h
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -410,22 +410,22 @@ namespace Optimization {
 | 
				
			|||||||
  struct Permute{
 | 
					  struct Permute{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    static inline __m256 Permute0(__m256 in){
 | 
					    static inline __m256 Permute0(__m256 in){
 | 
				
			||||||
      return _mm256_permute2f128_ps(in,in,0x01);
 | 
					      return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m256 Permute1(__m256 in){
 | 
					    static inline __m256 Permute1(__m256 in){
 | 
				
			||||||
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
					      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m256 Permute2(__m256 in){
 | 
					    static inline __m256 Permute2(__m256 in){
 | 
				
			||||||
      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
					      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m256 Permute3(__m256 in){
 | 
					    static inline __m256 Permute3(__m256 in){
 | 
				
			||||||
      return in;
 | 
					      return in;
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    static inline __m256d Permute0(__m256d in){
 | 
					    static inline __m256d Permute0(__m256d in){
 | 
				
			||||||
      return _mm256_permute2f128_pd(in,in,0x01);
 | 
					      return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m256d Permute1(__m256d in){
 | 
					    static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
 | 
				
			||||||
      return _mm256_shuffle_pd(in,in,0x5);
 | 
					      return _mm256_shuffle_pd(in,in,0x5);
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m256d Permute2(__m256d in){
 | 
					    static inline __m256d Permute2(__m256d in){
 | 
				
			||||||
@@ -437,6 +437,111 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if defined (AVX2) || defined (AVXFMA4) 
					// Statement-style helpers: assign to ret the result of _mm256_alignr_epi8
					// applied to a and b, with the element count n converted to a byte shift
					// (4 bytes per element for the 32-bit form, 8 for the 64-bit form) and
					// reduced modulo 16. All macro arguments are parenthesised so that
					// expression arguments (e.g. n = k+1) expand correctly.
					// NOTE(review): these names match AVX-512VL intrinsics of the same name;
					// confirm they cannot clash on targets where those are declared.
					#define _mm256_alignr_epi32(ret,a,b,n) (ret)=(__m256) _mm256_alignr_epi8((__m256i)(a),(__m256i)(b),((n)*4)%16)
					#define _mm256_alignr_epi64(ret,a,b,n) (ret)=(__m256d) _mm256_alignr_epi8((__m256i)(a),(__m256i)(b),((n)*8)%16)
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if defined (AVX1) 
					
					// AVX1 has no 256-bit integer alignr, so each 128-bit half is processed
					// separately: the matching halves of a and b are extracted, combined with
					// _mm_alignr_epi8 using a byte shift of (n * element size) % 16, and the
					// result is inserted into the same half of ret. The macro expands to a
					// braced block and declares locals named aa and bb, so callers must not
					// pass arguments with those names. Arguments are parenthesised so that
					// expression arguments expand correctly.
					// NOTE(review): ret is read by the first insert before it is fully
					// written; both halves are overwritten by the end of the block.
					#define _mm256_alignr_epi32(ret,a,b,n) {	\
					    __m128 aa, bb;				\
											\
					    aa  = _mm256_extractf128_ps((a),1);		\
					    bb  = _mm256_extractf128_ps((b),1);		\
					    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*4)%16);	\
					    (ret) = _mm256_insertf128_ps((ret),aa,1);	\
											\
					    aa  = _mm256_extractf128_ps((a),0);		\
					    bb  = _mm256_extractf128_ps((b),0);		\
					    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*4)%16);	\
					    (ret) = _mm256_insertf128_ps((ret),aa,0);	\
					  }
					
					// Double-precision counterpart of the macro above (8 bytes per element).
					#define _mm256_alignr_epi64(ret,a,b,n) {	\
					    __m128d aa, bb;				\
											\
					    aa  = _mm256_extractf128_pd((a),1);		\
					    bb  = _mm256_extractf128_pd((b),1);		\
					    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*8)%16);	\
					    (ret) = _mm256_insertf128_pd((ret),aa,1);	\
											\
					    aa  = _mm256_extractf128_pd((a),0);		\
					    bb  = _mm256_extractf128_pd((b),0);		\
					    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*8)%16);	\
					    (ret) = _mm256_insertf128_pd((ret),aa,0);	\
					  }
					
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Debug printer for a 256-bit single-precision vector: writes its eight
					    // float elements as "{e0,e1,...,e7}".
					    inline std::ostream & operator << (std::ostream& stream, const __m256 a)
					    {
					      const float *lanes=(const float *)&a;
					      stream<< "{";
					      for (int i=0;i<8;i++) {
					        if (i!=0) stream<<",";
					        stream<<lanes[i];
					      }
					      stream<<"}";
					      return stream;
					    }
 | 
				
			||||||
 | 
					    // Debug printer for a 256-bit double-precision vector: writes its four
					    // double elements as "{e0,e1,e2,e3}".
					    inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
					    {
					      const double *lanes=(const double *)&a;
					      stream<< "{";
					      for (int i=0;i<4;i++) {
					        if (i!=0) stream<<",";
					        stream<<lanes[i];
					      }
					      stream<<"}";
					      return stream;
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Rotate{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline __m256 rotate(__m256 in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      case 4: return tRotate<4>(in);break;
 | 
				
			||||||
 | 
					      case 5: return tRotate<5>(in);break;
 | 
				
			||||||
 | 
					      case 6: return tRotate<6>(in);break;
 | 
				
			||||||
 | 
					      case 7: return tRotate<7>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    static inline __m256d rotate(__m256d in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    template<int n>
 | 
				
			||||||
 | 
					    static inline __m256 tRotate(__m256 in){ 
 | 
				
			||||||
 | 
					      __m256 tmp = Permute::Permute0(in);
 | 
				
			||||||
 | 
					      __m256 ret;
 | 
				
			||||||
 | 
					      if ( n > 3 ) { 
 | 
				
			||||||
 | 
						_mm256_alignr_epi32(ret,in,tmp,n);  
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
					        _mm256_alignr_epi32(ret,tmp,in,n);          
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      //      std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
 | 
				
			||||||
 | 
					      return ret;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<int n>
 | 
				
			||||||
 | 
					    static inline __m256d tRotate(__m256d in){ 
 | 
				
			||||||
 | 
					      __m256d tmp = Permute::Permute0(in);
 | 
				
			||||||
 | 
					      __m256d ret;
 | 
				
			||||||
 | 
					      if ( n > 1 ) {
 | 
				
			||||||
 | 
						_mm256_alignr_epi64(ret,in,tmp,n);          
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
					        _mm256_alignr_epi64(ret,tmp,in,n);          
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      //      std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
 | 
				
			||||||
 | 
					      return ret;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //Complex float Reduce
 | 
					  //Complex float Reduce
 | 
				
			||||||
  template<>
 | 
					  template<>
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#include <immintrin.h>
 | 
					#include <immintrin.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid{
 | 
				
			||||||
namespace Optimization {
 | 
					namespace Optimization {
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  struct Vsplat{
 | 
					  struct Vsplat{
 | 
				
			||||||
@@ -246,26 +246,30 @@ namespace Optimization {
 | 
				
			|||||||
  struct TimesMinusI{
 | 
					  struct TimesMinusI{
 | 
				
			||||||
    //Complex single
 | 
					    //Complex single
 | 
				
			||||||
    inline __m512 operator()(__m512 in, __m512 ret){
 | 
					    inline __m512 operator()(__m512 in, __m512 ret){
 | 
				
			||||||
      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
 | 
					      //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
 | 
				
			||||||
      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));   // 0x4E??
 | 
					      //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
 | 
				
			||||||
 | 
					      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
				
			||||||
 | 
					      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    inline __m512d operator()(__m512d in, __m512d ret){
 | 
					    inline __m512d operator()(__m512d in, __m512d ret){
 | 
				
			||||||
      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
 | 
					      //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
 | 
				
			||||||
      return _mm512_shuffle_pd(tmp,tmp,0x55);
 | 
					      //return _mm512_shuffle_pd(tmp,tmp,0x55);
 | 
				
			||||||
 | 
					      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
 | 
				
			||||||
 | 
					      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
 | 
				
			||||||
    } 
 | 
					    } 
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct TimesI{
 | 
					  struct TimesI{
 | 
				
			||||||
    //Complex single
 | 
					    //Complex single
 | 
				
			||||||
    inline __m512 operator()(__m512 in, __m512 ret){
 | 
					    inline __m512 operator()(__m512 in, __m512 ret){
 | 
				
			||||||
      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
					      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
				
			||||||
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); 
 | 
					      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); 
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    inline __m512d operator()(__m512d in, __m512d ret){
 | 
					    inline __m512d operator()(__m512d in, __m512d ret){
 | 
				
			||||||
      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
 | 
					      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
 | 
				
			||||||
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); 
 | 
					      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); 
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -305,6 +309,54 @@ namespace Optimization {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Rotate{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline __m512 rotate(__m512 in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      case 4: return tRotate<4>(in);break;
 | 
				
			||||||
 | 
					      case 5: return tRotate<5>(in);break;
 | 
				
			||||||
 | 
					      case 6: return tRotate<6>(in);break;
 | 
				
			||||||
 | 
					      case 7: return tRotate<7>(in);break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      case 8 : return tRotate<8>(in);break;
 | 
				
			||||||
 | 
					      case 9 : return tRotate<9>(in);break;
 | 
				
			||||||
 | 
					      case 10: return tRotate<10>(in);break;
 | 
				
			||||||
 | 
					      case 11: return tRotate<11>(in);break;
 | 
				
			||||||
 | 
					      case 12: return tRotate<12>(in);break;
 | 
				
			||||||
 | 
					      case 13: return tRotate<13>(in);break;
 | 
				
			||||||
 | 
					      case 14: return tRotate<14>(in);break;
 | 
				
			||||||
 | 
					      case 15: return tRotate<15>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    static inline __m512d rotate(__m512d in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      case 4: return tRotate<4>(in);break;
 | 
				
			||||||
 | 
					      case 5: return tRotate<5>(in);break;
 | 
				
			||||||
 | 
					      case 6: return tRotate<6>(in);break;
 | 
				
			||||||
 | 
					      case 7: return tRotate<7>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<int n> static inline __m512 tRotate(__m512 in){ 
 | 
				
			||||||
 | 
					      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<int n> static inline __m512d tRotate(__m512d in){ 
 | 
				
			||||||
 | 
					      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);          
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //////////////////////////////////////////////
 | 
					  //////////////////////////////////////////////
 | 
				
			||||||
  // Some Template specialization
 | 
					  // Some Template specialization
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
@@ -345,7 +397,7 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Here assign types 
 | 
					// Here assign types 
 | 
				
			||||||
namespace Grid {
 | 
					
 | 
				
			||||||
  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
					  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
				
			||||||
  typedef __m512d SIMD_Dtype; // Double precision type
 | 
					  typedef __m512d SIMD_Dtype; // Double precision type
 | 
				
			||||||
  typedef __m512i SIMD_Itype; // Integer type
 | 
					  typedef __m512i SIMD_Itype; // Integer type
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,6 +35,7 @@ Author: neo <cossu@post.kek.jp>
 | 
				
			|||||||
// Time-stamp: <2015-06-09 14:28:02 neo>
 | 
					// Time-stamp: <2015-06-09 14:28:02 neo>
 | 
				
			||||||
//----------------------------------------------------------------------
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
namespace Optimization {
 | 
					namespace Optimization {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class vtype>
 | 
					  template<class vtype>
 | 
				
			||||||
@@ -54,51 +55,67 @@ namespace Optimization {
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  struct Vsplat{
 | 
					  struct Vsplat{
 | 
				
			||||||
    //Complex float
 | 
					    //Complex float
 | 
				
			||||||
    inline float operator()(float a, float b){
 | 
					    inline u128f operator()(float a, float b){
 | 
				
			||||||
      return 0;
 | 
					      u128f out; 
 | 
				
			||||||
 | 
					      out.f[0] = a;
 | 
				
			||||||
 | 
					      out.f[1] = b;
 | 
				
			||||||
 | 
					      out.f[2] = a;
 | 
				
			||||||
 | 
					      out.f[3] = b;
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline float operator()(float a){
 | 
					    inline u128f operator()(float a){
 | 
				
			||||||
      return 0;
 | 
					      u128f out; 
 | 
				
			||||||
 | 
					      out.f[0] = a;
 | 
				
			||||||
 | 
					      out.f[1] = a;
 | 
				
			||||||
 | 
					      out.f[2] = a;
 | 
				
			||||||
 | 
					      out.f[3] = a;
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    inline double operator()(double a, double b){
 | 
					    inline u128d operator()(double a, double b){
 | 
				
			||||||
      return 0;
 | 
					      u128d out; 
 | 
				
			||||||
 | 
					      out.f[0] = a;
 | 
				
			||||||
 | 
					      out.f[1] = b;
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Real double
 | 
					    //Real double
 | 
				
			||||||
    inline double operator()(double a){
 | 
					    inline u128d operator()(double a){
 | 
				
			||||||
      return 0;
 | 
					      u128d out; 
 | 
				
			||||||
 | 
					      out.f[0] = a;
 | 
				
			||||||
 | 
					      out.f[1] = a;
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline int operator()(Integer a){
 | 
					    inline int operator()(Integer a){
 | 
				
			||||||
      return 0;
 | 
					      return a;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Vstore{
 | 
					  struct Vstore{
 | 
				
			||||||
    //Float 
 | 
					    //Float 
 | 
				
			||||||
    inline void operator()(float a, float* F){
 | 
					    inline void operator()(u128f a, float* F){
 | 
				
			||||||
      
 | 
					      memcpy(F,a.f,4*sizeof(float));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Double
 | 
					    //Double
 | 
				
			||||||
    inline void operator()(double a, double* D){
 | 
					    inline void operator()(u128d a, double* D){
 | 
				
			||||||
     
 | 
					      memcpy(D,a.f,2*sizeof(double));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline void operator()(int a, Integer* I){
 | 
					    inline void operator()(int a, Integer* I){
 | 
				
			||||||
      
 | 
					      I[0] = a;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Vstream{
 | 
					  struct Vstream{
 | 
				
			||||||
    //Float
 | 
					    //Float
 | 
				
			||||||
    inline void operator()(float * a, float b){
 | 
					    inline void operator()(float * a, u128f b){
 | 
				
			||||||
     
 | 
					      memcpy(a,b.f,4*sizeof(float));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Double
 | 
					    //Double
 | 
				
			||||||
    inline void operator()(double * a, double b){
 | 
					    inline void operator()(double * a, u128d b){
 | 
				
			||||||
     
 | 
					      memcpy(a,b.f,2*sizeof(double));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -106,24 +123,40 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  struct Vset{
 | 
					  struct Vset{
 | 
				
			||||||
    // Complex float 
 | 
					    // Complex float 
 | 
				
			||||||
    inline float operator()(Grid::ComplexF *a){
 | 
					    inline u128f operator()(Grid::ComplexF *a){
 | 
				
			||||||
      return 0;
 | 
					      u128f out; 
 | 
				
			||||||
 | 
					      out.f[0] = a[0].real();
 | 
				
			||||||
 | 
					      out.f[1] = a[0].imag();
 | 
				
			||||||
 | 
					      out.f[2] = a[1].real();
 | 
				
			||||||
 | 
					      out.f[3] = a[1].imag();
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Complex double 
 | 
					    // Complex double 
 | 
				
			||||||
    inline double operator()(Grid::ComplexD *a){
 | 
					    inline u128d operator()(Grid::ComplexD *a){
 | 
				
			||||||
      return 0;
 | 
					      u128d out; 
 | 
				
			||||||
 | 
					      out.f[0] = a[0].real();
 | 
				
			||||||
 | 
					      out.f[1] = a[0].imag();
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Real float 
 | 
					    // Real float 
 | 
				
			||||||
    inline float operator()(float *a){
 | 
					    inline u128f operator()(float *a){
 | 
				
			||||||
      return  0;
 | 
					      u128f out; 
 | 
				
			||||||
 | 
					      out.f[0] = a[0];
 | 
				
			||||||
 | 
					      out.f[1] = a[1];
 | 
				
			||||||
 | 
					      out.f[2] = a[2];
 | 
				
			||||||
 | 
					      out.f[3] = a[3];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Real double
 | 
					    // Real double
 | 
				
			||||||
    inline double operator()(double *a){
 | 
					    inline u128d operator()(double *a){
 | 
				
			||||||
      return 0;
 | 
					      u128d out; 
 | 
				
			||||||
 | 
					      out.f[0] = a[0];
 | 
				
			||||||
 | 
					      out.f[1] = a[1];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Integer
 | 
					    // Integer
 | 
				
			||||||
    inline int operator()(Integer *a){
 | 
					    inline int operator()(Integer *a){
 | 
				
			||||||
      return 0;
 | 
					      return a[0];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -145,130 +178,279 @@ namespace Optimization {
 | 
				
			|||||||
  /////////////////////////////////////////////////////
 | 
					  /////////////////////////////////////////////////////
 | 
				
			||||||
  struct Sum{
 | 
					  struct Sum{
 | 
				
			||||||
    //Complex/Real float
 | 
					    //Complex/Real float
 | 
				
			||||||
    inline float operator()(float a, float b){
 | 
					    inline u128f operator()(u128f a, u128f b){
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0] + b.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[1] + b.f[1];
 | 
				
			||||||
 | 
					      out.f[2] = a.f[2] + b.f[2];
 | 
				
			||||||
 | 
					      out.f[3] = a.f[3] + b.f[3];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex/Real double
 | 
					    //Complex/Real double
 | 
				
			||||||
    inline double operator()(double a, double b){
 | 
					    inline u128d operator()(u128d a, u128d b){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0] + b.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[1] + b.f[1];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline int operator()(int a, int b){
 | 
					    inline int operator()(int a, int b){
 | 
				
			||||||
      return 0;
 | 
					      return a + b;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Sub{
 | 
					  struct Sub{
 | 
				
			||||||
    //Complex/Real float
 | 
					    //Complex/Real float
 | 
				
			||||||
    inline float operator()(float a, float b){
 | 
					    inline u128f operator()(u128f a, u128f b){
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0] - b.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[1] - b.f[1];
 | 
				
			||||||
 | 
					      out.f[2] = a.f[2] - b.f[2];
 | 
				
			||||||
 | 
					      out.f[3] = a.f[3] - b.f[3];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex/Real double
 | 
					    //Complex/Real double
 | 
				
			||||||
    inline double operator()(double a, double b){
 | 
					    inline u128d operator()(u128d a, u128d b){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0] - b.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[1] - b.f[1];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline int operator()(int a, int b){
 | 
					    inline int operator()(int a, int b){
 | 
				
			||||||
      return 0;
 | 
					      return a-b;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct MultComplex{
 | 
					  struct MultComplex{
 | 
				
			||||||
    // Complex float
 | 
					    // Complex float
 | 
				
			||||||
    inline float operator()(float a, float b){
 | 
					    inline u128f operator()(u128f a, u128f b){
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
 | 
				
			||||||
 | 
					      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
 | 
				
			||||||
 | 
					      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Complex double
 | 
					    // Complex double
 | 
				
			||||||
    inline double operator()(double a, double b){
 | 
					    inline u128d operator()(u128d a, u128d b){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
    inline float  mac(float a, float b,double c){
 | 
					    //CK: Appear unneeded
 | 
				
			||||||
      return 0;
 | 
					    // inline float  mac(float a, float b,double c){
 | 
				
			||||||
    }
 | 
					    //   return 0;
 | 
				
			||||||
    inline double mac(double a, double b,double c){
 | 
					    // }
 | 
				
			||||||
      return 0;
 | 
					    // inline double mac(double a, double b,double c){
 | 
				
			||||||
    }
 | 
					    //   return 0;
 | 
				
			||||||
 | 
					    // }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline float operator()(float a, float b){
 | 
					    inline u128f operator()(u128f a, u128f b){
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0]*b.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[1]*b.f[1];
 | 
				
			||||||
 | 
					      out.f[2] = a.f[2]*b.f[2];
 | 
				
			||||||
 | 
					      out.f[3] = a.f[3]*b.f[3];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Real double
 | 
					    // Real double
 | 
				
			||||||
    inline double operator()(double a, double b){
 | 
					    inline u128d operator()(u128d a, u128d b){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = a.f[0]*b.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = a.f[1]*b.f[1];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Integer
 | 
					    // Integer
 | 
				
			||||||
    inline int operator()(int a, int b){
 | 
					    inline int operator()(int a, int b){
 | 
				
			||||||
      return 0;
 | 
					      return a*b;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Conj{
 | 
					  struct Conj{
 | 
				
			||||||
    // Complex single
 | 
					    // Complex single
 | 
				
			||||||
    inline float operator()(float in){
 | 
					    inline u128f operator()(u128f in){
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = -in.f[1];
 | 
				
			||||||
 | 
					      out.f[2] = in.f[2];
 | 
				
			||||||
 | 
					      out.f[3] = -in.f[3];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Complex double
 | 
					    // Complex double
 | 
				
			||||||
    inline double operator()(double in){
 | 
					    inline u128d operator()(u128d in){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[0];
 | 
				
			||||||
 | 
					      out.f[1] = -in.f[1];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // do not define for integer input
 | 
					    // do not define for integer input
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct TimesMinusI{
 | 
					  struct TimesMinusI{
 | 
				
			||||||
    //Complex single
 | 
					    //Complex single
 | 
				
			||||||
    inline float operator()(float in, float ret){
 | 
					    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = -in.f[0];
 | 
				
			||||||
 | 
					      out.f[2] = in.f[3];
 | 
				
			||||||
 | 
					      out.f[3] = -in.f[2];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    inline double operator()(double in, double ret){
 | 
					    inline u128d operator()(u128d in, u128d ret){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = -in.f[0];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct TimesI{
 | 
					  struct TimesI{
 | 
				
			||||||
    //Complex single
 | 
					    //Complex single
 | 
				
			||||||
    inline float operator()(float in, float ret){
 | 
					    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
 | 
				
			||||||
      return 0;
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = -in.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = in.f[0];
 | 
				
			||||||
 | 
					      out.f[2] = -in.f[3];
 | 
				
			||||||
 | 
					      out.f[3] = in.f[2];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    inline double operator()(double in, double ret){
 | 
					    inline u128d operator()(u128d in, u128d ret){
 | 
				
			||||||
      return 0;
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = -in.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = in.f[0];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //////////////////////////////////////////////
 | 
					  //////////////////////////////////////////////
 | 
				
			||||||
  // Some Template specialization
 | 
					  // Some Template specialization
 | 
				
			||||||
 | 
					  struct Permute{
 | 
				
			||||||
 | 
					    //We just have to mirror the permutes of Grid_sse4.h
 | 
				
			||||||
 | 
					    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
 | 
				
			||||||
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[2];
 | 
				
			||||||
 | 
					      out.f[1] = in.f[3];
 | 
				
			||||||
 | 
					      out.f[2] = in.f[0];
 | 
				
			||||||
 | 
					      out.f[3] = in.f[1];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
 | 
				
			||||||
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = in.f[0];
 | 
				
			||||||
 | 
					      out.f[2] = in.f[3];
 | 
				
			||||||
 | 
					      out.f[3] = in.f[2];
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    static inline u128f Permute2(u128f in){
 | 
				
			||||||
 | 
					      return in;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    static inline u128f Permute3(u128f in){
 | 
				
			||||||
 | 
					      return in;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline u128d Permute0(u128d in){ //AB -> BA
 | 
				
			||||||
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      out.f[0] = in.f[1];
 | 
				
			||||||
 | 
					      out.f[1] = in.f[0];
 | 
				
			||||||
 | 
					      return out;      
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    static inline u128d Permute1(u128d in){
 | 
				
			||||||
 | 
					      return in;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    static inline u128d Permute2(u128d in){
 | 
				
			||||||
 | 
					      return in;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    static inline u128d Permute3(u128d in){
 | 
				
			||||||
 | 
					      return in;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
  template < typename vtype > 
 | 
					  template < typename vtype > 
 | 
				
			||||||
    void permute(vtype &a, vtype b, int perm) {
 | 
					    void permute(vtype &a, vtype b, int perm) {
 | 
				
			||||||
   };
 | 
					   };
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					  struct Rotate{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline u128f rotate(u128f in,int n){
 | 
				
			||||||
 | 
					      u128f out;
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0:
 | 
				
			||||||
 | 
					        out.f[0] = in.f[0];
 | 
				
			||||||
 | 
					        out.f[1] = in.f[1];
 | 
				
			||||||
 | 
					        out.f[2] = in.f[2];
 | 
				
			||||||
 | 
					        out.f[3] = in.f[3];
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      case 1:
 | 
				
			||||||
 | 
					        out.f[0] = in.f[1];
 | 
				
			||||||
 | 
					        out.f[1] = in.f[2];
 | 
				
			||||||
 | 
					        out.f[2] = in.f[3];
 | 
				
			||||||
 | 
					        out.f[3] = in.f[0];
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      case 2:
 | 
				
			||||||
 | 
					        out.f[0] = in.f[2];
 | 
				
			||||||
 | 
					        out.f[1] = in.f[3];
 | 
				
			||||||
 | 
					        out.f[2] = in.f[0];
 | 
				
			||||||
 | 
					        out.f[3] = in.f[1];
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      case 3:
 | 
				
			||||||
 | 
					        out.f[0] = in.f[3];
 | 
				
			||||||
 | 
					        out.f[1] = in.f[0];
 | 
				
			||||||
 | 
					        out.f[2] = in.f[1];
 | 
				
			||||||
 | 
					        out.f[3] = in.f[2];
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    static inline u128d rotate(u128d in,int n){
 | 
				
			||||||
 | 
					      u128d out;
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0:
 | 
				
			||||||
 | 
					        out.f[0] = in.f[0];
 | 
				
			||||||
 | 
					        out.f[1] = in.f[1];
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      case 1:
 | 
				
			||||||
 | 
					        out.f[0] = in.f[1];
 | 
				
			||||||
 | 
					        out.f[1] = in.f[0];
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      return out;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //Complex float Reduce
 | 
					  //Complex float Reduce
 | 
				
			||||||
  template<>
 | 
					  template<>
 | 
				
			||||||
  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
 | 
					  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
 | 
				
			||||||
    return 0;
 | 
					    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  //Real float Reduce
 | 
					  //Real float Reduce
 | 
				
			||||||
  template<>
 | 
					  template<>
 | 
				
			||||||
  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
 | 
					  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
 | 
				
			||||||
    return 0;
 | 
					    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  //Complex double Reduce
 | 
					  //Complex double Reduce
 | 
				
			||||||
  template<>
 | 
					  template<>
 | 
				
			||||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
 | 
					  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
 | 
				
			||||||
    return 0;
 | 
					    return Grid::ComplexD(in.f[0],in.f[1]);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  //Real double Reduce
 | 
					  //Real double Reduce
 | 
				
			||||||
  template<>
 | 
					  template<>
 | 
				
			||||||
  inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
 | 
					  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
 | 
				
			||||||
    return 0;
 | 
					    return in.f[0] + in.f[1];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //Integer Reduce
 | 
					  //Integer Reduce
 | 
				
			||||||
@@ -282,10 +464,9 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Here assign types 
 | 
					// Here assign types 
 | 
				
			||||||
namespace Grid {
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  typedef float SIMD_Ftype;  // Single precision type
 | 
					  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
 | 
				
			||||||
  typedef double SIMD_Dtype; // Double precision type
 | 
					  typedef Optimization::u128d SIMD_Dtype; // Double precision type
 | 
				
			||||||
  typedef int SIMD_Itype; // Integer type
 | 
					  typedef int SIMD_Itype; // Integer type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // prefetch utilities
 | 
					  // prefetch utilities
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
//----------------------------------------------------------------------
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <immintrin.h>
 | 
					#include <immintrin.h>
 | 
				
			||||||
 | 
					#include <zmmintrin.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid{
 | 
				
			||||||
namespace Optimization {
 | 
					namespace Optimization {
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  struct Vsplat{
 | 
					  struct Vsplat{
 | 
				
			||||||
@@ -316,6 +318,54 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 | 
					  struct Rotate{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline __m512 rotate(__m512 in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      case 4: return tRotate<4>(in);break;
 | 
				
			||||||
 | 
					      case 5: return tRotate<5>(in);break;
 | 
				
			||||||
 | 
					      case 6: return tRotate<6>(in);break;
 | 
				
			||||||
 | 
					      case 7: return tRotate<7>(in);break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      case 8 : return tRotate<8>(in);break;
 | 
				
			||||||
 | 
					      case 9 : return tRotate<9>(in);break;
 | 
				
			||||||
 | 
					      case 10: return tRotate<10>(in);break;
 | 
				
			||||||
 | 
					      case 11: return tRotate<11>(in);break;
 | 
				
			||||||
 | 
					      case 12: return tRotate<12>(in);break;
 | 
				
			||||||
 | 
					      case 13: return tRotate<13>(in);break;
 | 
				
			||||||
 | 
					      case 14: return tRotate<14>(in);break;
 | 
				
			||||||
 | 
					      case 15: return tRotate<15>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    static inline __m512d rotate(__m512d in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      case 4: return tRotate<4>(in);break;
 | 
				
			||||||
 | 
					      case 5: return tRotate<5>(in);break;
 | 
				
			||||||
 | 
					      case 6: return tRotate<6>(in);break;
 | 
				
			||||||
 | 
					      case 7: return tRotate<7>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<int n> static inline __m512 tRotate(__m512 in){ 
 | 
				
			||||||
 | 
					      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<int n> static inline __m512d tRotate(__m512d in){ 
 | 
				
			||||||
 | 
					      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);          
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //////////////////////////////////////////////
 | 
					  //////////////////////////////////////////////
 | 
				
			||||||
@@ -358,7 +408,7 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Here assign types 
 | 
					// Here assign types 
 | 
				
			||||||
namespace Grid {
 | 
					
 | 
				
			||||||
  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
					  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
				
			||||||
  typedef __m512d SIMD_Dtype; // Double precision type
 | 
					  typedef __m512d SIMD_Dtype; // Double precision type
 | 
				
			||||||
  typedef __m512i SIMD_Itype; // Integer type
 | 
					  typedef __m512i SIMD_Itype; // Integer type
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -267,10 +267,10 @@ namespace Optimization {
 | 
				
			|||||||
  struct Permute{
 | 
					  struct Permute{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    static inline __m128 Permute0(__m128 in){
 | 
					    static inline __m128 Permute0(__m128 in){
 | 
				
			||||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
 | 
					      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m128 Permute1(__m128 in){
 | 
					    static inline __m128 Permute1(__m128 in){
 | 
				
			||||||
      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
					      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m128 Permute2(__m128 in){
 | 
					    static inline __m128 Permute2(__m128 in){
 | 
				
			||||||
      return in;
 | 
					      return in;
 | 
				
			||||||
@@ -279,7 +279,7 @@ namespace Optimization {
 | 
				
			|||||||
      return in;
 | 
					      return in;
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    static inline __m128d Permute0(__m128d in){
 | 
					    static inline __m128d Permute0(__m128d in){ //AB -> BA
 | 
				
			||||||
      return _mm_shuffle_pd(in,in,0x1);
 | 
					      return _mm_shuffle_pd(in,in,0x1);
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
    static inline __m128d Permute1(__m128d in){
 | 
					    static inline __m128d Permute1(__m128d in){
 | 
				
			||||||
@@ -294,6 +294,32 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Rotate{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    static inline __m128 rotate(__m128 in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      case 2: return tRotate<2>(in);break;
 | 
				
			||||||
 | 
					      case 3: return tRotate<3>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    static inline __m128d rotate(__m128d in,int n){ 
 | 
				
			||||||
 | 
					      switch(n){
 | 
				
			||||||
 | 
					      case 0: return tRotate<0>(in);break;
 | 
				
			||||||
 | 
					      case 1: return tRotate<1>(in);break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 | 
				
			||||||
 | 
					#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
 | 
				
			||||||
 | 
					    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
  //////////////////////////////////////////////
 | 
					  //////////////////////////////////////////////
 | 
				
			||||||
  // Some Template specialization
 | 
					  // Some Template specialization
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -299,16 +299,44 @@ namespace Grid {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
 | 
					    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      if      (perm==3) permute3(y,b);
 | 
					      if ( perm & RotateBit ) {
 | 
				
			||||||
      else if (perm==2) permute2(y,b);
 | 
						int dist = perm&0xF;
 | 
				
			||||||
      else if (perm==1) permute1(y,b);
 | 
					        y=rotate(b,dist);
 | 
				
			||||||
      else if (perm==0) permute0(y,b);
 | 
						return;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      switch(perm){
 | 
				
			||||||
 | 
					      case 3: permute3(y,b); break;
 | 
				
			||||||
 | 
					      case 2: permute2(y,b); break;
 | 
				
			||||||
 | 
					      case 1: permute1(y,b); break;
 | 
				
			||||||
 | 
					      case 0: permute0(y,b); break;
 | 
				
			||||||
 | 
					      default: assert(0);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
  };// end of Grid_simd class definition 
 | 
					  };// end of Grid_simd class definition 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // General rotate
 | 
				
			||||||
 | 
					  ////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  template <class S, class V, IfNotComplex<S> =0> 
 | 
				
			||||||
 | 
					  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    nrot = nrot % Grid_simd<S,V>::Nsimd();
 | 
				
			||||||
 | 
					    Grid_simd<S,V> ret;
 | 
				
			||||||
 | 
					    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
 | 
				
			||||||
 | 
					    ret.v = Optimization::Rotate::rotate(b.v,nrot);
 | 
				
			||||||
 | 
					    return ret;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  template <class S, class V, IfComplex<S> =0> 
 | 
				
			||||||
 | 
					  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    nrot = nrot % Grid_simd<S,V>::Nsimd();
 | 
				
			||||||
 | 
					    Grid_simd<S,V> ret;
 | 
				
			||||||
 | 
					    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
 | 
				
			||||||
 | 
					    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
 | 
				
			||||||
 | 
					    return ret;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////
 | 
					  ///////////////////////
 | 
				
			||||||
  // Splat
 | 
					  // Splat
 | 
				
			||||||
  ///////////////////////
 | 
					  ///////////////////////
 | 
				
			||||||
@@ -339,6 +367,9 @@ namespace Grid {
 | 
				
			|||||||
  template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)     { vsplat(ret,S(0.0,0.0)); }// use xor?
 | 
					  template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)     { vsplat(ret,S(0.0,0.0)); }// use xor?
 | 
				
			||||||
  template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} 
 | 
					  template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));} 
 | 
				
			||||||
 | 
					  template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));} 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // if not complex overload here 
 | 
					  // if not complex overload here 
 | 
				
			||||||
  template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
 | 
					  template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
 | 
				
			||||||
  template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
 | 
					  template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										197
									
								
								lib/simd/Intel512avx.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										197
									
								
								lib/simd/Intel512avx.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,197 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512avx.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#ifndef GRID_ASM_AV512_H
 | 
				
			||||||
 | 
					#define GRID_ASM_AV512_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////	  
 | 
				
			||||||
 | 
					// Knights Landing specials
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////	  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
				
			||||||
 | 
					#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
 | 
				
			||||||
 | 
					#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
 | 
				
			||||||
 | 
					#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
 | 
				
			||||||
 | 
					#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
				
			||||||
 | 
					  VSHUFMEMf(O,P,tmp) \
 | 
				
			||||||
 | 
					  VMULMEMf(O,P,B,Biirr) \
 | 
				
			||||||
 | 
					  VMULMEMf(O,P,C,Ciirr) \
 | 
				
			||||||
 | 
					  VMULf(tmp,B,Briir) \
 | 
				
			||||||
 | 
					  VMULf(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Double-precision variant of ZMULMEM2SPf. Emits, in order:
					//   VSHUFMEMd(O,P,tmp)      - shuffled load of the operand at offset O from P into tmp
					//   VMULMEMd(O,P,B,Biirr)   - memory operand times B, result in Biirr
					//   VMULMEMd(O,P,C,Ciirr)   - memory operand times C, result in Ciirr
					//   VMULd(tmp,B,Briir)      - shuffled operand times B, result in Briir
					//   VMULd(tmp,C,Criir)      - shuffled operand times C, result in Criir
					// Each continuation backslash must be the last character on its line: a backslash
					// followed by a space is not a line splice in standard C/C++.
					#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
					  VSHUFMEMd(O,P,tmp)  \
					  VMULMEMd(O,P,B,Biirr)  \
					  VMULMEMd(O,P,C,Ciirr)  \
					  VMULd(tmp,B,Briir)  \
					  VMULd(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
				
			||||||
 | 
					  VSHUFMEMf(O,P,tmp) \
 | 
				
			||||||
 | 
					  VMADDMEMf(O,P,B,Biirr) \
 | 
				
			||||||
 | 
					  VMADDMEMf(O,P,C,Ciirr) \
 | 
				
			||||||
 | 
					  VMADDf(tmp,B,Briir) \
 | 
				
			||||||
 | 
					  VMADDf(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
 | 
				
			||||||
 | 
					  VSHUFMEMd(O,P,tmp) \
 | 
				
			||||||
 | 
					  VMADDMEMd(O,P,B,Biirr) \
 | 
				
			||||||
 | 
					  VMADDMEMd(O,P,C,Ciirr) \
 | 
				
			||||||
 | 
					  VMADDd(tmp,B,Briir) \
 | 
				
			||||||
 | 
					  VMADDd(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Merges accumulation for complex dot chain; less efficient under avx512
 | 
				
			||||||
 | 
					#define ZEND1f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Criir "," #Criir "," #tmp   ";\n"\
 | 
				
			||||||
 | 
					                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
 | 
				
			||||||
 | 
					                                  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Double-precision counterpart of ZEND1f.
					// Step 1: vshufpd $0x55 writes a shuffled copy of Criir into tmp.
					// Step 2: tmp is added to Criir under write-mask k6 (set up by MASK_REGS).
					// The add must be the packed-double form (vaddpd) so that it and the k6 mask operate
					// on the same 64-bit lanes as the vshufpd above and as ZEND2d's vsubpd.
					// Ciirr is unused here; it is kept so ZEND1d/ZEND2d share one parameter list.
					#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\
					                                  "vaddpd  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\
 | 
				
			||||||
 | 
					                         	  "vsubpd  " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMOVRDUPd(OFF,A,DEST)       "vpshufd  $0x44," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 1,0,1,0
 | 
				
			||||||
 | 
					#define VMOVIDUPd(OFF,A,DEST)       "vpshufd  $0xee," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 3,2,3,2
 | 
				
			||||||
 | 
					#define VMOVRDUPf(OFF,PTR,DEST)         "vmovsldup " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VMOVIDUPf(OFF,PTR,DEST)         "vmovshdup " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VRDUPd(SRC,DEST)       "vpshufd  $0x44," #SRC"," #DEST  ";\n" // 32 bit level: 1,0,1,0
 | 
				
			||||||
 | 
					#define VRDUPf(SRC,DEST)         "vmovsldup " #SRC ", " #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VIDUPd(SRC,DEST)       "vpshufd  $0xee," #SRC"," #DEST  ";\n" // 32 bit level: 3,2,3,2
 | 
				
			||||||
 | 
					#define VIDUPf(SRC,DEST)         "vmovshdup " #SRC ", " #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VBCASTRDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST  ";\n" 
 | 
				
			||||||
 | 
					#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
 | 
				
			||||||
 | 
					#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMADDSUBd(A,B,accum) "vfmaddsub231pd   " #A "," #B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps   " #O"*64("#P "),"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd   " #O"*64("#P "),"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMULRDUPf(O,P,B,accum) "vmulps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMULIDUPf(O,P,B,accum) "vmulps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMULRDUPd(O,P,B,accum) "vmulpd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					#define VMULIDUPd(O,P,B,accum) "vmulpd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 | 
				
			||||||
 | 
					  /*
 | 
				
			||||||
 | 
					   * TimesI is used only in the XP recon
 | 
				
			||||||
 | 
					   * Could zero the regs and use RECON_ACCUM
 | 
				
			||||||
 | 
					   */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI0f(A,DEST, Z)   VSHUFf(A,DEST)	  
 | 
				
			||||||
 | 
					#define VTIMESI1f(A,DEST, Z)   "vaddps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESI2f(A,DEST, Z)   "vsubps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI0d(A,DEST, Z)   VSHUFd(A,DEST)	 
 | 
				
			||||||
 | 
					#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0f(A,DEST,Z)  VSHUFf(A,DEST)					
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0d(A,DEST,Z)  VSHUFd(A,DEST)					
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #DEST "," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define  VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)	
 | 
				
			||||||
 | 
					#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define  VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)	
 | 
				
			||||||
 | 
					#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// o_p must point to floating 1.0f/d
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					// Ai, Ar -> tmp (r i)
 | 
				
			||||||
 | 
					// tmp *1.0 
 | 
				
			||||||
 | 
					// ACC i - Ar ; ACC r + Ai
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1f(A,ACC,tmp)  VMADDMEMf(1,%r10,tmp,ACC)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2f(A,ACC,tmp)  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1d(A,ACC,tmp)  VMADDMEMd(1,%r10,tmp,ACC)  
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2d(A,ACC,tmp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Ai, Ar -> tmp (r i)
 | 
				
			||||||
 | 
					// tmp *1.0 
 | 
				
			||||||
 | 
					// ACC i + Ar ; ACC r - Ai
 | 
				
			||||||
 | 
					#define  VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)	
 | 
				
			||||||
 | 
					#define  VACCTIMESI1f(A,ACC,tmp)  VMADDMEMf(0,%r10,tmp,ACC)  
 | 
				
			||||||
 | 
					#define  VACCTIMESI2f(A,ACC,tmp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define  VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)	
 | 
				
			||||||
 | 
					#define  VACCTIMESI1d(A,ACC,tmp)  VMADDMEMd(0,%r10,tmp,ACC)  
 | 
				
			||||||
 | 
					#define  VACCTIMESI2d(A,ACC,tmp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VPERM0f(A,B) "vshuff32x4  $0x4e," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					#define VPERM1f(A,B) "vshuff32x4  $0xb1," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					#define VPERM2f(A,B) "vshufps     $0x4e," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					#define VPERM3f(A,B) "vshufps     $0xb1," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VPERM0d(A,B) "vshuff64x2  $0x4e," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					#define VPERM1d(A,B) "vshuff64x2  $0xb1," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					#define VPERM2d(A,B) "vshufpd     $0x55," #A "," #B "," #B ";\n"
 | 
				
			||||||
 | 
					#define VPERM3d(A,B) VMOVd(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										141
									
								
								lib/simd/Intel512common.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										141
									
								
								lib/simd/Intel512common.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,141 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512common.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#ifndef GRID_ASM_INTEL_COMMON_512_H
 | 
				
			||||||
 | 
					#define GRID_ASM_INTEL_COMMON_512_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Opcodes common 
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Loads the two opmask registers that the masked add/sub macros reference as {%k6}/{%k7}:
					//   k6 <- 0xAAAA (bits 1,3,5,... set)
					//   k7 <- 0x5555 (bits 0,2,4,... set)
					// eax is used as scratch and is declared as clobbered; k6/k7 are deliberately left
					// modified after the statement.
					// Each continuation backslash must be the last character on its line: a backslash
					// followed by a space is not a line splice in standard C/C++.
					#define MASK_REGS \
					  __asm__ ("mov     $0xAAAA, %%eax \n"\
					           "kmovw    %%eax, %%k6 \n"\
					           "mov     $0x5555, %%eax \n"\
					           "kmovw    %%eax, %%k7 \n" : : : "%eax");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
				
			||||||
 | 
					#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESIf(A,DEST, Z) \
 | 
				
			||||||
 | 
					  VTIMESI0f(A,DEST, Z) \
 | 
				
			||||||
 | 
					  VTIMESI1f(A,DEST, Z) \
 | 
				
			||||||
 | 
					  VTIMESI2f(A,DEST, Z) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESId(A,DEST, Z) \
 | 
				
			||||||
 | 
					  VTIMESI0d(A,DEST, Z) \
 | 
				
			||||||
 | 
					  VTIMESI1d(A,DEST, Z) \
 | 
				
			||||||
 | 
					  VTIMESI2d(A,DEST, Z) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESMINUSIf(A,DEST, Z) \
 | 
				
			||||||
 | 
					        VTIMESMINUSI0f(A,DEST, Z) \
 | 
				
			||||||
 | 
					        VTIMESMINUSI1f(A,DEST, Z) \
 | 
				
			||||||
 | 
					        VTIMESMINUSI2f(A,DEST, Z) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESMINUSId(A,DEST, Z) \
 | 
				
			||||||
 | 
					        VTIMESMINUSI0d(A,DEST, Z) \
 | 
				
			||||||
 | 
					        VTIMESMINUSI1d(A,DEST, Z) \
 | 
				
			||||||
 | 
					        VTIMESMINUSI2d(A,DEST, Z) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESIf(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					 VACCTIMESI0f(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					 VACCTIMESI1f(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					 VACCTIMESI2f(A,ACC,tmp)			
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESId(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					 VACCTIMESI0d(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					 VACCTIMESI1d(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					 VACCTIMESI2d(A,ACC,tmp)			
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSIf(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					  VACCTIMESMINUSI0f(A,ACC,tmp)				\
 | 
				
			||||||
 | 
					  VACCTIMESMINUSI1f(A,ACC,tmp)				\
 | 
				
			||||||
 | 
					  VACCTIMESMINUSI2f(A,ACC,tmp)			
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSId(A,ACC,tmp)			\
 | 
				
			||||||
 | 
					  VACCTIMESMINUSI0d(A,ACC,tmp)				\
 | 
				
			||||||
 | 
					  VACCTIMESMINUSI1d(A,ACC,tmp)				\
 | 
				
			||||||
 | 
					  VACCTIMESMINUSI2d(A,ACC,tmp)			
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
 | 
				
			||||||
 | 
					#define LOAD64(A,ptr)  LOAD64i(A,ptr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
 | 
				
			||||||
 | 
					#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
 | 
				
			||||||
 | 
					#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 | 
				
			||||||
 | 
					#define VEVICT(O,A)   
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 | 
				
			||||||
 | 
					//  "clevict0 "#O"*64("#A");\n" 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VSUBf(A,B,DEST)        "vsubps   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VSUBd(A,B,DEST)        "vsubpd   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VADDMEMf(O,A,B,DEST)        "vaddps   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VADDMEMd(O,A,B,DEST)        "vaddpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VSUBMEMf(O,A,B,DEST)        "vsubps   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VSUBMEMd(O,A,B,DEST)        "vsubpd   "#O"*64("#A ")," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMULf(A,B,DEST)        "vmulps   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VMULd(A,B,DEST)        "vmulpd   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMADDf(A,B,DEST)       "vfmadd231ps   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VMADDd(A,B,DEST)       "vfmadd231pd   " #A "," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMULMEMf(O,A,B,DEST)   "vmulps   " #O"*64("#A ")," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VMULMEMd(O,A,B,DEST)   "vmulpd   " #O"*64("#A ")," #B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VMADDMEMf(O,A,B,DEST)       "vfmadd231ps   " #O"*64("#A "),"#B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					#define VMADDMEMd(O,A,B,DEST)       "vfmadd231pd   " #O"*64("#A "),"#B "," #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
				
			||||||
 | 
					#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VPREFETCHNTA(O,A) 
 | 
				
			||||||
 | 
					#define VPREFETCH(O,A)    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
				
			||||||
 | 
					#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Swaps Re/Im ; could unify this with IMCI
 | 
				
			||||||
 | 
					#define VSHUFd(A,DEST)         "vpshufd  $0x4e," #A "," #DEST  ";\n"    
 | 
				
			||||||
 | 
					#define VSHUFf(A,DEST)         "vpshufd  $0xb1," #A "," #DEST  ";\n"    
 | 
				
			||||||
 | 
					#define VSHUFMEMd(OFF,A,DEST)  "vpshufd  $0x4e, " #OFF"*64("#A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
 | 
				
			||||||
 | 
					#define VSHUFMEMf(OFF,A,DEST)  "vpshufd  $0xb1, " #OFF"*64("#A ")," #DEST  ";\n" // 32 bit level: 2,3,0,1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define TRAP " int3 ;\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										154
									
								
								lib/simd/Intel512double.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										154
									
								
								lib/simd/Intel512double.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,154 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512double.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					// No include guard: this header may be included more than once; each inclusion #undefs and then redefines the precision-generic macros below
 | 
				
			||||||
 | 
					#undef VZERO
 | 
				
			||||||
 | 
					#undef VMOV
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#define VZERO(A)                  VZEROd(A)
 | 
				
			||||||
 | 
					#define VMOV(A,B)                 VMOVd(A,B)
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VADD
 | 
				
			||||||
 | 
					#undef VSUB
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#define VADD(A,B,C)               VADDd(A,B,C)
 | 
				
			||||||
 | 
					#define VSUB(A,B,C)               VSUBd(A,B,C)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESI
 | 
				
			||||||
 | 
					#undef VTIMESI0 
 | 
				
			||||||
 | 
					#undef VTIMESI1
 | 
				
			||||||
 | 
					#undef VTIMESI2 
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI
 | 
				
			||||||
 | 
					#undef VACCTIMESI0
 | 
				
			||||||
 | 
					#undef VACCTIMESI1
 | 
				
			||||||
 | 
					#undef VACCTIMESI2
 | 
				
			||||||
 | 
					#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VPERM0
 | 
				
			||||||
 | 
					#undef VPERM1
 | 
				
			||||||
 | 
					#undef VPERM2
 | 
				
			||||||
 | 
					#undef VPERM3
 | 
				
			||||||
 | 
					#define VPERM0(A,B)               VPERM0d(A,B)
 | 
				
			||||||
 | 
					#define VPERM1(A,B)               VPERM1d(A,B)
 | 
				
			||||||
 | 
					#define VPERM2(A,B)               VPERM2d(A,B)
 | 
				
			||||||
 | 
					#define VPERM3(A,B)               VPERM3d(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VSHUFMEM
 | 
				
			||||||
 | 
					#undef VADDMEM
 | 
				
			||||||
 | 
					#undef VSUBMEM
 | 
				
			||||||
 | 
					#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
 | 
				
			||||||
 | 
					#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
 | 
				
			||||||
 | 
					#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VMOVIDUP
 | 
				
			||||||
 | 
					#undef VMOVRDUP
 | 
				
			||||||
 | 
					#undef VMADDSUB
 | 
				
			||||||
 | 
					#undef VSHUF
 | 
				
			||||||
 | 
					#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
 | 
				
			||||||
 | 
					#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
 | 
				
			||||||
 | 
					#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
 | 
				
			||||||
 | 
					#define VSHUF(A,B)                                       VSHUFd(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef ZEND1
 | 
				
			||||||
 | 
					#undef ZEND2
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					#undef ZMULMEM2SP
 | 
				
			||||||
 | 
					#undef ZMADDMEM2SP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VRDUP
 | 
				
			||||||
 | 
					#undef VIDUP
 | 
				
			||||||
 | 
					#undef VMADDSUBMEM
 | 
				
			||||||
 | 
					#undef VMADDMEM
 | 
				
			||||||
 | 
					#undef VMULMEM
 | 
				
			||||||
 | 
					#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST) 
 | 
				
			||||||
 | 
					#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST) 
 | 
				
			||||||
 | 
					#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
 | 
				
			||||||
 | 
					#define VMADDMEM(O,P,B,accum)    VMADDMEMd(O,P,B,accum)
 | 
				
			||||||
 | 
					#define VMULMEM(O,P,B,accum)     VMULMEMd(O,P,B,accum)
 | 
				
			||||||
 | 
					#undef VMADDSUBRDUP   
 | 
				
			||||||
 | 
					#undef VMADDSUBIDUP   
 | 
				
			||||||
 | 
					#undef VMULRDUP   
 | 
				
			||||||
 | 
					#undef VMULIDUP   
 | 
				
			||||||
 | 
					#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) 
 | 
				
			||||||
 | 
					#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) 
 | 
				
			||||||
 | 
					#define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)      
 | 
				
			||||||
 | 
					#define VMULIDUP(O,P,B,accum)     VMULIDUPd(O,P,B,accum) 
 | 
				
			||||||
							
								
								
									
										127
									
								
								lib/simd/Intel512imci.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										127
									
								
								lib/simd/Intel512imci.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,127 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512imci.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#ifndef GRID_ASM_AV512_H
 | 
				
			||||||
 | 
					#define GRID_ASM_AV512_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////	  
 | 
				
			||||||
 | 
					// Knights Corner specials
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////	  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Single precision: VLOADf of (OFF,PTR) into the fourth argument, followed by
					// VSHUFf from that register into the third argument.
					#define ZLOADf(OFFSET,BASE,SHUF_DST,LOAD_DST)  VLOADf(OFFSET,BASE,LOAD_DST)  VSHUFf(LOAD_DST,SHUF_DST)
 | 
				
			||||||
 | 
					// Double precision: VLOADd of (OFF,PTR) into the fourth argument, followed by
					// VSHUFd from that register into the third argument.
					#define ZLOADd(OFFSET,BASE,SHUF_DST,LOAD_DST)  VLOADd(OFFSET,BASE,LOAD_DST)  VSHUFd(LOAD_DST,SHUF_DST)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMULf(Ari,Air,B,Criir,Ciirr)  VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
 | 
				
			||||||
 | 
					#define ZMULd(Ari,Air,B,Criir,Ciirr)  VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
 | 
				
			||||||
 | 
					#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
 | 
				
			||||||
 | 
					#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Single-precision sequence built from primitives that are not defined in this header.
					// Expands, in order, to: VSHUFMEMf(O,P,tmp), VMULMEMf(O,P,B,Biirr),
					// VMULMEMf(O,P,C,Ciirr), VMULf(tmp,B,Briir), VMULf(tmp,C,Criir).
					#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
				
			||||||
 | 
					  VSHUFMEMf(O,P,tmp) \
 | 
				
			||||||
 | 
					  VMULMEMf(O,P,B,Biirr) \
 | 
				
			||||||
 | 
					  VMULMEMf(O,P,C,Ciirr) \
 | 
				
			||||||
 | 
					  VMULf(tmp,B,Briir) \
 | 
				
			||||||
 | 
					  VMULf(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Double-precision sequence built from primitives that are not defined in this header.
					// Expands, in order, to: VSHUFMEMd(O,P,tmp), VMULMEMd(O,P,B,Biirr),
					// VMULMEMd(O,P,C,Ciirr), VMULd(tmp,B,Briir), VMULd(tmp,C,Criir).
					// Every continuation backslash must be the last character on its line: a
					// backslash followed by a space is only spliced as a compiler extension.
					#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
					  VSHUFMEMd(O,P,tmp)  \
					  VMULMEMd(O,P,B,Biirr)  \
					  VMULMEMd(O,P,C,Ciirr)  \
					  VMULd(tmp,B,Briir)  \
					  VMULd(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Single-precision counterpart of ZMULMEM2SPf using the VMADD* primitives.
					// Expands, in order, to: VSHUFMEMf(O,P,tmp), VMADDMEMf(O,P,B,Biirr),
					// VMADDMEMf(O,P,C,Ciirr), VMADDf(tmp,B,Briir), VMADDf(tmp,C,Criir).
					#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
 | 
				
			||||||
 | 
					  VSHUFMEMf(O,P,tmp) \
 | 
				
			||||||
 | 
					  VMADDMEMf(O,P,B,Biirr) \
 | 
				
			||||||
 | 
					  VMADDMEMf(O,P,C,Ciirr) \
 | 
				
			||||||
 | 
					  VMADDf(tmp,B,Briir) \
 | 
				
			||||||
 | 
					  VMADDf(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Double-precision counterpart of ZMULMEM2SPd using the VMADD* primitives.
					// Expands, in order, to: VSHUFMEMd(O,P,tmp), VMADDMEMd(O,P,B,Biirr),
					// VMADDMEMd(O,P,C,Ciirr), VMADDd(tmp,B,Briir), VMADDd(tmp,C,Criir).
					#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
 | 
				
			||||||
 | 
					  VSHUFMEMd(O,P,tmp) \
 | 
				
			||||||
 | 
					  VMADDMEMd(O,P,B,Biirr) \
 | 
				
			||||||
 | 
					  VMADDMEMd(O,P,C,Ciirr) \
 | 
				
			||||||
 | 
					  VMADDd(tmp,B,Briir) \
 | 
				
			||||||
 | 
					  VMADDd(tmp,C,Criir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Emits the asm text "vaddpd  X{cdab} ,X,X{%k6};\n" where X is the first argument.
					// The second and third arguments are accepted but do not appear in the expansion.
					#define ZEND1d(ACC,UNUSED_IIRR,UNUSED_TMP) "vaddpd  " #ACC "{cdab} ," #ACC "," #ACC "{%k6}" ";\n"
 | 
				
			||||||
 | 
					// Emits the asm text "vsubpd  Y{cdab} ,Y,X{%k7};\n" where X is the first and Y the
					// second argument. The third argument does not appear in the expansion.
					#define ZEND2d(ACC,SRC,UNUSED_TMP) "vsubpd  " #SRC "{cdab} ," #SRC "," #ACC "{%k7}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Emits the asm text "vaddps  X{cdab} ,X,X{%k6};\n" where X is the first argument.
					// The second and third arguments are accepted but do not appear in the expansion.
					#define ZEND1f(ACC,UNUSED_IIRR,UNUSED_TMP) "vaddps  " #ACC "{cdab} ," #ACC "," #ACC "{%k6}" ";\n"
 | 
				
			||||||
 | 
					// Emits the asm text "vsubps  Y{cdab} ,Y,X{%k7};\n" where X is the first and Y the
					// second argument. The third argument does not appear in the expansion.
					#define ZEND2f(ACC,SRC,UNUSED_TMP) "vsubps  " #SRC "{cdab} ," #SRC "," #ACC "{%k7}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI0f(A,DEST, Z)   
 | 
				
			||||||
 | 
					#define VTIMESI1f(A,DEST, Z)   "vaddps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESI2f(A,DEST, Z)   "vsubps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI0d(A,DEST, Z)   
 | 
				
			||||||
 | 
					#define VTIMESI1d(A,DEST, Z)   "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESI2d(A,DEST, Z)   "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0f(A,DEST,Z)  
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0d(A,DEST,Z)  
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd  " #A "{cdab}," #Z "," #DEST"{%k7}"  ";\n"
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd  " #A "{cdab}," #Z "," #DEST"{%k6}"  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define  VACCTIMESI0f(A,ACC,tmp)
 | 
				
			||||||
 | 
					#define  VACCTIMESI1f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					#define  VACCTIMESI2f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define  VACCTIMESI0d(A,ACC,tmp)
 | 
				
			||||||
 | 
					#define  VACCTIMESI1d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					#define  VACCTIMESI2d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0f(A,ACC,tmp)  
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						   // Acc = Acc - i A
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0d(A,ACC,tmp)  
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd  " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd  " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
 | 
				
			||||||
 | 
					//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Emits "vpermf32x4  $0x4e,SRC,DST;\n" (immediate 0x4e).
					#define VPERM0f(SRC,DST) "vpermf32x4  $0x4e," #SRC "," #DST ";\n"
 | 
				
			||||||
 | 
					// Emits "vpermf32x4  $0xb1,SRC,DST;\n" (immediate 0xb1).
					#define VPERM1f(SRC,DST) "vpermf32x4  $0xb1," #SRC "," #DST ";\n"
 | 
				
			||||||
 | 
					// Emits a "vmovaps" whose source operand carries the {badc} swizzle suffix.
					#define VPERM2f(SRC,DST) "vmovaps     " #SRC "{badc}," #DST ";\n"
 | 
				
			||||||
 | 
					// Emits a "vmovaps" whose source operand carries the {cdab} swizzle suffix.
					#define VPERM3f(SRC,DST) "vmovaps     " #SRC "{cdab}," #DST ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Double precision: emits the same "vpermf32x4  $0x4e" text as VPERM0f.
					#define VPERM0d(SRC,DST) "vpermf32x4  $0x4e," #SRC "," #DST ";\n"
 | 
				
			||||||
 | 
					// Double precision: emits a "vmovapd" whose source operand carries the {badc} swizzle suffix.
					#define VPERM1d(SRC,DST) "vmovapd     " #SRC "{badc}," #DST ";\n"
 | 
				
			||||||
 | 
					// Double precision: emits a "vmovapd" whose source operand carries the {cdab} swizzle suffix.
					#define VPERM2d(SRC,DST) "vmovapd     " #SRC "{cdab}," #DST ";\n"
 | 
				
			||||||
 | 
					// Double precision: VPERM3 forwards both arguments unchanged to VMOVd.
					#define VPERM3d(SRC,DST) VMOVd(SRC,DST)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										155
									
								
								lib/simd/Intel512single.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										155
									
								
								lib/simd/Intel512single.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,155 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512single.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					// No include guard: this file may be included multiple times, because every macro is #undef'd before it is redefined
 | 
				
			||||||
 | 
					#undef VZERO
 | 
				
			||||||
 | 
					#undef VMOV
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#define VZERO(A)                  VZEROf(A)
 | 
				
			||||||
 | 
					#define VMOV(A,B)                 VMOVf(A,B)
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VADD
 | 
				
			||||||
 | 
					#undef VSUB
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#define VADD(A,B,C)               VADDf(A,B,C)
 | 
				
			||||||
 | 
					#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESI
 | 
				
			||||||
 | 
					#undef VTIMESI0 
 | 
				
			||||||
 | 
					#undef VTIMESI1
 | 
				
			||||||
 | 
					#undef VTIMESI2 
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI
 | 
				
			||||||
 | 
					#undef VACCTIMESI0
 | 
				
			||||||
 | 
					#undef VACCTIMESI1
 | 
				
			||||||
 | 
					#undef VACCTIMESI2
 | 
				
			||||||
 | 
					#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VPERM0
 | 
				
			||||||
 | 
					#undef VPERM1
 | 
				
			||||||
 | 
					#undef VPERM2
 | 
				
			||||||
 | 
					#undef VPERM3
 | 
				
			||||||
 | 
					#define VPERM0(A,B)               VPERM0f(A,B)
 | 
				
			||||||
 | 
					#define VPERM1(A,B)               VPERM1f(A,B)
 | 
				
			||||||
 | 
					#define VPERM2(A,B)               VPERM2f(A,B)
 | 
				
			||||||
 | 
					#define VPERM3(A,B)               VPERM3f(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VSHUFMEM
 | 
				
			||||||
 | 
					#undef VADDMEM
 | 
				
			||||||
 | 
					#undef VSUBMEM
 | 
				
			||||||
 | 
					#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
				
			||||||
 | 
					#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
 | 
				
			||||||
 | 
					#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VMOVIDUP
 | 
				
			||||||
 | 
					#undef VMOVRDUP
 | 
				
			||||||
 | 
					#undef VMADDSUB
 | 
				
			||||||
 | 
					#undef VSHUF
 | 
				
			||||||
 | 
					#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
 | 
				
			||||||
 | 
					#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
 | 
				
			||||||
 | 
					#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
 | 
				
			||||||
 | 
					#define VSHUF(A,B)                                       VSHUFf(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef ZEND1
 | 
				
			||||||
 | 
					#undef ZEND2
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					#undef ZMULMEM2SP
 | 
				
			||||||
 | 
					#undef ZMADDMEM2SP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VRDUP
 | 
				
			||||||
 | 
					#undef VIDUP
 | 
				
			||||||
 | 
					#undef VMADDSUBMEM
 | 
				
			||||||
 | 
					#undef VMADDMEM
 | 
				
			||||||
 | 
					#undef VMULMEM
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST) 
 | 
				
			||||||
 | 
					#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST) 
 | 
				
			||||||
 | 
					#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
 | 
				
			||||||
 | 
					#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
 | 
				
			||||||
 | 
					#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VMADDSUBRDUP   
 | 
				
			||||||
 | 
					#undef VMADDSUBIDUP   
 | 
				
			||||||
 | 
					#undef VMULRDUP   
 | 
				
			||||||
 | 
					#undef VMULIDUP   
 | 
				
			||||||
 | 
					#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) 
 | 
				
			||||||
 | 
					#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) 
 | 
				
			||||||
 | 
					#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)      
 | 
				
			||||||
 | 
					#define VMULIDUP(O,P,B,accum)     VMULIDUPf(O,P,B,accum) 
 | 
				
			||||||
 | 
					   
 | 
				
			||||||
							
								
								
									
										849
									
								
								lib/simd/Intel512wilson.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										849
									
								
								lib/simd/Intel512wilson.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,849 @@
 | 
				
			|||||||
 | 
					/*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512wilson.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#ifndef GRID_ASM_INTEL_512_QCD_H
 | 
				
			||||||
 | 
					#define GRID_ASM_INTEL_512_QCD_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Register allocations for the Wilson kernel are precision independent
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					#define result_00 %zmm0 
 | 
				
			||||||
 | 
					#define result_01 %zmm1
 | 
				
			||||||
 | 
					#define result_02 %zmm2
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					#define result_10 %zmm3
 | 
				
			||||||
 | 
					#define result_11 %zmm4
 | 
				
			||||||
 | 
					#define result_12 %zmm5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define result_20 %zmm6
 | 
				
			||||||
 | 
					#define result_21 %zmm7
 | 
				
			||||||
 | 
					#define result_22 %zmm8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define result_30 %zmm9
 | 
				
			||||||
 | 
					#define result_31 %zmm10
 | 
				
			||||||
 | 
					#define result_32 %zmm11
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Chi_00 %zmm12  
 | 
				
			||||||
 | 
					#define Chi_01 %zmm13
 | 
				
			||||||
 | 
					#define Chi_02 %zmm14
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Chi_10 %zmm15
 | 
				
			||||||
 | 
					#define Chi_11 %zmm16
 | 
				
			||||||
 | 
					#define Chi_12 %zmm17  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define UChi_00 %zmm18 
 | 
				
			||||||
 | 
					#define UChi_01 %zmm19
 | 
				
			||||||
 | 
					#define UChi_02 %zmm20
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define UChi_10 %zmm21
 | 
				
			||||||
 | 
					#define UChi_11 %zmm22
 | 
				
			||||||
 | 
					#define UChi_12 %zmm23 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Uir %zmm24 
 | 
				
			||||||
 | 
					#define Uri %zmm25  
 | 
				
			||||||
 | 
					// T1 names the same register as Uir (%zmm24), so the two cannot hold live values at once.
					#define T1 %zmm24
 | 
				
			||||||
 | 
					// T2 names the same register as Uri (%zmm25), so the two cannot hold live values at once.
					#define T2 %zmm25
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Z0 %zmm26
 | 
				
			||||||
 | 
					#define Z1 %zmm27
 | 
				
			||||||
 | 
					#define Z2 %zmm28
 | 
				
			||||||
 | 
					#define Z3 %zmm29
 | 
				
			||||||
 | 
					#define Z4 %zmm30
 | 
				
			||||||
 | 
					#define Z5 %zmm31
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TMP is an alias of Chi_00 (%zmm12): writing TMP overwrites Chi_00.
					#define TMP Chi_00
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Chimu_00 Chi_00
 | 
				
			||||||
 | 
					#define Chimu_01 Chi_01
 | 
				
			||||||
 | 
					#define Chimu_02 Chi_02
 | 
				
			||||||
 | 
					#define Chimu_10 Chi_10
 | 
				
			||||||
 | 
					#define Chimu_11 Chi_11
 | 
				
			||||||
 | 
					#define Chimu_12 Chi_12
 | 
				
			||||||
 | 
					#define Chimu_20 UChi_00
 | 
				
			||||||
 | 
					#define Chimu_21 UChi_01
 | 
				
			||||||
 | 
					#define Chimu_22 UChi_02
 | 
				
			||||||
 | 
					#define Chimu_30 UChi_10
 | 
				
			||||||
 | 
					#define Chimu_31 UChi_11
 | 
				
			||||||
 | 
					#define Chimu_32 UChi_12
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <simd/Intel512common.h>
 | 
				
			||||||
 | 
					#include <simd/Intel512avx.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Macros used to build wilson kernel -- can rationalise and simplify
 | 
				
			||||||
 | 
					// a little as some duplication developed during trying different
 | 
				
			||||||
 | 
					// variants during optimisation. Could cut back to only those used.
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//  const SiteSpinor * ptr = & in._odata[offset];	
 | 
				
			||||||
 | 
					// Sets up %r8 from PTR with LOAD64 and opens the asm statement, in the same way
					// as LOAD_CHI(PTR). LOAD_CHIMUi is an object-like macro whose expansion already
					// ends with the closing `);`, so no closing parenthesis is written on this line.
					#define LOAD_CHIMU(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIMUi
 | 
				
			||||||
 | 
					#define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 | 
				
			||||||
 | 
					#define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 | 
				
			||||||
 | 
					#define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
 | 
				
			||||||
 | 
					#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Object-like macro (takes no arguments): expands to LOAD_CHIMU01i followed by
					// LOAD_CHIMU23i and then the tokens `);`.
					// It closes an asm statement that it does not open, so whatever expands it must
					// already have emitted the matching `__asm__ (`.
					#define LOAD_CHIMUi \
 | 
				
			||||||
 | 
						   LOAD_CHIMU01i	\
 | 
				
			||||||
 | 
						   LOAD_CHIMU23i	);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Six VLOADs at offsets 0..5 from %r8 into Chimu_00..Chimu_12,
					// which are aliases of Chi_00..Chi_12.
					// %r8 is not set here; the expanding code must have loaded it beforehand.
					#define LOAD_CHIMU01i\
 | 
				
			||||||
 | 
						   VLOAD(0,%r8,Chimu_00)		\
 | 
				
			||||||
 | 
						   VLOAD(1,%r8,Chimu_01)		\
 | 
				
			||||||
 | 
						   VLOAD(2,%r8,Chimu_02)		\
 | 
				
			||||||
 | 
						   VLOAD(3,%r8,Chimu_10)		\
 | 
				
			||||||
 | 
						   VLOAD(4,%r8,Chimu_11)		\
 | 
				
			||||||
 | 
						   VLOAD(5,%r8,Chimu_12)		
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Six VLOADs at offsets 6..11 from %r8 into Chimu_20..Chimu_32.
					// Those names are aliases of UChi_00..UChi_12, so this overwrites the UChi registers.
					// %r8 is not set here; the expanding code must have loaded it beforehand.
					#define LOAD_CHIMU23i\
 | 
				
			||||||
 | 
						   VLOAD(6,%r8,Chimu_20)		\
 | 
				
			||||||
 | 
						   VLOAD(7,%r8,Chimu_21)		\
 | 
				
			||||||
 | 
						   VLOAD(8,%r8,Chimu_22)		\
 | 
				
			||||||
 | 
						   VLOAD(9,%r8,Chimu_30)		\
 | 
				
			||||||
 | 
						   VLOAD(10,%r8,Chimu_31)		\
 | 
				
			||||||
 | 
						   VLOAD(11,%r8,Chimu_32)		
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// SHUF_CHIMU23i: same slots and destinations as LOAD_CHIMU23i (6..11 into
					// Chimu_20..Chimu_32) but issued through VSHUFMEM instead of VLOAD.
					// VSHUFMEM is defined elsewhere in this file.
					#define SHUF_CHIMU23i \
					  VSHUFMEM(6,%r8,Chimu_20) \
					  VSHUFMEM(7,%r8,Chimu_21) \
					  VSHUFMEM(8,%r8,Chimu_22) \
					  VSHUFMEM(9,%r8,Chimu_30) \
					  VSHUFMEM(10,%r8,Chimu_31) \
					  VSHUFMEM(11,%r8,Chimu_32)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//  const SiteHalfSpinor *ptr = &buf[offset];	
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// LOAD_CHIi: instruction list (no __asm__ wrapper of its own) issuing
					// VLOAD for slots 0..5 relative to %r8 into Chi_00..Chi_12.
					#define LOAD_CHIi \
					  VLOAD(0,%r8,Chi_00) \
					  VLOAD(1,%r8,Chi_01) \
					  VLOAD(2,%r8,Chi_02) \
					  VLOAD(3,%r8,Chi_10) \
					  VLOAD(4,%r8,Chi_11) \
					  VLOAD(5,%r8,Chi_12)
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// SAVE_UCHIi(PTR): LOAD64(%r8,PTR), then one __asm__ statement issuing
					// VSTORE of UChi_00..UChi_12 to slots 0..5 relative to %r8.
					#define SAVE_UCHIi(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    VSTORE(0,%r8,UChi_00) \
					    VSTORE(1,%r8,UChi_01) \
					    VSTORE(2,%r8,UChi_02) \
					    VSTORE(3,%r8,UChi_10) \
					    VSTORE(4,%r8,UChi_11) \
					    VSTORE(5,%r8,UChi_12) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// SAVE_CHIi(PTR): LOAD64(%r8,PTR), then one __asm__ statement issuing
					// VSTORE of Chi_00..Chi_12 to slots 0..5 relative to %r8.
					#define SAVE_CHIi(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    VSTORE(0,%r8,Chi_00) \
					    VSTORE(1,%r8,Chi_01) \
					    VSTORE(2,%r8,Chi_02) \
					    VSTORE(3,%r8,Chi_10) \
					    VSTORE(4,%r8,Chi_11) \
					    VSTORE(5,%r8,Chi_12) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// SAVE_RESULTi(PTR): LOAD64(%r8,PTR), then one __asm__ statement issuing
					// VSTORE of all twelve result_00..result_32 to slots 0..11 relative to %r8.
					#define SAVE_RESULTi(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    VSTORE(0,%r8,result_00) \
					    VSTORE(1,%r8,result_01) \
					    VSTORE(2,%r8,result_02) \
					    VSTORE(3,%r8,result_10) \
					    VSTORE(4,%r8,result_11) \
					    VSTORE(5,%r8,result_12) \
					    VSTORE(6,%r8,result_20) \
					    VSTORE(7,%r8,result_21) \
					    VSTORE(8,%r8,result_22) \
					    VSTORE(9,%r8,result_30) \
					    VSTORE(10,%r8,result_31) \
					    VSTORE(11,%r8,result_32) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// MULT_2SPIN_DIR_PF<dir>(DIR,PF): forwards to MULT_2SPIN_PF<dir> with the
					// address &U._odata[sU](DIR) as first argument and PF passed through.
					// `U` and `sU` are taken from the scope where the macro is expanded.
					#define MULT_2SPIN_DIR_PFXP(DIR,PF) MULT_2SPIN_PFXP(&U._odata[sU](DIR),PF)
					#define MULT_2SPIN_DIR_PFYP(DIR,PF) MULT_2SPIN_PFYP(&U._odata[sU](DIR),PF)
					#define MULT_2SPIN_DIR_PFZP(DIR,PF) MULT_2SPIN_PFZP(&U._odata[sU](DIR),PF)
					#define MULT_2SPIN_DIR_PFTP(DIR,PF) MULT_2SPIN_PFTP(&U._odata[sU](DIR),PF)

					#define MULT_2SPIN_DIR_PFXM(DIR,PF) MULT_2SPIN_PFXM(&U._odata[sU](DIR),PF)
					#define MULT_2SPIN_DIR_PFYM(DIR,PF) MULT_2SPIN_PFYM(&U._odata[sU](DIR),PF)
					#define MULT_2SPIN_DIR_PFZM(DIR,PF) MULT_2SPIN_PFZM(&U._odata[sU](DIR),PF)
					#define MULT_2SPIN_DIR_PFTM(DIR,PF) MULT_2SPIN_PFTM(&U._odata[sU](DIR),PF)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// All eight per-direction variants currently expand to the same
					// MULT_2SPIN(PTR,PF) call; the direction suffix makes no difference here.
					#define MULT_2SPIN_PFXP(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFYP(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFZP(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFTP(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFXM(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFYM(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFZM(PTR,PF) MULT_2SPIN(PTR,PF)
					#define MULT_2SPIN_PFTM(PTR,PF) MULT_2SPIN(PTR,PF)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Dirac algebra
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//      hspin(0)=fspin(0)+timesI(fspin(3));
 | 
				
			||||||
 | 
					//      hspin(1)=fspin(1)+timesI(fspin(2));
 | 
				
			||||||
 | 
					// XP_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// LOAD_CHIi and SHUF_CHIMU23i followed by a VACCTIMESI1 pass and a
					// VACCTIMESI2 pass, pairing Chi_0c with Chimu_3c and Chi_1c with Chimu_2c
					// (c = 0,1,2), each Chi_xx appearing as both first and second argument.
					#define XP_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    LOAD_CHIi \
					    SHUF_CHIMU23i \
					    VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
					    VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
					    VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
					    VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
					    VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
					    VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
					    VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
					    VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
					    VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
					    VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
					    VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
					    VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// YP_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// LOAD_CHIMU01i followed by VSUBMEM on slots 9..11 (Chimu_0c -> Chi_0c)
					// and VADDMEM on slots 6..8 (Chimu_1c -> Chi_1c).
					#define YP_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    LOAD_CHIMU01i \
					    VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
					    VSUBMEM(10,%r8,Chimu_01,Chi_01) \
					    VSUBMEM(11,%r8,Chimu_02,Chi_02) \
					    VADDMEM(6,%r8,Chimu_10,Chi_10) \
					    VADDMEM(7,%r8,Chimu_11,Chi_11) \
					    VADDMEM(8,%r8,Chimu_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ZP_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// LOAD_CHIi and SHUF_CHIMU23i, then a "1" pass and a "2" pass in which
					// Chi_0c is paired with Chimu_2c via VACCTIMESI and Chi_1c with Chimu_3c
					// via VACCTIMESMINUSI (c = 0,1,2).
					#define ZP_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    LOAD_CHIi \
					    SHUF_CHIMU23i \
					    VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
					    VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
					    VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
					    VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
					    VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
					    VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
					    VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
					    VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
					    VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
					    VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
					    VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
					    VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TP_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// LOAD_CHIMU01i followed by VADDMEM on slots 6..11, taking Chimu_00..12
					// as source operand and Chi_00..12 as destination operand.
					#define TP_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    LOAD_CHIMU01i \
					    VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
					    VADDMEM(7,%r8,Chimu_01,Chi_01) \
					    VADDMEM(8,%r8,Chimu_02,Chi_02) \
					    VADDMEM(9,%r8,Chimu_10,Chi_10) \
					    VADDMEM(10,%r8,Chimu_11,Chi_11) \
					    VADDMEM(11,%r8,Chimu_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//      hspin(0)=fspin(0)-timesI(fspin(3))
 | 
				
			||||||
 | 
					//      hspin(1)=fspin(1)-timesI(fspin(2))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// XM_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// SHUF_CHIMU23i and LOAD_CHIi (in that order) followed by a
					// VACCTIMESMINUSI1 pass and a VACCTIMESMINUSI2 pass, pairing Chi_0c with
					// Chimu_3c and Chi_1c with Chimu_2c (c = 0,1,2).
					#define XM_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    SHUF_CHIMU23i \
					    LOAD_CHIi \
					    VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30) \
					    VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31) \
					    VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32) \
					    VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20) \
					    VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21) \
					    VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22) \
					    VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30) \
					    VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31) \
					    VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32) \
					    VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20) \
					    VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21) \
					    VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// YM_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// LOAD_CHIMU01i followed by VADDMEM on slots 9..11 (Chimu_0c -> Chi_0c)
					// and VSUBMEM on slots 6..8 (Chimu_1c -> Chi_1c); i.e. YP_PROJMEM with
					// the add/sub roles exchanged.
					#define YM_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    LOAD_CHIMU01i \
					    VADDMEM(9,%r8 ,Chimu_00,Chi_00) \
					    VADDMEM(10,%r8,Chimu_01,Chi_01) \
					    VADDMEM(11,%r8,Chimu_02,Chi_02) \
					    VSUBMEM(6,%r8,Chimu_10,Chi_10) \
					    VSUBMEM(7,%r8,Chimu_11,Chi_11) \
					    VSUBMEM(8,%r8,Chimu_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ZM_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// SHUF_CHIMU23i and LOAD_CHIi (in that order), then a "1" pass and a "2"
					// pass in which Chi_0c is paired with Chimu_2c via VACCTIMESMINUSI and
					// Chi_1c with Chimu_3c via VACCTIMESI (c = 0,1,2).
					#define ZM_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    SHUF_CHIMU23i \
					    LOAD_CHIi \
					    VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \
					    VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \
					    VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \
					    VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \
					    VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \
					    VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \
					    VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \
					    VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \
					    VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \
					    VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \
					    VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \
					    VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TM_PROJMEM(PTR): LOAD64(%r8,PTR), then in one __asm__ statement runs
					// LOAD_CHIMU01i followed by VSUBMEM on slots 6..11, taking Chimu_00..12
					// as source operand and Chi_00..12 as destination operand.
					#define TM_PROJMEM(PTR) \
					  LOAD64(%r8,PTR) __asm__ ( \
					    LOAD_CHIMU01i \
					    VSUBMEM(6,%r8,Chimu_00,Chi_00) \
					    VSUBMEM(7,%r8,Chimu_01,Chi_01) \
					    VSUBMEM(8,%r8,Chimu_02,Chi_02) \
					    VSUBMEM(9,%r8,Chimu_10,Chi_10) \
					    VSUBMEM(10,%r8,Chimu_11,Chi_11) \
					    VSUBMEM(11,%r8,Chimu_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//      fspin(0)=hspin(0)
 | 
				
			||||||
 | 
					//      fspin(1)=hspin(1)
 | 
				
			||||||
 | 
					//      fspin(2)=timesMinusI(hspin(1))
 | 
				
			||||||
 | 
					//      fspin(3)=timesMinusI(hspin(0))
 | 
				
			||||||
 | 
					// XP_RECON: a single __asm__ statement.  After VZERO(TMP) it issues the
					// three-stage VTIMESMINUSI0/1/2 sequence from UChi_0c into result_3c and
					// from UChi_1c into result_2c (using TMP), with VMOV of UChi_sc into
					// result_sc (s = 0,1) placed between stage 0 and stage 1.
					#define XP_RECON __asm__ ( \
					    VZERO(TMP) \
					    VTIMESMINUSI0(UChi_00,result_30,TMP) \
					    VTIMESMINUSI0(UChi_10,result_20,TMP) \
					    VTIMESMINUSI0(UChi_01,result_31,TMP) \
					    VTIMESMINUSI0(UChi_11,result_21,TMP) \
					    VTIMESMINUSI0(UChi_02,result_32,TMP) \
					    VTIMESMINUSI0(UChi_12,result_22,TMP) \
					    VMOV(UChi_00,result_00) \
					    VMOV(UChi_10,result_10) \
					    VMOV(UChi_01,result_01) \
					    VMOV(UChi_11,result_11) \
					    VMOV(UChi_02,result_02) \
					    VMOV(UChi_12,result_12) \
					    VTIMESMINUSI1(UChi_10,result_20,TMP) \
					    VTIMESMINUSI1(UChi_11,result_21,TMP) \
					    VTIMESMINUSI1(UChi_12,result_22,TMP) \
					    VTIMESMINUSI1(UChi_00,result_30,TMP) \
					    VTIMESMINUSI1(UChi_01,result_31,TMP) \
					    VTIMESMINUSI1(UChi_02,result_32,TMP) \
					    VTIMESMINUSI2(UChi_10,result_20,TMP) \
					    VTIMESMINUSI2(UChi_11,result_21,TMP) \
					    VTIMESMINUSI2(UChi_12,result_22,TMP) \
					    VTIMESMINUSI2(UChi_00,result_30,TMP) \
					    VTIMESMINUSI2(UChi_01,result_31,TMP) \
					    VTIMESMINUSI2(UChi_02,result_32,TMP) \
					  );
 | 
				
			||||||
 | 
					  // NB could save 6 ops using addsub => 12 cycles
 | 
				
			||||||
 | 
					// XP_RECON_ACCUM: a single __asm__ statement.  Issues the three-stage
					// VACCTIMESMINUSI0/1/2 sequence from UChi_0c into result_3c (scratch
					// Z3..Z5) and from UChi_1c into result_2c (scratch Z0..Z2), with
					// VADD(UChi_sc,result_sc,result_sc) for s = 0,1 between stages 0 and 1.
					// NOTE(review): VZERO(TMP) is issued first although no later line in this
					// macro names TMP -- confirm whether the VACCTIMESMINUSI* helpers need it.
					#define XP_RECON_ACCUM __asm__ ( \
					    VZERO(TMP) \
					    VACCTIMESMINUSI0(UChi_00,result_30,Z3) \
					    VACCTIMESMINUSI0(UChi_10,result_20,Z0) \
					    VACCTIMESMINUSI0(UChi_01,result_31,Z4) \
					    VACCTIMESMINUSI0(UChi_11,result_21,Z1) \
					    VACCTIMESMINUSI0(UChi_02,result_32,Z5) \
					    VACCTIMESMINUSI0(UChi_12,result_22,Z2) \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VACCTIMESMINUSI1(UChi_00,result_30,Z3) \
					    VACCTIMESMINUSI1(UChi_10,result_20,Z0) \
					    VACCTIMESMINUSI1(UChi_01,result_31,Z4) \
					    VACCTIMESMINUSI1(UChi_11,result_21,Z1) \
					    VACCTIMESMINUSI1(UChi_02,result_32,Z5) \
					    VACCTIMESMINUSI1(UChi_12,result_22,Z2) \
					    VACCTIMESMINUSI2(UChi_10,result_20,Z0) \
					    VACCTIMESMINUSI2(UChi_11,result_21,Z1) \
					    VACCTIMESMINUSI2(UChi_12,result_22,Z2) \
					    VACCTIMESMINUSI2(UChi_00,result_30,Z3) \
					    VACCTIMESMINUSI2(UChi_01,result_31,Z4) \
					    VACCTIMESMINUSI2(UChi_02,result_32,Z5) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// XM_RECON: a single __asm__ statement.  After VZERO(TMP) it issues the
					// three-stage VTIMESI0/1/2 sequence from UChi_0c into result_3c and from
					// UChi_1c into result_2c (using TMP), with VMOV of UChi_sc into result_sc
					// (s = 0,1) placed between stage 0 and stage 1.
					#define XM_RECON __asm__ ( \
					    VZERO(TMP) \
					    VTIMESI0(UChi_00,result_30,TMP) \
					    VTIMESI0(UChi_10,result_20,TMP) \
					    VTIMESI0(UChi_01,result_31,TMP) \
					    VTIMESI0(UChi_11,result_21,TMP) \
					    VTIMESI0(UChi_02,result_32,TMP) \
					    VTIMESI0(UChi_12,result_22,TMP) \
					    VMOV(UChi_00,result_00) \
					    VMOV(UChi_10,result_10) \
					    VMOV(UChi_01,result_01) \
					    VMOV(UChi_11,result_11) \
					    VMOV(UChi_02,result_02) \
					    VMOV(UChi_12,result_12) \
					    VTIMESI1(UChi_00,result_30,TMP) \
					    VTIMESI1(UChi_10,result_20,TMP) \
					    VTIMESI1(UChi_01,result_31,TMP) \
					    VTIMESI1(UChi_11,result_21,TMP) \
					    VTIMESI1(UChi_02,result_32,TMP) \
					    VTIMESI1(UChi_12,result_22,TMP) \
					    VTIMESI2(UChi_10,result_20,TMP) \
					    VTIMESI2(UChi_11,result_21,TMP) \
					    VTIMESI2(UChi_12,result_22,TMP) \
					    VTIMESI2(UChi_00,result_30,TMP) \
					    VTIMESI2(UChi_01,result_31,TMP) \
					    VTIMESI2(UChi_02,result_32,TMP) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// XM_RECON_ACCUM: a single __asm__ statement.  Issues the three-stage
					// VACCTIMESI0/1/2 sequence from UChi_1c into result_2c (scratch Z0..Z2)
					// and from UChi_0c into result_3c (scratch Z3..Z5), with
					// VADD(UChi_sc,result_sc,result_sc) for s = 0,1 between stages 0 and 1.
					#define XM_RECON_ACCUM __asm__ ( \
					    VACCTIMESI0(UChi_10,result_20,Z0) \
					    VACCTIMESI0(UChi_00,result_30,Z3) \
					    VACCTIMESI0(UChi_11,result_21,Z1) \
					    VACCTIMESI0(UChi_01,result_31,Z4) \
					    VACCTIMESI0(UChi_12,result_22,Z2) \
					    VACCTIMESI0(UChi_02,result_32,Z5) \
					    \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_12,result_12,result_12) \
					    VADD(UChi_02,result_02,result_02) \
					    \
					    VACCTIMESI1(UChi_10,result_20,Z0) \
					    VACCTIMESI1(UChi_00,result_30,Z3) \
					    VACCTIMESI1(UChi_11,result_21,Z1) \
					    VACCTIMESI1(UChi_01,result_31,Z4) \
					    VACCTIMESI1(UChi_12,result_22,Z2) \
					    VACCTIMESI1(UChi_02,result_32,Z5) \
					    VACCTIMESI2(UChi_10,result_20,Z0) \
					    VACCTIMESI2(UChi_11,result_21,Z1) \
					    VACCTIMESI2(UChi_12,result_22,Z2) \
					    VACCTIMESI2(UChi_00,result_30,Z3) \
					    VACCTIMESI2(UChi_01,result_31,Z4) \
					    VACCTIMESI2(UChi_02,result_32,Z5) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// YP_RECON_ACCUM: a single __asm__ statement.  VADD of UChi_sc with
					// result_sc (s = 0,1), then VADD of UChi_1c with result_2c and VSUB of
					// UChi_0c with result_3c; each result register is both the second and
					// third argument.
					#define YP_RECON_ACCUM __asm__ ( \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VADD(UChi_10,result_20,result_20) \
					    VADD(UChi_11,result_21,result_21) \
					    VADD(UChi_12,result_22,result_22) \
					    VSUB(UChi_00,result_30,result_30) \
					    VSUB(UChi_01,result_31,result_31) \
					    VSUB(UChi_02,result_32,result_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// YM_RECON_ACCUM: a single __asm__ statement.  VADD of UChi_sc with
					// result_sc (s = 0,1), then VSUB of UChi_1c with result_2c and VADD of
					// UChi_0c with result_3c; i.e. YP_RECON_ACCUM with the lower-half
					// add/sub roles exchanged.
					#define YM_RECON_ACCUM __asm__ ( \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VSUB(UChi_10,result_20,result_20) \
					    VSUB(UChi_11,result_21,result_21) \
					    VSUB(UChi_12,result_22,result_22) \
					    VADD(UChi_00,result_30,result_30) \
					    VADD(UChi_01,result_31,result_31) \
					    VADD(UChi_02,result_32,result_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ZP_RECON_ACCUM: a single __asm__ statement.  Three-stage sequence using
					// VACCTIMESMINUSI0/1/2 from UChi_0c into result_2c (scratch Z0..Z2) and
					// VACCTIMESI0/1/2 from UChi_1c into result_3c (scratch Z3..Z5), with
					// VADD(UChi_sc,result_sc,result_sc) for s = 0,1 between stages 0 and 1.
					#define ZP_RECON_ACCUM __asm__ ( \
					    VACCTIMESMINUSI0(UChi_00,result_20,Z0) \
					    VACCTIMESI0(UChi_10,result_30,Z3) \
					    VACCTIMESMINUSI0(UChi_01,result_21,Z1) \
					    VACCTIMESI0(UChi_11,result_31,Z4) \
					    VACCTIMESMINUSI0(UChi_02,result_22,Z2) \
					    VACCTIMESI0(UChi_12,result_32,Z5) \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VACCTIMESMINUSI1(UChi_00,result_20,Z0) \
					    VACCTIMESI1(UChi_10,result_30,Z3) \
					    VACCTIMESMINUSI1(UChi_01,result_21,Z1) \
					    VACCTIMESI1(UChi_11,result_31,Z4) \
					    VACCTIMESMINUSI1(UChi_02,result_22,Z2) \
					    VACCTIMESI1(UChi_12,result_32,Z5) \
					    VACCTIMESMINUSI2(UChi_00,result_20,Z0) \
					    VACCTIMESMINUSI2(UChi_01,result_21,Z1) \
					    VACCTIMESMINUSI2(UChi_02,result_22,Z2) \
					    VACCTIMESI2(UChi_10,result_30,Z3) \
					    VACCTIMESI2(UChi_11,result_31,Z4) \
					    VACCTIMESI2(UChi_12,result_32,Z5) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ZM_RECON_ACCUM: a single __asm__ statement.  Three-stage sequence using
					// VACCTIMESI0/1/2 from UChi_0c into result_2c (scratch Z0..Z2) and
					// VACCTIMESMINUSI0/1/2 from UChi_1c into result_3c (scratch Z3..Z5), with
					// VADD(UChi_sc,result_sc,result_sc) for s = 0,1 between stages 0 and 1.
					#define ZM_RECON_ACCUM __asm__ ( \
					    VACCTIMESI0(UChi_00,result_20,Z0) \
					    VACCTIMESMINUSI0(UChi_10,result_30,Z3) \
					    VACCTIMESI0(UChi_01,result_21,Z1) \
					    VACCTIMESMINUSI0(UChi_11,result_31,Z4) \
					    VACCTIMESI0(UChi_02,result_22,Z2) \
					    VACCTIMESMINUSI0(UChi_12,result_32,Z5) \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VACCTIMESI1(UChi_00,result_20,Z0) \
					    VACCTIMESMINUSI1(UChi_10,result_30,Z3) \
					    VACCTIMESI1(UChi_01,result_21,Z1) \
					    VACCTIMESMINUSI1(UChi_11,result_31,Z4) \
					    VACCTIMESI1(UChi_02,result_22,Z2) \
					    VACCTIMESMINUSI1(UChi_12,result_32,Z5) \
					    VACCTIMESI2(UChi_00,result_20,Z0) \
					    VACCTIMESI2(UChi_01,result_21,Z1) \
					    VACCTIMESI2(UChi_02,result_22,Z2) \
					    VACCTIMESMINUSI2(UChi_10,result_30,Z3) \
					    VACCTIMESMINUSI2(UChi_11,result_31,Z4) \
					    VACCTIMESMINUSI2(UChi_12,result_32,Z5) \
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TP_RECON_ACCUM: a single __asm__ statement.  VADD of UChi_sc with
					// result_sc (s = 0,1), then VADD of UChi_0c with result_2c and of UChi_1c
					// with result_3c; each result register is both the second and third
					// argument.
					#define TP_RECON_ACCUM __asm__ ( \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VADD(UChi_00,result_20,result_20) \
					    VADD(UChi_10,result_30,result_30) \
					    VADD(UChi_01,result_21,result_21) \
					    VADD(UChi_11,result_31,result_31) \
					    VADD(UChi_02,result_22,result_22) \
					    VADD(UChi_12,result_32,result_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// TM_RECON_ACCUM: a single __asm__ statement.  VADD of UChi_sc with
					// result_sc (s = 0,1), then VSUB of UChi_0c with result_2c and of UChi_1c
					// with result_3c; i.e. TP_RECON_ACCUM with VSUB on the lower half.
					#define TM_RECON_ACCUM __asm__ ( \
					    VADD(UChi_00,result_00,result_00) \
					    VADD(UChi_10,result_10,result_10) \
					    VADD(UChi_01,result_01,result_01) \
					    VADD(UChi_11,result_11,result_11) \
					    VADD(UChi_02,result_02,result_02) \
					    VADD(UChi_12,result_12,result_12) \
					    VSUB(UChi_00,result_20,result_20) \
					    VSUB(UChi_10,result_30,result_30) \
					    VSUB(UChi_01,result_21,result_21) \
					    VSUB(UChi_11,result_31,result_31) \
					    VSUB(UChi_02,result_22,result_22) \
					    VSUB(UChi_12,result_32,result_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// PREFETCH_CHIMU(PTR): LOAD64(%r9,PTR), then one __asm__ statement issuing
					// VPREFETCHG for slots 12..23 relative to %r9.  Note this macro uses %r9,
					// not the %r8 used by the load/store macros.
					#define PREFETCH_CHIMU(PTR) \
					  LOAD64(%r9,PTR) __asm__ ( \
					    VPREFETCHG(12,%r9) \
					    VPREFETCHG(13,%r9) \
					    VPREFETCHG(14,%r9) \
					    VPREFETCHG(15,%r9) \
					    VPREFETCHG(16,%r9) \
					    VPREFETCHG(17,%r9) \
					    VPREFETCHG(18,%r9) \
					    VPREFETCHG(19,%r9) \
					    VPREFETCHG(20,%r9) \
					    VPREFETCHG(21,%r9) \
					    VPREFETCHG(22,%r9) \
					    VPREFETCHG(23,%r9));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// PERMUTE_DIR0: one __asm__ statement applying VPERM0 to each of
					// Chi_00..Chi_12, with the same register as both arguments.
					#define PERMUTE_DIR0 __asm__ ( \
					    VPERM0(Chi_00,Chi_00) \
					    VPERM0(Chi_01,Chi_01) \
					    VPERM0(Chi_02,Chi_02) \
					    VPERM0(Chi_10,Chi_10) \
					    VPERM0(Chi_11,Chi_11) \
					    VPERM0(Chi_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// PERMUTE_DIR1: one __asm__ statement applying VPERM1 to each of
					// Chi_00..Chi_12, with the same register as both arguments.
					#define PERMUTE_DIR1 __asm__ ( \
					    VPERM1(Chi_00,Chi_00) \
					    VPERM1(Chi_01,Chi_01) \
					    VPERM1(Chi_02,Chi_02) \
					    VPERM1(Chi_10,Chi_10) \
					    VPERM1(Chi_11,Chi_11) \
					    VPERM1(Chi_12,Chi_12));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// PERMUTE_DIR2: one __asm__ statement applying VPERM2 to each of
					// Chi_00..Chi_12, with the same register as both arguments.
					#define PERMUTE_DIR2 __asm__ ( \
					    VPERM2(Chi_00,Chi_00) \
					    VPERM2(Chi_01,Chi_01) \
					    VPERM2(Chi_02,Chi_02) \
					    VPERM2(Chi_10,Chi_10) \
					    VPERM2(Chi_11,Chi_11) \
					    VPERM2(Chi_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// PERMUTE_DIR3: one __asm__ statement applying VPERM3 to each of
					// Chi_00..Chi_12, with the same register as both arguments.
					#define PERMUTE_DIR3 __asm__ ( \
					    VPERM3(Chi_00,Chi_00) \
					    VPERM3(Chi_01,Chi_01) \
					    VPERM3(Chi_02,Chi_02) \
					    VPERM3(Chi_10,Chi_10) \
					    VPERM3(Chi_11,Chi_11) \
					    VPERM3(Chi_12,Chi_12) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MULT_ADDSUB_2SPIN(ptr,pf)					\
 | 
				
			||||||
 | 
					  LOAD64(%r8,ptr)						\
 | 
				
			||||||
 | 
					  LOAD64(%r9,pf)						\
 | 
				
			||||||
 | 
						   __asm__ (						\
 | 
				
			||||||
 | 
						   VPREFETCH2(9,%r8)				   \
 | 
				
			||||||
 | 
						   VPREFETCH2(10,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(11,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(12,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(13,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(14,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(15,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(16,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(17,%r8)					   \
 | 
				
			||||||
 | 
						   VSHUF(Chi_00,T1)				\
 | 
				
			||||||
 | 
						   VMOVIDUP(0,%r8,Z0 )					\
 | 
				
			||||||
 | 
					           VMOVIDUP(3,%r8,Z1 )					\
 | 
				
			||||||
 | 
					           VMOVIDUP(6,%r8,Z2 )	          VSHUF(Chi_10,T2)		\
 | 
				
			||||||
 | 
						   /*6*/							\
 | 
				
			||||||
 | 
					           VMUL(Z0,T1,UChi_00)            VMOVRDUP(0,%r8,Z3 )	\
 | 
				
			||||||
 | 
					           VMUL(Z0,T2,UChi_10)            VMOVRDUP(3,%r8,Z4 )	\
 | 
				
			||||||
 | 
					           VMUL(Z1,T1,UChi_01)            VMOVRDUP(6,%r8,Z5 )	\
 | 
				
			||||||
 | 
					           VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 )	\
 | 
				
			||||||
 | 
					           VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 )	\
 | 
				
			||||||
 | 
					           VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 )	\
 | 
				
			||||||
 | 
						   VPREFETCHG(0,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(1,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(2,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(3,%r9)					   \
 | 
				
			||||||
 | 
						   /*18*/						\
 | 
				
			||||||
 | 
					           VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z3,Chi_10,UChi_10)				\
 | 
				
			||||||
 | 
					           VMADDSUB(Z4,Chi_00,UChi_01)    VMOVRDUP(1,%r8,Z3 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z4,Chi_10,UChi_11)    VSHUF(Chi_11,T2)	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z5,Chi_10,UChi_12)				\
 | 
				
			||||||
 | 
						   VPREFETCHG(4,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(5,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(6,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(7,%r9)					   \
 | 
				
			||||||
 | 
						   /*28*/						\
 | 
				
			||||||
 | 
					           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z0,T2,UChi_10)				\
 | 
				
			||||||
 | 
					           VMADDSUB(Z1,T1,UChi_01)        VMOVIDUP(2,%r8,Z0 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z1,T2,UChi_11)				\
 | 
				
			||||||
 | 
					           VMADDSUB(Z2,T1,UChi_02)        VMOVIDUP(5,%r8,Z1 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z2,T2,UChi_12)        VMOVIDUP(8,%r8,Z2 )	\
 | 
				
			||||||
 | 
						   VPREFETCH2(12,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(13,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(14,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(15,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(16,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(17,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(18,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(19,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(20,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(21,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(22,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(23,%r9)					   \
 | 
				
			||||||
 | 
					           /*38*/						\
 | 
				
			||||||
 | 
					           VMADDSUB(Z3,Chi_01,UChi_00)    VSHUF(Chi_02,T1)	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z3,Chi_11,UChi_10)				\
 | 
				
			||||||
 | 
					           VMADDSUB(Z4,Chi_01,UChi_01)    VMOVRDUP(2,%r8,Z3 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z4,Chi_11,UChi_11)    VSHUF(Chi_12,T2)	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 )	\
 | 
				
			||||||
 | 
					           VMADDSUB(Z5,Chi_11,UChi_12)				\
 | 
				
			||||||
 | 
						   VPREFETCHG(9,%r8)				   \
 | 
				
			||||||
 | 
						   VPREFETCHG(10,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(11,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(12,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(13,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(14,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(15,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(16,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(17,%r8)					   \
 | 
				
			||||||
 | 
						   /*48*/						\
 | 
				
			||||||
 | 
					           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
 | 
				
			||||||
 | 
					           VMADDSUB(Z0,T2,UChi_10)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z1,T1,UChi_01)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z1,T2,UChi_11)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z2,T1,UChi_02)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z2,T2,UChi_12)			      \
 | 
				
			||||||
 | 
						   VPREFETCHG(8,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(9,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(10,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(11,%r9)					   \
 | 
				
			||||||
 | 
						   /*55*/					      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z3,Chi_02,UChi_00)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z3,Chi_12,UChi_10)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z4,Chi_02,UChi_01)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z4,Chi_12,UChi_11)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z5,Chi_02,UChi_02)			      \
 | 
				
			||||||
 | 
					           VMADDSUB(Z5,Chi_12,UChi_12)			      \
 | 
				
			||||||
 | 
						   /*61 insns*/							);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MULT_ADDSUB_2SPIN_LS(ptr,pf)				   \
 | 
				
			||||||
 | 
					  LOAD64(%r8,ptr)						   \
 | 
				
			||||||
 | 
					  LOAD64(%r9,pf)						   \
 | 
				
			||||||
 | 
					  __asm__ (							   \
 | 
				
			||||||
 | 
					           VSHUF(Chi_00,T1)      VSHUF(Chi_10,T2)		   \
 | 
				
			||||||
 | 
					           VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
 | 
				
			||||||
 | 
					           VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
 | 
				
			||||||
 | 
					           VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
 | 
				
			||||||
 | 
						   VPREFETCHG(0,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(1,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(2,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(3,%r9)					   \
 | 
				
			||||||
 | 
						   /*8*/						   \
 | 
				
			||||||
 | 
					           VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
 | 
				
			||||||
 | 
						   VPREFETCHG(4,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(5,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(6,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(7,%r9)					   \
 | 
				
			||||||
 | 
						   /*16*/					  	   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
 | 
				
			||||||
 | 
						   VPREFETCHG(8,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(9,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(10,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(11,%r9)					   \
 | 
				
			||||||
 | 
					           /*22*/						   \
 | 
				
			||||||
 | 
					           VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
 | 
				
			||||||
 | 
						   VPREFETCH2(12,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(13,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(14,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(15,%r9)					   \
 | 
				
			||||||
 | 
						   /*30*/						   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
 | 
				
			||||||
 | 
						   VPREFETCH2(16,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(17,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(18,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(19,%r9)					   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 | 
				
			||||||
 | 
						   /*36*/					           \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
				
			||||||
 | 
						   VPREFETCH2(20,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(21,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(22,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(23,%r9)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(2,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCHG(3,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(4,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH2(5,%r8)					   \
 | 
				
			||||||
 | 
						   /*42 insns*/						);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \
 | 
				
			||||||
 | 
					  LOAD64(%r8,ptr)						   \
 | 
				
			||||||
 | 
					  LOAD64(%r9,pf)						   \
 | 
				
			||||||
 | 
					  __asm__ (							   \
 | 
				
			||||||
 | 
					           VSHUF(Chi_00,T1)      VSHUF(Chi_10,T2)		   \
 | 
				
			||||||
 | 
					           VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
 | 
				
			||||||
 | 
					           VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
 | 
				
			||||||
 | 
					           VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
 | 
				
			||||||
 | 
						   /*8*/						   \
 | 
				
			||||||
 | 
					           VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
 | 
				
			||||||
 | 
						   /*16*/					  	   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
 | 
				
			||||||
 | 
					           /*22*/						   \
 | 
				
			||||||
 | 
					           VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
 | 
				
			||||||
 | 
						   /*30*/						   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
 | 
				
			||||||
 | 
					           VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 | 
				
			||||||
 | 
						   /*36*/					           \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
				
			||||||
 | 
					           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
				
			||||||
 | 
						   /*	   VPREFETCHG(2,%r8)*/				   \
 | 
				
			||||||
 | 
						   /*	   VPREFETCHG(3,%r8)*/				   \
 | 
				
			||||||
 | 
						   /*42 insns*/						);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define Z6 Chi_00
 | 
				
			||||||
 | 
					#define MULT_ADDSUB_2SPIN_NEW(ptr,pf)			       \
 | 
				
			||||||
 | 
					  LOAD64(%r8,ptr)					       \
 | 
				
			||||||
 | 
					  __asm__ (							  \
 | 
				
			||||||
 | 
					   VSHUFMEM(0,%r8,Z0)					          \
 | 
				
			||||||
 | 
					   VRDUP(Chi_00,T1)           VIDUP(Chi_00,Chi_00)	          \
 | 
				
			||||||
 | 
					   VRDUP(Chi_10,T2)           VIDUP(Chi_10,Chi_10)		  \
 | 
				
			||||||
 | 
					   VMUL(Z0,Chi_00,Z1)         VMUL(Z0,Chi_10,Z2)		  \
 | 
				
			||||||
 | 
					   VSHUFMEM(3,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VMUL(Z0,Chi_00,Z3)         VMUL(Z0,Chi_10,Z4)		  \
 | 
				
			||||||
 | 
					   VSHUFMEM(6,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VMUL(Z0,Chi_00,Z5)         VMUL(Z0,Chi_10,Z6)		  \
 | 
				
			||||||
 | 
					   VMULMEM(0,%r8,T1,UChi_00)  VMULMEM(0,%r8,T2,UChi_10)		  \
 | 
				
			||||||
 | 
					   VMULMEM(3,%r8,T1,UChi_01)  VMULMEM(3,%r8,T2,UChi_11)		  \
 | 
				
			||||||
 | 
					   VMULMEM(6,%r8,T1,UChi_02)  VMULMEM(6,%r8,T2,UChi_12)		  \
 | 
				
			||||||
 | 
					   /*11 cycles*/						  \
 | 
				
			||||||
 | 
					   VSHUFMEM(1,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VRDUP(Chi_01,T1)           VIDUP(Chi_01,Chi_01)		  \
 | 
				
			||||||
 | 
					   VRDUP(Chi_11,T2)           VIDUP(Chi_11,Chi_11)		  \
 | 
				
			||||||
 | 
					   VMADD(Z0,Chi_01,Z1)        VMADD(Z0,Chi_11,Z2)		  \
 | 
				
			||||||
 | 
					   VSHUFMEM(4,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VMADD(Z0,Chi_01,Z3)        VMADD(Z0,Chi_11,Z4)		  \
 | 
				
			||||||
 | 
					   VSHUFMEM(7,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VMADD(Z0,Chi_01,Z5)        VMADD(Z0,Chi_11,Z6)		  \
 | 
				
			||||||
 | 
					   VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10)	  \
 | 
				
			||||||
 | 
					   VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11)	  \
 | 
				
			||||||
 | 
					   VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12)	  \
 | 
				
			||||||
 | 
					   /*22 cycles*/						  \
 | 
				
			||||||
 | 
					   VSHUFMEM(2,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VRDUP(Chi_02,T1)        VIDUP(Chi_02,Chi_02)			  \
 | 
				
			||||||
 | 
					   VRDUP(Chi_12,T2)        VIDUP(Chi_12,Chi_12)			  \
 | 
				
			||||||
 | 
					   VMADD(Z0,Chi_02,Z1)        VMADD(Z0,Chi_12,Z2)		  \
 | 
				
			||||||
 | 
					   VSHUFMEM(5,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VMADD(Z0,Chi_02,Z3)        VMADD(Z0,Chi_12,Z4)		  \
 | 
				
			||||||
 | 
					   VSHUFMEM(8,%r8,Z0)						  \
 | 
				
			||||||
 | 
					   VMADD(Z0,Chi_02,Z5)        VMADD(Z0,Chi_12,Z6)		  \
 | 
				
			||||||
 | 
					   /*33 cycles*/						  \
 | 
				
			||||||
 | 
					   VMADDSUBMEM(2,%r8,T1,Z1)   VMADDSUBMEM(2,%r8,T2,Z2)		  \
 | 
				
			||||||
 | 
					   VMADDSUBMEM(5,%r8,T1,Z3)   VMADDSUBMEM(5,%r8,T2,Z4)	          \
 | 
				
			||||||
 | 
					   VMADDSUBMEM(8,%r8,T1,Z5)   VMADDSUBMEM(8,%r8,T2,Z6)	       \
 | 
				
			||||||
 | 
					  /*stall*/						       \
 | 
				
			||||||
 | 
					  /*stall*/						       \
 | 
				
			||||||
 | 
					  /*stall*/						       \
 | 
				
			||||||
 | 
					  VADD(Z1,UChi_00,UChi_00)   VADD(Z2,UChi_10,UChi_10)	       \
 | 
				
			||||||
 | 
					  VADD(Z3,UChi_01,UChi_01)   VADD(Z4,UChi_11,UChi_11)	       \
 | 
				
			||||||
 | 
					  VADD(Z5,UChi_02,UChi_02)   VADD(Z6,UChi_12,UChi_12) )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
@@ -103,9 +103,11 @@ void LebesgueOrder::IterateI(int ND,
 | 
				
			|||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      for(int d=0;d<ND;d++){
 | 
					      for(int d=0;d<ND;d++){
 | 
				
			||||||
	x[d]=xi[d]+xo[d];
 | 
						x[d]=xi[d]+xo[d];
 | 
				
			||||||
 | 
					//	std::cout << x[d]<<" ";
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					//      std::cout << "\n";
 | 
				
			||||||
      IndexInteger index;
 | 
					      IndexInteger index;
 | 
				
			||||||
      grid->IndexFromCoor(x,index,grid->_rdimensions);
 | 
					      Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
 | 
				
			||||||
      _LebesgueReorder.push_back(index);
 | 
					      _LebesgueReorder.push_back(index);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  assert( _LebesgueReorder.size() == vol );
 | 
					  assert( _LebesgueReorder.size() == vol );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /*
 | 
				
			||||||
  std::vector<int> coor(4);
 | 
					  std::vector<int> coor(4);
 | 
				
			||||||
  for(IndexInteger asite=0;asite<vol;asite++){
 | 
					  for(IndexInteger asite=0;asite<vol;asite++){
 | 
				
			||||||
    grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
 | 
					    grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
 | 
				
			||||||
@@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
 | 
				
			|||||||
		<< coor[3]<<"]"
 | 
							<< coor[3]<<"]"
 | 
				
			||||||
		<<std::endl;
 | 
							<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  */
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -44,8 +44,8 @@ template<class vsimd,class scalar>
 | 
				
			|||||||
inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y, 
 | 
					inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y, 
 | 
				
			||||||
		    std::vector<scalar *> &extracted,int offset){
 | 
							    std::vector<scalar *> &extracted,int offset){
 | 
				
			||||||
  // FIXME: bounce off memory is painful
 | 
					  // FIXME: bounce off memory is painful
 | 
				
			||||||
 | 
					  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
  int Nsimd=vsimd::Nsimd();
 | 
					 | 
				
			||||||
  int s=Nsimd/Nextr;
 | 
					  int s=Nsimd/Nextr;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  scalar*buf = (scalar *)y;
 | 
					  scalar*buf = (scalar *)y;
 | 
				
			||||||
@@ -59,8 +59,10 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
				
			|||||||
template<class vsimd,class scalar>
 | 
					template<class vsimd,class scalar>
 | 
				
			||||||
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y, 
 | 
					inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y, 
 | 
				
			||||||
		  std::vector<scalar *> &extracted,int offset){
 | 
							  std::vector<scalar *> &extracted,int offset){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
  int Nsimd=vsimd::Nsimd();
 | 
					 | 
				
			||||||
  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
 | 
					  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
 | 
				
			||||||
                     // replicate n-fold. Use to allow Integer masks to 
 | 
					                     // replicate n-fold. Use to allow Integer masks to 
 | 
				
			||||||
                     // predicate floating point of various width assignments and maintain conformable.
 | 
					                     // predicate floating point of various width assignments and maintain conformable.
 | 
				
			||||||
@@ -85,6 +87,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
				
			|||||||
  scalar *buf = (scalar *)&y;
 | 
					  scalar *buf = (scalar *)&y;
 | 
				
			||||||
  for(int i=0;i<Nextr;i++){
 | 
					  for(int i=0;i<Nextr;i++){
 | 
				
			||||||
    extracted[i]=buf[i*s];
 | 
					    extracted[i]=buf[i*s];
 | 
				
			||||||
 | 
					#ifdef PARANOID
 | 
				
			||||||
    for(int ii=1;ii<s;ii++){
 | 
					    for(int ii=1;ii<s;ii++){
 | 
				
			||||||
      if ( buf[i*s]!=buf[i*s+ii] ){
 | 
					      if ( buf[i*s]!=buf[i*s+ii] ){
 | 
				
			||||||
	std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
 | 
						std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
 | 
				
			||||||
@@ -96,6 +99,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
      assert(buf[i*s]==buf[i*s+ii]);
 | 
					      assert(buf[i*s]==buf[i*s+ii]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
@@ -106,7 +110,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 | 
				
			|||||||
template<class vsimd,class scalar>
 | 
					template<class vsimd,class scalar>
 | 
				
			||||||
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type  &y,std::vector<scalar> &extracted){
 | 
					inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type  &y,std::vector<scalar> &extracted){
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
  int Nsimd=vsimd::Nsimd();
 | 
					  static const int Nsimd=vsimd::Nsimd();
 | 
				
			||||||
  int s=Nsimd/Nextr;
 | 
					  int s=Nsimd/Nextr;
 | 
				
			||||||
  scalar *buf = (scalar *)&y;
 | 
					  scalar *buf = (scalar *)&y;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -125,9 +129,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
 | 
				
			|||||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
  typedef typename vobj::vector_type vector_type ;
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
					  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
				
			||||||
 | 
					  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					 | 
				
			||||||
  int s=Nsimd/Nextr;
 | 
					  int s=Nsimd/Nextr;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<scalar_type *> pointers(Nextr);
 | 
					  std::vector<scalar_type *> pointers(Nextr);
 | 
				
			||||||
@@ -148,8 +152,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
 | 
				
			|||||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
  typedef typename vobj::vector_type vector_type ;
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
					  static const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
  int s = Nsimd/Nextr;
 | 
					  int s = Nsimd/Nextr;
 | 
				
			||||||
@@ -172,8 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
				
			|||||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
  typedef typename vobj::vector_type vector_type ;
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
					  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
				
			||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int Nextr = extracted.size();
 | 
					  int Nextr = extracted.size();
 | 
				
			||||||
  int splat=Nsimd/Nextr;
 | 
					  int splat=Nsimd/Nextr;
 | 
				
			||||||
@@ -197,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
				
			|||||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
  typedef typename vobj::vector_type vector_type ;
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
					  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
 | 
				
			||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
@@ -224,20 +228,17 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
				
			|||||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
  typedef typename vobj::vector_type vector_type ;
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
					  static const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  scalar_type *pointer;
 | 
					 | 
				
			||||||
  scalar_type *vp = (scalar_type *)&vec;
 | 
					  scalar_type *vp = (scalar_type *)&vec;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
					  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
  for(int i=0;i<Nsimd;i++){
 | 
					  for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
    pointer=(scalar_type *)&extracted[i][offset];
 | 
					      vp[w*Nsimd+i] = ((scalar_type *)&extracted[i][offset])[w];
 | 
				
			||||||
    for(int w=0;w<words;w++){
 | 
					  }}
 | 
				
			||||||
      vp[w*Nsimd+i] = pointer[w];
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class vobj> inline 
 | 
					template<class vobj> inline 
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -18,7 +18,7 @@ TESTS=`ls T*.cc`
 | 
				
			|||||||
TESTLIST=`echo ${TESTS} | sed s/.cc//g `
 | 
					TESTLIST=`echo ${TESTS} | sed s/.cc//g `
 | 
				
			||||||
 | 
					
 | 
				
			||||||
echo > Make.inc
 | 
					echo > Make.inc
 | 
				
			||||||
echo bin_PROGRAMS = ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
 | 
					echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
 | 
				
			||||||
echo >> Make.inc
 | 
					echo >> Make.inc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for f in $TESTS
 | 
					for f in $TESTS
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,5 +1,5 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
 | 
					bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_cayley_cg_SOURCES=Test_cayley_cg.cc
 | 
					Test_cayley_cg_SOURCES=Test_cayley_cg.cc
 | 
				
			||||||
@@ -50,6 +50,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 | 
				
			|||||||
Test_cshift_red_black_LDADD=-lGrid
 | 
					Test_cshift_red_black_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
 | 
				
			||||||
 | 
					Test_cshift_red_black_rotate_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
 | 
				
			||||||
 | 
					Test_cshift_rotate_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 | 
					Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 | 
				
			||||||
Test_dwf_cg_prec_LDADD=-lGrid
 | 
					Test_dwf_cg_prec_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -90,6 +98,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 | 
				
			|||||||
Test_dwf_lanczos_LDADD=-lGrid
 | 
					Test_dwf_lanczos_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
 | 
				
			||||||
 | 
					Test_dwf_rb5d_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_gamma_SOURCES=Test_gamma.cc
 | 
					Test_gamma_SOURCES=Test_gamma.cc
 | 
				
			||||||
Test_gamma_LDADD=-lGrid
 | 
					Test_gamma_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,8 +8,20 @@ endif
 | 
				
			|||||||
AM_CXXFLAGS = -I$(top_srcdir)/lib
 | 
					AM_CXXFLAGS = -I$(top_srcdir)/lib
 | 
				
			||||||
AM_LDFLAGS = -L$(top_builddir)/lib
 | 
					AM_LDFLAGS = -L$(top_builddir)/lib
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if USE_LAPACK
 | 
				
			||||||
 | 
					AM_CXXFLAGS += -DUSE_LAPACK
 | 
				
			||||||
 | 
					if USE_LAPACK_LIB
 | 
				
			||||||
 | 
					#if test "X${ac_LAPACK}X" != XyesX 
 | 
				
			||||||
 | 
					AM_CXXFLAGS += -I$(ac_LAPACK)/include
 | 
				
			||||||
 | 
					AM_LDFLAGS += -L$(ac_LAPACK)/lib
 | 
				
			||||||
 | 
					#fi
 | 
				
			||||||
 | 
					endif
 | 
				
			||||||
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if BUILD_ZMM
 | 
					if BUILD_ZMM
 | 
				
			||||||
  bin_PROGRAMS=Test_zmm
 | 
					  bin_PROGRAMS=Test_zmm
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  bin_PROGRAMS=
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
include Make.inc
 | 
					include Make.inc
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -96,13 +96,13 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  std::vector<int> peer(4);
 | 
						  std::vector<int> peer(4);
 | 
				
			||||||
	  Complex tmp  =cm;
 | 
						  Complex tmp  =cm;
 | 
				
			||||||
	  Integer index=real(tmp);
 | 
						  Integer index=real(tmp);
 | 
				
			||||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
						  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if (nrm > 0){
 | 
						  if (nrm > 0){
 | 
				
			||||||
	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
						    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    index=real(scm);
 | 
						    index=real(scm);
 | 
				
			||||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}}}}
 | 
						}}}}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -132,7 +132,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  std::vector<int> peer(4);
 | 
						  std::vector<int> peer(4);
 | 
				
			||||||
	  Complex ctmp = cm;
 | 
						  Complex ctmp = cm;
 | 
				
			||||||
	  Integer index=real(ctmp);
 | 
						  Integer index=real(ctmp);
 | 
				
			||||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
						  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if (nrm > 0){
 | 
						  if (nrm > 0){
 | 
				
			||||||
	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
						    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    index=real(scm);
 | 
						    index=real(scm);
 | 
				
			||||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    exit(-1);
 | 
						    exit(-1);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
@@ -180,7 +180,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  std::vector<int> peer(4);
 | 
						  std::vector<int> peer(4);
 | 
				
			||||||
	  Complex ctmp=cmeo;
 | 
						  Complex ctmp=cmeo;
 | 
				
			||||||
	  Integer index=real(ctmp);
 | 
						  Integer index=real(ctmp);
 | 
				
			||||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
						  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  double nrm = abs(cmeo()()()-scm);
 | 
						  double nrm = abs(cmeo()()()-scm);
 | 
				
			||||||
	  if (nrm != 0) {
 | 
						  if (nrm != 0) {
 | 
				
			||||||
@@ -189,7 +189,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
		     << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
							     << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    index=real(scm);
 | 
						    index=real(scm);
 | 
				
			||||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    exx=1;
 | 
						    exx=1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -205,7 +205,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    index=real(scm);
 | 
						    index=real(scm);
 | 
				
			||||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    exx=1;
 | 
						    exx=1;
 | 
				
			||||||
	  } else if (1) { 
 | 
						  } else if (1) { 
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										223
									
								
								tests/Test_cshift_red_black_rotate.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										223
									
								
								tests/Test_cshift_red_black_rotate.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,223 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./tests/Test_cshift_red_black_rotate.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> latt_size   = GridDefaultLatt();
 | 
				
			||||||
 | 
					  int Nd = latt_size.size();
 | 
				
			||||||
 | 
					  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
 | 
				
			||||||
 | 
					  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> mask(Nd,1);
 | 
				
			||||||
 | 
					  mask[0]=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian         Fine  (latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeComplex U(&Fine);
 | 
				
			||||||
 | 
					  LatticeComplex ShiftU(&Fine);
 | 
				
			||||||
 | 
					  LatticeComplex rbShiftU(&Fine);
 | 
				
			||||||
 | 
					  LatticeComplex Ue(&RBFine); 
 | 
				
			||||||
 | 
					  LatticeComplex Uo(&RBFine);
 | 
				
			||||||
 | 
					  LatticeComplex ShiftUe(&RBFine);
 | 
				
			||||||
 | 
					  LatticeComplex ShiftUo(&RBFine);
 | 
				
			||||||
 | 
					  LatticeComplex lex(&Fine);
 | 
				
			||||||
 | 
					  lex=zero;
 | 
				
			||||||
 | 
					  Integer stride =1;
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    double nrm;
 | 
				
			||||||
 | 
					    LatticeComplex coor(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int d=0;d<Nd;d++){
 | 
				
			||||||
 | 
					      //      Integer i=10000;
 | 
				
			||||||
 | 
					      Integer i=0;
 | 
				
			||||||
 | 
					      LatticeCoordinate(coor,d);
 | 
				
			||||||
 | 
					      lex = lex + coor*stride+i;
 | 
				
			||||||
 | 
					      stride=stride*latt_size[d];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    U=lex;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  pickCheckerboard(Even,Ue,U);
 | 
				
			||||||
 | 
					  pickCheckerboard(Odd,Uo,U);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  std::cout<<GridLogMessage << U<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  TComplex cm;
 | 
				
			||||||
 | 
					  TComplex cmeo;
 | 
				
			||||||
 | 
					  for(int dir=0;dir<Nd;dir++){
 | 
				
			||||||
 | 
					    //    if ( dir!=1 ) continue;
 | 
				
			||||||
 | 
					    for(int shift=0;shift<latt_size[dir];shift++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"Even grid"<<std::endl;
 | 
				
			||||||
 | 
						ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
 | 
				
			||||||
 | 
						ShiftUo = Cshift(Uo,dir,shift);    
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
 | 
				
			||||||
 | 
						setCheckerboard(rbShiftU,ShiftUe);
 | 
				
			||||||
 | 
						setCheckerboard(rbShiftU,ShiftUo);
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
 | 
				
			||||||
 | 
						ShiftU  = Cshift(U,dir,shift);    // Shift everything
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::vector<int> coor(4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
 | 
				
			||||||
 | 
						for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
				
			||||||
 | 
						for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
				
			||||||
 | 
						for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
 | 
				
			||||||
 | 
						for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  peekSite(cm,ShiftU,coor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  /////////	  double nrm=norm2(U);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::vector<int> scoor(coor);
 | 
				
			||||||
 | 
						  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  Integer slex = scoor[0]
 | 
				
			||||||
 | 
						    + latt_size[0]*scoor[1]
 | 
				
			||||||
 | 
						    + latt_size[0]*latt_size[1]*scoor[2]
 | 
				
			||||||
 | 
						    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  Complex scm(slex);
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  double nrm = abs(scm-cm()()());
 | 
				
			||||||
 | 
						  std::vector<int> peer(4);
 | 
				
			||||||
 | 
						  Complex ctmp = cm;
 | 
				
			||||||
 | 
						  Integer index=real(ctmp);
 | 
				
			||||||
 | 
						  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  if (nrm > 0){
 | 
				
			||||||
 | 
						    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    index=real(scm);
 | 
				
			||||||
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    exit(-1);
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int exx=0;
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
 | 
				
			||||||
 | 
						for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
				
			||||||
 | 
						for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
				
			||||||
 | 
						for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
 | 
				
			||||||
 | 
						for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  peekSite(cm,rbShiftU,coor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  Integer checkerboard = RBFine.CheckerBoard(coor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  //	  std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
 | 
				
			||||||
 | 
						  //	  std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
 | 
				
			||||||
 | 
						  //	  std::cout << "Uo "   << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
 | 
				
			||||||
 | 
						  if ( checkerboard == ShiftUo.checkerboard ) {
 | 
				
			||||||
 | 
						    peekSite(cmeo,ShiftUo,coor);
 | 
				
			||||||
 | 
						  } else { 
 | 
				
			||||||
 | 
						    peekSite(cmeo,ShiftUe,coor);
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::vector<int> scoor(coor);
 | 
				
			||||||
 | 
						  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  Integer slex = scoor[0]
 | 
				
			||||||
 | 
						    + latt_size[0]*scoor[1]
 | 
				
			||||||
 | 
						    + latt_size[0]*latt_size[1]*scoor[2]
 | 
				
			||||||
 | 
						    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  Complex scm(slex);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::vector<int> peer(4);
 | 
				
			||||||
 | 
						  Complex ctmp=cmeo;
 | 
				
			||||||
 | 
						  Integer index=real(ctmp);
 | 
				
			||||||
 | 
						  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  double nrm = abs(cmeo()()()-scm);
 | 
				
			||||||
 | 
						  if (nrm != 0) {
 | 
				
			||||||
 | 
						    std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
 | 
							     << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    index=real(scm);
 | 
				
			||||||
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    exx=1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  ctmp=cm;
 | 
				
			||||||
 | 
						  index=real(ctmp);
 | 
				
			||||||
 | 
						  nrm = abs(scm-cm()()());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  if (nrm > 0){
 | 
				
			||||||
 | 
						    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    index=real(scm);
 | 
				
			||||||
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    exx=1;
 | 
				
			||||||
 | 
						  } else if (1) { 
 | 
				
			||||||
 | 
						    std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}}}}
 | 
				
			||||||
 | 
						if (exx) exit(-1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										125
									
								
								tests/Test_cshift_rotate.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										125
									
								
								tests/Test_cshift_rotate.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,125 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./tests/Test_cshift_rotate.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> latt_size   = GridDefaultLatt();
 | 
				
			||||||
 | 
					  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
 | 
				
			||||||
 | 
					  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeComplex U(&Fine);
 | 
				
			||||||
 | 
					  LatticeComplex ShiftU(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeComplex lex(&Fine);
 | 
				
			||||||
 | 
					  lex=zero;
 | 
				
			||||||
 | 
					  Integer stride =1;
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    double nrm;
 | 
				
			||||||
 | 
					    LatticeComplex coor(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int d=0;d<4;d++){
 | 
				
			||||||
 | 
					      LatticeCoordinate(coor,d);
 | 
				
			||||||
 | 
					      lex = lex + coor*stride;
 | 
				
			||||||
 | 
					      stride=stride*latt_size[d];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    U=lex;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  TComplex cm;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  for(int dir=0;dir<4;dir++){
 | 
				
			||||||
 | 
					    for(int shift=0;shift<latt_size[dir];shift++){
 | 
				
			||||||
 | 
					      if ( Fine.IsBoss() ) 
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ShiftU  = Cshift(U,dir,shift);    // Shift everything
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						std::cout << "U[0]" << U[0]<<std::endl;
 | 
				
			||||||
 | 
						std::cout << "U[1]" << U[1]<<std::endl;
 | 
				
			||||||
 | 
						std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
 | 
				
			||||||
 | 
						std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
 | 
				
			||||||
 | 
						*/
 | 
				
			||||||
 | 
						std::vector<int> coor(4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
				
			||||||
 | 
						for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
				
			||||||
 | 
						for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
 | 
				
			||||||
 | 
						for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  peekSite(cm,ShiftU,coor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  double nrm=norm2(U);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  std::vector<int> scoor(coor);
 | 
				
			||||||
 | 
						  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  Integer slex = scoor[0]
 | 
				
			||||||
 | 
						    + latt_size[0]*scoor[1]
 | 
				
			||||||
 | 
						    + latt_size[0]*latt_size[1]*scoor[2]
 | 
				
			||||||
 | 
						    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  Complex scm(slex);
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  nrm = abs(scm-cm()()());
 | 
				
			||||||
 | 
						  std::vector<int> peer(4);
 | 
				
			||||||
 | 
						  Complex tmp  =cm;
 | 
				
			||||||
 | 
						  Integer index=real(tmp);
 | 
				
			||||||
 | 
						  Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  if (nrm > 0){
 | 
				
			||||||
 | 
						    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
 | 
						    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						    index=real(scm);
 | 
				
			||||||
 | 
						    Lexicographic::CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
						    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  /*
 | 
				
			||||||
 | 
						  else {
 | 
				
			||||||
 | 
						    std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
 | 
						    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  */
 | 
				
			||||||
 | 
						}}}}
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -42,6 +42,8 @@ public:
 | 
				
			|||||||
			  int, domaindecompose,
 | 
								  int, domaindecompose,
 | 
				
			||||||
			  int, domainsize,
 | 
								  int, domainsize,
 | 
				
			||||||
			  int, order,
 | 
								  int, order,
 | 
				
			||||||
 | 
								  int, Ls,
 | 
				
			||||||
 | 
								  double, mq,
 | 
				
			||||||
			  double, lo,
 | 
								  double, lo,
 | 
				
			||||||
			  double, hi,
 | 
								  double, hi,
 | 
				
			||||||
			  int, steps);
 | 
								  int, steps);
 | 
				
			||||||
@@ -263,11 +265,6 @@ public:
 | 
				
			|||||||
      resid = norm2(r) /norm2(src); 
 | 
					      resid = norm2(r) /norm2(src); 
 | 
				
			||||||
      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
 | 
					      std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
// Npoly*outer*2 1/2 vol matmuls.
 | 
					 | 
				
			||||||
// 71 iters => 20*71 = 1400 matmuls.
 | 
					 | 
				
			||||||
// 2*71 = 140 comms.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      // Even domain solve
 | 
					      // Even domain solve
 | 
				
			||||||
      r= where(subset==(Integer)0,r,zz);
 | 
					      r= where(subset==(Integer)0,r,zz);
 | 
				
			||||||
      _SmootherOperator.AdjOp(r,vec1);
 | 
					      _SmootherOperator.AdjOp(r,vec1);
 | 
				
			||||||
@@ -332,7 +329,7 @@ public:
 | 
				
			|||||||
    CoarseVector Ctmp(_CoarseOperator.Grid());
 | 
					    CoarseVector Ctmp(_CoarseOperator.Grid());
 | 
				
			||||||
    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
 | 
					    CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ConjugateGradient<CoarseVector>  CG(1.0e-3,100000);
 | 
					    ConjugateGradient<CoarseVector>  CG(3.0e-3,100000);
 | 
				
			||||||
    //    ConjugateGradient<FineField>    fCG(3.0e-2,1000);
 | 
					    //    ConjugateGradient<FineField>    fCG(3.0e-2,1000);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
 | 
					    HermitianLinearOperator<CoarseOperator,CoarseVector>  HermOp(_CoarseOperator);
 | 
				
			||||||
@@ -345,14 +342,14 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    //    Chebyshev<FineField> Cheby    (0.5,70.0,30,InverseApproximation);
 | 
					    //    Chebyshev<FineField> Cheby    (0.5,70.0,30,InverseApproximation);
 | 
				
			||||||
    //    Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
 | 
					    //    Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
 | 
				
			||||||
    Chebyshev<FineField> Cheby    (2.0,70.0,15,InverseApproximation);
 | 
					    Chebyshev<FineField> Cheby    (params.lo,params.hi,params.order,InverseApproximation);
 | 
				
			||||||
    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
 | 
					    Chebyshev<FineField> ChebyAccu(params.lo,params.hi,params.order,InverseApproximation);
 | 
				
			||||||
    //    Cheby.JacksonSmooth();
 | 
					    //    Cheby.JacksonSmooth();
 | 
				
			||||||
    //    ChebyAccu.JacksonSmooth();
 | 
					    //    ChebyAccu.JacksonSmooth();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    _Aggregates.ProjectToSubspace  (Csrc,in);
 | 
					    //    _Aggregates.ProjectToSubspace  (Csrc,in);
 | 
				
			||||||
    _Aggregates.PromoteFromSubspace(Csrc,out);
 | 
					    //    _Aggregates.PromoteFromSubspace(Csrc,out);
 | 
				
			||||||
    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
 | 
					    //    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    //    ofstream fout("smoother");
 | 
					    //    ofstream fout("smoother");
 | 
				
			||||||
    //    Cheby.csv(fout);
 | 
					    //    Cheby.csv(fout);
 | 
				
			||||||
@@ -479,7 +476,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  read(RD,"params",params);
 | 
					  read(RD,"params",params);
 | 
				
			||||||
  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
 | 
					  std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int Ls=8;
 | 
					  const int Ls=params.Ls;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
@@ -490,10 +487,12 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  ///////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////
 | 
				
			||||||
  // Construct a coarsened grid; utility for this?
 | 
					  // Construct a coarsened grid; utility for this?
 | 
				
			||||||
  ///////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////
 | 
				
			||||||
  const int block=2;
 | 
					  std::vector<int> block ({2,2,2,2});
 | 
				
			||||||
 | 
					  const int nbasis= 32;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<int> clatt = GridDefaultLatt();
 | 
					  std::vector<int> clatt = GridDefaultLatt();
 | 
				
			||||||
  for(int d=0;d<clatt.size();d++){
 | 
					  for(int d=0;d<clatt.size();d++){
 | 
				
			||||||
    clatt[d] = clatt[d]/block;
 | 
					    clatt[d] = clatt[d]/block[d];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
 | 
					  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
 | 
				
			||||||
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
 | 
					  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
 | 
				
			||||||
@@ -539,7 +538,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  //  SU3::HotConfiguration(RNG4,Umu);
 | 
					  //  SU3::HotConfiguration(RNG4,Umu);
 | 
				
			||||||
  //  Umu=zero;
 | 
					  //  Umu=zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  RealD mass=0.01;
 | 
					  RealD mass=params.mq;
 | 
				
			||||||
  RealD M5=1.8;
 | 
					  RealD M5=1.8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
@@ -548,9 +547,6 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
					  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
  DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
					  DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int nbasis = 32;
 | 
					 | 
				
			||||||
  //  const int nbasis = 4;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis>              Subspace;
 | 
					  typedef Aggregation<vSpinColourVector,vTComplex,nbasis>              Subspace;
 | 
				
			||||||
  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>          CoarseOperator;
 | 
					  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis>          CoarseOperator;
 | 
				
			||||||
  typedef CoarseOperator::CoarseVector                                 CoarseVector;
 | 
					  typedef CoarseOperator::CoarseVector                                 CoarseVector;
 | 
				
			||||||
@@ -564,7 +560,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  assert ( (nbasis & 0x1)==0);
 | 
					  assert ( (nbasis & 0x1)==0);
 | 
				
			||||||
  int nb=nbasis/2;
 | 
					  int nb=nbasis/2;
 | 
				
			||||||
  std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
 | 
					  std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
 | 
				
			||||||
  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
 | 
					  //  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
 | 
				
			||||||
 | 
					  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
 | 
				
			||||||
  for(int n=0;n<nb;n++){
 | 
					  for(int n=0;n<nb;n++){
 | 
				
			||||||
    G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
 | 
					    G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
 | 
				
			||||||
    std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
 | 
					    std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
 | 
				
			||||||
@@ -600,7 +597,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
 | 
					  MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
 | 
				
			||||||
  ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
 | 
					  ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
 | 
				
			||||||
  CG(PosdefLdop,c_src,c_res);
 | 
					  //  CG(PosdefLdop,c_src,c_res);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  //  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  //  std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
 | 
					  //  std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
 | 
				
			||||||
@@ -625,17 +622,17 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  Precon.SmootherTest(src);
 | 
					  //  Precon.SmootherTest(src);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  PreconDD.SmootherTest(src);
 | 
					  //  PreconDD.SmootherTest(src);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  PreconDD.SAP(src,result);
 | 
					  //  PreconDD.SAP(src,result);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
 | 
				
			||||||
@@ -663,18 +660,18 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
 | 
					  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
 | 
				
			||||||
  result=zero;
 | 
					  //  result=zero;
 | 
				
			||||||
  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
					  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
				
			||||||
  PGCRDD(HermIndefOp,src,result);
 | 
					  //  PGCRDD(HermIndefOp,src,result);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  //  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
 | 
					  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
 | 
				
			||||||
  //  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
					  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
 | 
				
			||||||
  //  result=zero;
 | 
					  result=zero;
 | 
				
			||||||
  //  PGCR(HermIndefOp,src,result);
 | 
					  PGCR(HermIndefOp,src,result);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
					  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
 | 
					  std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
 | 
				
			||||||
 
 | 
				
			|||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user