mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -49,6 +49,7 @@ config.status
 | 
				
			|||||||
.deps
 | 
					.deps
 | 
				
			||||||
Make.inc
 | 
					Make.inc
 | 
				
			||||||
eigen.inc
 | 
					eigen.inc
 | 
				
			||||||
 | 
					Eigen.inc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# http://www.gnu.org/software/autoconf #
 | 
					# http://www.gnu.org/software/autoconf #
 | 
				
			||||||
########################################
 | 
					########################################
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -138,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif
 | 
				
			|||||||
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | 
					| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | 
				
			||||||
| `shmem `       | Cray SHMEM communications                                     |
 | 
					| `shmem `       | Cray SHMEM communications                                     |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names).
 | 
					For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Possible SIMD types
 | 
					### Possible SIMD types
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										237
									
								
								benchmarks/Benchmark_mooee.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										237
									
								
								benchmarks/Benchmark_mooee.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,237 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./benchmarks/Benchmark_mooee.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid/Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace std;
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int threads = GridThread::GetThreads();
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> latt4 = GridDefaultLatt();
 | 
				
			||||||
 | 
					  const int Ls=8;
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
 | 
				
			||||||
 | 
					  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "Seeded"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5  =1.8;
 | 
				
			||||||
 | 
					  RealD NP = UGrid->_Nprocessors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (1)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    const int ncall=100;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GridParallelRNG RNG5(FGrid);
 | 
				
			||||||
 | 
					    LatticeFermion src(FGrid); random(RNG5,src);
 | 
				
			||||||
 | 
					    LatticeFermion result(FGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double t0,t1;
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    LatticeFermion r_eo(FGrid);
 | 
				
			||||||
 | 
					    LatticeFermion src_e (FrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion src_o (FrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion r_e   (FrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion r_o   (FrbGrid);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    pickCheckerboard(Even,src_e,src);
 | 
				
			||||||
 | 
					    pickCheckerboard(Odd,src_o,src);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    setCheckerboard(r_eo,src_o);
 | 
				
			||||||
 | 
					    setCheckerboard(r_eo,src_e);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    r_e = zero;
 | 
				
			||||||
 | 
					    r_o = zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.DhopEO(src_o, r_e, DaggerNo);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.Mooee(src_o, r_o);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.MooeeInv(src_o, r_o);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.Meooe(src_o, r_e);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (1)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    const int ncall=100;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GridParallelRNG RNG5(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermion src(sFGrid); random(RNG5,src);
 | 
				
			||||||
 | 
					    LatticeFermion sref(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermion result(sFGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
 | 
				
			||||||
 | 
					    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double t0,t1;
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    LatticeFermion r_eo(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermion src_e (sFrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion src_o (sFrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion r_e   (sFrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion r_o   (sFrbGrid);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    pickCheckerboard(Even,src_e,src);
 | 
				
			||||||
 | 
					    pickCheckerboard(Odd,src_o,src);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    setCheckerboard(r_eo,src_o);
 | 
				
			||||||
 | 
					    setCheckerboard(r_eo,src_e);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    r_e = zero;
 | 
				
			||||||
 | 
					    r_o = zero;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.DhopEO(src_o, r_e, DaggerNo);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.Mooee(src_o, r_o);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.MooeeInv(src_o, r_o);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for (int i = 0; i < ncall; i++) {
 | 
				
			||||||
 | 
					      Dw.Meooe(src_o, r_e);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    FGrid->Barrier();
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in
 | 
				
			|||||||
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
 | 
					        AC_DEFINE([AVX1],[1],[AVX intrinsics])
 | 
				
			||||||
        SIMD_FLAGS='-mavx -xavx';;
 | 
					        SIMD_FLAGS='-mavx -xavx';;
 | 
				
			||||||
      AVXFMA)
 | 
					      AVXFMA)
 | 
				
			||||||
        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
 | 
					        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
 | 
				
			||||||
        SIMD_FLAGS='-mavx -mfma';;
 | 
					        SIMD_FLAGS='-mavx -fma';;
 | 
				
			||||||
      AVX2)
 | 
					      AVX2)
 | 
				
			||||||
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
 | 
					        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
 | 
				
			||||||
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
 | 
					        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
 | 
				
			||||||
@@ -290,7 +290,7 @@ esac
 | 
				
			|||||||
case ${ac_COMMS} in
 | 
					case ${ac_COMMS} in
 | 
				
			||||||
    *-auto)
 | 
					    *-auto)
 | 
				
			||||||
        LX_FIND_MPI
 | 
					        LX_FIND_MPI
 | 
				
			||||||
        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
 | 
					        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
 | 
				
			||||||
        AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
 | 
					        AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
 | 
				
			||||||
        AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
 | 
					        AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
 | 
				
			||||||
        AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
 | 
					        AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -244,8 +244,11 @@ namespace Grid {
 | 
				
			|||||||
            pokeLocalSite(s,pgbuf,cbuf);
 | 
					            pokeLocalSite(s,pgbuf,cbuf);
 | 
				
			||||||
          }
 | 
					          }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					        if (p != processors[dim] - 1)
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
          result = Cshift(result,dim,L);
 | 
					          result = Cshift(result,dim,L);
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      // Loop over orthog coords
 | 
					      // Loop over orthog coords
 | 
				
			||||||
      int NN=pencil_g.lSites();
 | 
					      int NN=pencil_g.lSites();
 | 
				
			||||||
@@ -287,10 +290,10 @@ namespace Grid {
 | 
				
			|||||||
          cgbuf = clbuf;
 | 
					          cgbuf = clbuf;
 | 
				
			||||||
          cgbuf[dim] = clbuf[dim]+L*pc;
 | 
					          cgbuf[dim] = clbuf[dim]+L*pc;
 | 
				
			||||||
          peekLocalSite(s,pgbuf,cgbuf);
 | 
					          peekLocalSite(s,pgbuf,cgbuf);
 | 
				
			||||||
          s = s * div;
 | 
					 | 
				
			||||||
          pokeLocalSite(s,result,clbuf);
 | 
					          pokeLocalSite(s,result,clbuf);
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					      result = result*div;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      // destroying plan
 | 
					      // destroying plan
 | 
				
			||||||
      FFTW<scalar>::fftw_destroy_plan(p);
 | 
					      FFTW<scalar>::fftw_destroy_plan(p);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -203,6 +203,7 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 | 
				
			|||||||
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 | 
					typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 | 
				
			||||||
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 | 
					typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 | 
				
			||||||
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 | 
					typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 | 
					typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 | 
				
			||||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 | 
					typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 | 
				
			||||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 | 
					typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 | 
				
			||||||
@@ -211,6 +212,20 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 | 
				
			|||||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 | 
					typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 | 
				
			||||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 | 
					typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Ls vectorised 
 | 
				
			||||||
 | 
					typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
 | 
				
			||||||
 | 
					typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
 | 
				
			||||||
 | 
					typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
 | 
				
			||||||
 | 
					typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
 | 
				
			||||||
 | 
					typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
 | 
				
			||||||
 | 
					typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
 | 
				
			||||||
 | 
					typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 | 
					typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 | 
				
			||||||
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 | 
					typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 | 
				
			||||||
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
 | 
					typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
 | 
				
			||||||
@@ -269,6 +284,7 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 | 
				
			|||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 | 
					typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 | 
				
			||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 | 
					typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  }}
 | 
					  }}
 | 
				
			||||||
///////////////////////////////////////////////////////////////////////////////
 | 
					///////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 | 
					// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -194,6 +194,11 @@ void WilsonFermion5D<Impl>::Report(void)
 | 
				
			|||||||
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
				
			||||||
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
   }
 | 
					   }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( DerivCalls > 0 ) {
 | 
					  if ( DerivCalls > 0 ) {
 | 
				
			||||||
@@ -209,12 +214,15 @@ void WilsonFermion5D<Impl>::Report(void)
 | 
				
			|||||||
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
 | 
					    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
 | 
				
			||||||
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
				
			||||||
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
 | 
				
			||||||
  }
 | 
					
 | 
				
			||||||
 | 
					    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (DerivCalls > 0 || DhopCalls > 0){
 | 
					  if (DerivCalls > 0 || DhopCalls > 0){
 | 
				
			||||||
    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
 | 
					    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
 | 
				
			||||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
 | 
					    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
 | 
				
			||||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
 | 
					    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -167,7 +167,7 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline __m256i operator()(__m256i a, __m256i b){
 | 
					    inline __m256i operator()(__m256i a, __m256i b){
 | 
				
			||||||
#if defined (AVX1) || defined (AVXFMA4)
 | 
					#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
 | 
				
			||||||
          __m128i a0,a1;
 | 
					          __m128i a0,a1;
 | 
				
			||||||
          __m128i b0,b1;
 | 
					          __m128i b0,b1;
 | 
				
			||||||
          a0 = _mm256_extractf128_si256(a,0);
 | 
					          a0 = _mm256_extractf128_si256(a,0);
 | 
				
			||||||
@@ -195,7 +195,7 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline __m256i operator()(__m256i a, __m256i b){
 | 
					    inline __m256i operator()(__m256i a, __m256i b){
 | 
				
			||||||
#if defined (AVX1) || defined (AVXFMA4)
 | 
					#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
 | 
				
			||||||
          __m128i a0,a1;
 | 
					          __m128i a0,a1;
 | 
				
			||||||
          __m128i b0,b1;
 | 
					          __m128i b0,b1;
 | 
				
			||||||
          a0 = _mm256_extractf128_si256(a,0);
 | 
					          a0 = _mm256_extractf128_si256(a,0);
 | 
				
			||||||
@@ -233,7 +233,7 @@ namespace Optimization {
 | 
				
			|||||||
      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
					      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
					      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined (AVX2)
 | 
					#if defined (AVX2)  || defined (AVXFMA)
 | 
				
			||||||
      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
 | 
					      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
 | 
				
			||||||
      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
 | 
					      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
 | 
				
			||||||
      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
					      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
@@ -279,7 +279,7 @@ namespace Optimization {
 | 
				
			|||||||
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
					      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
					      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined (AVX2)
 | 
					#if defined (AVX2) || defined (AVXFMA)
 | 
				
			||||||
      __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
 | 
					      __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
 | 
				
			||||||
      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
 | 
					      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
 | 
				
			||||||
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
					      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
@@ -320,7 +320,7 @@ namespace Optimization {
 | 
				
			|||||||
#if defined (AVXFMA4)
 | 
					#if defined (AVXFMA4)
 | 
				
			||||||
      a= _mm256_macc_ps(b,c,a);
 | 
					      a= _mm256_macc_ps(b,c,a);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined (AVX2)
 | 
					#if defined (AVX2) || defined (AVXFMA)
 | 
				
			||||||
      a= _mm256_fmadd_ps( b, c, a);
 | 
					      a= _mm256_fmadd_ps( b, c, a);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -332,7 +332,7 @@ namespace Optimization {
 | 
				
			|||||||
#if defined (AVXFMA4)
 | 
					#if defined (AVXFMA4)
 | 
				
			||||||
      a= _mm256_macc_pd(b,c,a);
 | 
					      a= _mm256_macc_pd(b,c,a);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined (AVX2)
 | 
					#if defined (AVX2) || defined (AVXFMA)
 | 
				
			||||||
      a= _mm256_fmadd_pd( b, c, a);
 | 
					      a= _mm256_fmadd_pd( b, c, a);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -347,7 +347,7 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    // Integer
 | 
					    // Integer
 | 
				
			||||||
    inline __m256i operator()(__m256i a, __m256i b){
 | 
					    inline __m256i operator()(__m256i a, __m256i b){
 | 
				
			||||||
#if defined (AVX1) 
 | 
					#if defined (AVX1) || defined (AVXFMA)
 | 
				
			||||||
      __m128i a0,a1;
 | 
					      __m128i a0,a1;
 | 
				
			||||||
      __m128i b0,b1;
 | 
					      __m128i b0,b1;
 | 
				
			||||||
      a0 = _mm256_extractf128_si256(a,0);
 | 
					      a0 = _mm256_extractf128_si256(a,0);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -245,6 +245,21 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Div{
 | 
				
			||||||
 | 
					    // Real double
 | 
				
			||||||
 | 
					    inline vector4double operator()(vector4double a, vector4double b){
 | 
				
			||||||
 | 
					      return vec_swdiv(a, b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Real float
 | 
				
			||||||
 | 
					    FLOAT_WRAP_2(operator(), inline)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Integer
 | 
				
			||||||
 | 
					    inline int operator()(int a, int b){
 | 
				
			||||||
 | 
					      return a/b;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Conj{
 | 
					  struct Conj{
 | 
				
			||||||
    // Complex double
 | 
					    // Complex double
 | 
				
			||||||
    inline vector4double operator()(vector4double v){
 | 
					    inline vector4double operator()(vector4double v){
 | 
				
			||||||
@@ -413,6 +428,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 | 
				
			|||||||
typedef Optimization::Sum         SumSIMD;
 | 
					typedef Optimization::Sum         SumSIMD;
 | 
				
			||||||
typedef Optimization::Sub         SubSIMD;
 | 
					typedef Optimization::Sub         SubSIMD;
 | 
				
			||||||
typedef Optimization::Mult        MultSIMD;
 | 
					typedef Optimization::Mult        MultSIMD;
 | 
				
			||||||
 | 
					typedef Optimization::Div         DivSIMD;
 | 
				
			||||||
typedef Optimization::MultComplex MultComplexSIMD;
 | 
					typedef Optimization::MultComplex MultComplexSIMD;
 | 
				
			||||||
typedef Optimization::Conj        ConjSIMD;
 | 
					typedef Optimization::Conj        ConjSIMD;
 | 
				
			||||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
					typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -44,7 +44,7 @@ directory
 | 
				
			|||||||
#ifdef SSE4
 | 
					#ifdef SSE4
 | 
				
			||||||
#include "Grid_sse4.h"
 | 
					#include "Grid_sse4.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
 | 
					#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
 | 
				
			||||||
#include "Grid_avx.h"
 | 
					#include "Grid_avx.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined AVX512
 | 
					#if defined AVX512
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -50,6 +50,12 @@ public:
 | 
				
			|||||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
 | 
					  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
 | 
				
			||||||
  std::string name(void) const { return std::string("Times"); }
 | 
					  std::string name(void) const { return std::string("Times"); }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					class funcDivide {
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					  funcDivide() {};
 | 
				
			||||||
 | 
					  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
 | 
				
			||||||
 | 
					  std::string name(void) const { return std::string("Divide"); }
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
class funcConj {
 | 
					class funcConj {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  funcConj() {};
 | 
					  funcConj() {};
 | 
				
			||||||
@@ -341,6 +347,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  Tester<RealF,vRealF>(funcPlus());
 | 
					  Tester<RealF,vRealF>(funcPlus());
 | 
				
			||||||
  Tester<RealF,vRealF>(funcMinus());
 | 
					  Tester<RealF,vRealF>(funcMinus());
 | 
				
			||||||
  Tester<RealF,vRealF>(funcTimes());
 | 
					  Tester<RealF,vRealF>(funcTimes());
 | 
				
			||||||
 | 
					  Tester<RealF,vRealF>(funcDivide());
 | 
				
			||||||
  Tester<RealF,vRealF>(funcAdj());
 | 
					  Tester<RealF,vRealF>(funcAdj());
 | 
				
			||||||
  Tester<RealF,vRealF>(funcConj());
 | 
					  Tester<RealF,vRealF>(funcConj());
 | 
				
			||||||
  Tester<RealF,vRealF>(funcInnerProduct());
 | 
					  Tester<RealF,vRealF>(funcInnerProduct());
 | 
				
			||||||
@@ -371,6 +378,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  Tester<RealD,vRealD>(funcPlus());
 | 
					  Tester<RealD,vRealD>(funcPlus());
 | 
				
			||||||
  Tester<RealD,vRealD>(funcMinus());
 | 
					  Tester<RealD,vRealD>(funcMinus());
 | 
				
			||||||
  Tester<RealD,vRealD>(funcTimes());
 | 
					  Tester<RealD,vRealD>(funcTimes());
 | 
				
			||||||
 | 
					  Tester<RealD,vRealD>(funcDivide());
 | 
				
			||||||
  Tester<RealD,vRealD>(funcAdj());
 | 
					  Tester<RealD,vRealD>(funcAdj());
 | 
				
			||||||
  Tester<RealD,vRealD>(funcConj());
 | 
					  Tester<RealD,vRealD>(funcConj());
 | 
				
			||||||
  Tester<RealD,vRealD>(funcInnerProduct());
 | 
					  Tester<RealD,vRealD>(funcInnerProduct());
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  for(int mu=0;mu<4;mu++){
 | 
					  for(int mu=0;mu<4;mu++){
 | 
				
			||||||
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 | 
					    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 | 
				
			||||||
    LatticeCoordinate(coor,mu);
 | 
					    LatticeCoordinate(coor,mu);
 | 
				
			||||||
    C = C - (TwoPiL * p[mu]) * coor;
 | 
					    C = C + (TwoPiL * p[mu]) * coor;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  C = exp(C*ci);
 | 
					  C = exp(C*ci);
 | 
				
			||||||
@@ -78,10 +78,11 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  FFT theFFT(&Fine);
 | 
					  FFT theFFT(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
 | 
					  Ctilde = C;
 | 
				
			||||||
  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
 | 
					  theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 | 
				
			||||||
  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
 | 
					  theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 | 
				
			||||||
  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
 | 
					  theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 | 
				
			||||||
 | 
					  theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //  C=zero;
 | 
					  //  C=zero;
 | 
				
			||||||
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
 | 
					  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
 | 
				
			||||||
@@ -93,10 +94,11 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  C=C-Ctilde;
 | 
					  C=C-Ctilde;
 | 
				
			||||||
  std::cout << "diff scalar "<<norm2(C) << std::endl;
 | 
					  std::cout << "diff scalar "<<norm2(C) << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
 | 
					  Stilde = S;
 | 
				
			||||||
  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
 | 
					  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
 | 
				
			||||||
  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
 | 
					  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
 | 
				
			||||||
  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 | 
					  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
 | 
				
			||||||
 | 
					  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  SpinMatrixF Sp; 
 | 
					  SpinMatrixF Sp; 
 | 
				
			||||||
  Sp = zero; Sp = Sp+cVol;
 | 
					  Sp = zero; Sp = Sp+cVol;
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user