mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	
							
								
								
									
										2
									
								
								README
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								README
									
									
									
									
									
								
							@@ -33,6 +33,8 @@ MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is
 | 
				
			|||||||
by setting variables in the command line or in the environment.  Here
 | 
					by setting variables in the command line or in the environment.  Here
 | 
				
			||||||
is are examples:
 | 
					is are examples:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
 | 
					     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
 | 
					     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -37,6 +37,8 @@ MPI, OpenMP, and SIMD parallelism are present in the library.
 | 
				
			|||||||
by setting variables in the command line or in the environment.  Here
 | 
					by setting variables in the command line or in the environment.  Here
 | 
				
			||||||
are examples:
 | 
					are examples:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
 | 
					     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
 | 
					     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										104
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										104
									
								
								configure
									
									
									
									
										vendored
									
									
								
							@@ -4503,6 +4503,11 @@ _ACEOF
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Checks for library functions.
 | 
					# Checks for library functions.
 | 
				
			||||||
 | 
					echo
 | 
				
			||||||
 | 
					echo Checking libraries
 | 
				
			||||||
 | 
					echo :::::::::::::::::::::::::::::::::::::::::::
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for ac_func in gettimeofday
 | 
					for ac_func in gettimeofday
 | 
				
			||||||
do :
 | 
					do :
 | 
				
			||||||
  ac_fn_cxx_check_func "$LINENO" "gettimeofday" "ac_cv_func_gettimeofday"
 | 
					  ac_fn_cxx_check_func "$LINENO" "gettimeofday" "ac_cv_func_gettimeofday"
 | 
				
			||||||
@@ -4515,6 +4520,105 @@ fi
 | 
				
			|||||||
done
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __gmpf_init in -lgmp" >&5
 | 
				
			||||||
 | 
					$as_echo_n "checking for __gmpf_init in -lgmp... " >&6; }
 | 
				
			||||||
 | 
					if ${ac_cv_lib_gmp___gmpf_init+:} false; then :
 | 
				
			||||||
 | 
					  $as_echo_n "(cached) " >&6
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  ac_check_lib_save_LIBS=$LIBS
 | 
				
			||||||
 | 
					LIBS="-lgmp  $LIBS"
 | 
				
			||||||
 | 
					cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 | 
				
			||||||
 | 
					/* end confdefs.h.  */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Override any GCC internal prototype to avoid an error.
 | 
				
			||||||
 | 
					   Use char because int might match the return type of a GCC
 | 
				
			||||||
 | 
					   builtin and then its argument prototype would still apply.  */
 | 
				
			||||||
 | 
					#ifdef __cplusplus
 | 
				
			||||||
 | 
					extern "C"
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					char __gmpf_init ();
 | 
				
			||||||
 | 
					int
 | 
				
			||||||
 | 
					main ()
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					return __gmpf_init ();
 | 
				
			||||||
 | 
					  ;
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					_ACEOF
 | 
				
			||||||
 | 
					if ac_fn_cxx_try_link "$LINENO"; then :
 | 
				
			||||||
 | 
					  ac_cv_lib_gmp___gmpf_init=yes
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  ac_cv_lib_gmp___gmpf_init=no
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					rm -f core conftest.err conftest.$ac_objext \
 | 
				
			||||||
 | 
					    conftest$ac_exeext conftest.$ac_ext
 | 
				
			||||||
 | 
					LIBS=$ac_check_lib_save_LIBS
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_gmp___gmpf_init" >&5
 | 
				
			||||||
 | 
					$as_echo "$ac_cv_lib_gmp___gmpf_init" >&6; }
 | 
				
			||||||
 | 
					if test "x$ac_cv_lib_gmp___gmpf_init" = xyes; then :
 | 
				
			||||||
 | 
					  cat >>confdefs.h <<_ACEOF
 | 
				
			||||||
 | 
					#define HAVE_LIBGMP 1
 | 
				
			||||||
 | 
					_ACEOF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LIBS="-lgmp $LIBS"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  as_fn_error $? "GNU Multiple Precision GMP library was not found in your system.
 | 
				
			||||||
 | 
					Please install or provide the correct path to your installation
 | 
				
			||||||
 | 
					Info at: http://www.gmplib.org" "$LINENO" 5
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpfr_init in -lmpfr" >&5
 | 
				
			||||||
 | 
					$as_echo_n "checking for mpfr_init in -lmpfr... " >&6; }
 | 
				
			||||||
 | 
					if ${ac_cv_lib_mpfr_mpfr_init+:} false; then :
 | 
				
			||||||
 | 
					  $as_echo_n "(cached) " >&6
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  ac_check_lib_save_LIBS=$LIBS
 | 
				
			||||||
 | 
					LIBS="-lmpfr  $LIBS"
 | 
				
			||||||
 | 
					cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 | 
				
			||||||
 | 
					/* end confdefs.h.  */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Override any GCC internal prototype to avoid an error.
 | 
				
			||||||
 | 
					   Use char because int might match the return type of a GCC
 | 
				
			||||||
 | 
					   builtin and then its argument prototype would still apply.  */
 | 
				
			||||||
 | 
					#ifdef __cplusplus
 | 
				
			||||||
 | 
					extern "C"
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					char mpfr_init ();
 | 
				
			||||||
 | 
					int
 | 
				
			||||||
 | 
					main ()
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					return mpfr_init ();
 | 
				
			||||||
 | 
					  ;
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					_ACEOF
 | 
				
			||||||
 | 
					if ac_fn_cxx_try_link "$LINENO"; then :
 | 
				
			||||||
 | 
					  ac_cv_lib_mpfr_mpfr_init=yes
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  ac_cv_lib_mpfr_mpfr_init=no
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					rm -f core conftest.err conftest.$ac_objext \
 | 
				
			||||||
 | 
					    conftest$ac_exeext conftest.$ac_ext
 | 
				
			||||||
 | 
					LIBS=$ac_check_lib_save_LIBS
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpfr_mpfr_init" >&5
 | 
				
			||||||
 | 
					$as_echo "$ac_cv_lib_mpfr_mpfr_init" >&6; }
 | 
				
			||||||
 | 
					if test "x$ac_cv_lib_mpfr_mpfr_init" = xyes; then :
 | 
				
			||||||
 | 
					  cat >>confdefs.h <<_ACEOF
 | 
				
			||||||
 | 
					#define HAVE_LIBMPFR 1
 | 
				
			||||||
 | 
					_ACEOF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LIBS="-lmpfr $LIBS"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					  as_fn_error $? "GNU Multiple Precision MPFR library was not found in your system.
 | 
				
			||||||
 | 
					Please install or provide the correct path to your installation
 | 
				
			||||||
 | 
					Info at: http://www.mpfr.org/" "$LINENO" 5
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										16
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								configure.ac
									
									
									
									
									
								
							@@ -3,7 +3,7 @@
 | 
				
			|||||||
#
 | 
					#
 | 
				
			||||||
# Project Grid package  
 | 
					# Project Grid package  
 | 
				
			||||||
# 
 | 
					# 
 | 
				
			||||||
# Time-stamp: <2015-05-18 17:14:20 neo>
 | 
					# Time-stamp: <2015-05-19 13:51:08 neo>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
AC_PREREQ([2.69])
 | 
					AC_PREREQ([2.69])
 | 
				
			||||||
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 | 
					AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 | 
				
			||||||
@@ -46,8 +46,22 @@ AC_TYPE_UINT32_T
 | 
				
			|||||||
AC_TYPE_UINT64_T
 | 
					AC_TYPE_UINT64_T
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Checks for library functions.
 | 
					# Checks for library functions.
 | 
				
			||||||
 | 
					echo
 | 
				
			||||||
 | 
					echo Checking libraries 
 | 
				
			||||||
 | 
					echo :::::::::::::::::::::::::::::::::::::::::::
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
AC_CHECK_FUNCS([gettimeofday])
 | 
					AC_CHECK_FUNCS([gettimeofday])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					AC_CHECK_LIB([gmp],[__gmpf_init],,
 | 
				
			||||||
 | 
					        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
 | 
				
			||||||
 | 
					Please install or provide the correct path to your installation
 | 
				
			||||||
 | 
					Info at: http://www.gmplib.org)])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					AC_CHECK_LIB([mpfr],[mpfr_init],,
 | 
				
			||||||
 | 
					        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
 | 
				
			||||||
 | 
					Please install or provide the correct path to your installation
 | 
				
			||||||
 | 
					Info at: http://www.mpfr.org/)])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -36,6 +36,12 @@
 | 
				
			|||||||
/* Define to 1 if you have the <inttypes.h> header file. */
 | 
					/* Define to 1 if you have the <inttypes.h> header file. */
 | 
				
			||||||
#define HAVE_INTTYPES_H 1
 | 
					#define HAVE_INTTYPES_H 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Define to 1 if you have the `gmp' library (-lgmp). */
 | 
				
			||||||
 | 
					#define HAVE_LIBGMP 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Define to 1 if you have the `mpfr' library (-lmpfr). */
 | 
				
			||||||
 | 
					#define HAVE_LIBMPFR 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Define to 1 if you have the <malloc.h> header file. */
 | 
					/* Define to 1 if you have the <malloc.h> header file. */
 | 
				
			||||||
#define HAVE_MALLOC_H 1
 | 
					#define HAVE_MALLOC_H 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,6 +35,12 @@
 | 
				
			|||||||
/* Define to 1 if you have the <inttypes.h> header file. */
 | 
					/* Define to 1 if you have the <inttypes.h> header file. */
 | 
				
			||||||
#undef HAVE_INTTYPES_H
 | 
					#undef HAVE_INTTYPES_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Define to 1 if you have the `gmp' library (-lgmp). */
 | 
				
			||||||
 | 
					#undef HAVE_LIBGMP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Define to 1 if you have the `mpfr' library (-lmpfr). */
 | 
				
			||||||
 | 
					#undef HAVE_LIBMPFR
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Define to 1 if you have the <malloc.h> header file. */
 | 
					/* Define to 1 if you have the <malloc.h> header file. */
 | 
				
			||||||
#undef HAVE_MALLOC_H
 | 
					#undef HAVE_MALLOC_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -95,5 +95,8 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h\
 | 
				
			|||||||
	simd/Grid_vComplexF.h					\
 | 
						simd/Grid_vComplexF.h					\
 | 
				
			||||||
	simd/Grid_vInteger.h					\
 | 
						simd/Grid_vInteger.h					\
 | 
				
			||||||
	simd/Grid_vRealD.h					\
 | 
						simd/Grid_vRealD.h					\
 | 
				
			||||||
	simd/Grid_vRealF.h
 | 
						simd/Grid_vRealF.h					\
 | 
				
			||||||
 | 
						simd/Grid_vector_types.h				\
 | 
				
			||||||
 | 
						simd/Grid_sse4.h					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										0
									
								
								lib/algorithms/approx/Remez.cc
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/algorithms/approx/Remez.cc
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/algorithms/approx/Remez.h
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/algorithms/approx/Remez.h
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/algorithms/approx/Zolotarev.cc
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/algorithms/approx/Zolotarev.cc
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/algorithms/approx/Zolotarev.h
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/algorithms/approx/Zolotarev.h
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/algorithms/approx/bigfloat.h
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/algorithms/approx/bigfloat.h
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							@@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 | 
				
			|||||||
  return ;
 | 
					  return ;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
					void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  assert((dag==0) ||(dag==1));
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  WilsonCompressor compressor(dag);
 | 
					 | 
				
			||||||
  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					 | 
				
			||||||
  for(int sss=0;sss<grid->oSites();sss++){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    vHalfSpinColourVector  tmp;    
 | 
					    vHalfSpinColourVector  tmp;    
 | 
				
			||||||
    vHalfSpinColourVector  chi;    
 | 
					    vHalfSpinColourVector  chi;    
 | 
				
			||||||
    vSpinColourVector result;
 | 
					    vSpinColourVector result;
 | 
				
			||||||
@@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    int offset,local,perm, ptype;
 | 
					    int offset,local,perm, ptype;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    //    int ss = Stencil._LebesgueReorder[sss];
 | 
					    //    int ss = Stencil._LebesgueReorder[sss];
 | 
				
			||||||
    int ss = sss;
 | 
					 | 
				
			||||||
    int ssu= ss;
 | 
					    int ssu= ss;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Xp
 | 
					    // Xp
 | 
				
			||||||
    int idx = (Xp+dag*4)%8;
 | 
					    offset = Stencil._offsets [Xp][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Xp][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Xp][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					    ptype  = Stencil._permute_type[Xp];
 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjXp(tmp,in._odata[offset]);
 | 
					      spProjXp(tmp,in._odata[offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
@@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
 | 
				
			||||||
    spReconXp(result,Uchi);
 | 
					    spReconXp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Yp
 | 
					    // Yp
 | 
				
			||||||
    idx = (Yp+dag*4)%8;
 | 
					    offset = Stencil._offsets [Yp][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Yp][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Yp][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Yp];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjYp(tmp,in._odata[offset]);
 | 
					      spProjYp(tmp,in._odata[offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
@@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
 | 
				
			||||||
    accumReconYp(result,Uchi);
 | 
					    accumReconYp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Zp
 | 
					    // Zp
 | 
				
			||||||
    idx = (Zp+dag*4)%8;
 | 
					    offset = Stencil._offsets [Zp][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Zp][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Zp][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Zp];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjZp(tmp,in._odata[offset]);
 | 
					      spProjZp(tmp,in._odata[offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
@@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
 | 
				
			||||||
    accumReconZp(result,Uchi);
 | 
					    accumReconZp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Tp
 | 
					    // Tp
 | 
				
			||||||
    idx = (Tp+dag*4)%8;
 | 
					    offset = Stencil._offsets [Tp][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Tp][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Tp][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Tp];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjTp(tmp,in._odata[offset]);
 | 
					      spProjTp(tmp,in._odata[offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
@@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
 | 
				
			||||||
    accumReconTp(result,Uchi);
 | 
					    accumReconTp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Xm
 | 
					    // Xm
 | 
				
			||||||
    idx = (Xm+dag*4)%8;
 | 
					    offset = Stencil._offsets [Xm][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Xm][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Xm][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Xm];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if ( local && perm ) 
 | 
					    if ( local && perm ) 
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
@@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
 | 
				
			||||||
    accumReconXm(result,Uchi);
 | 
					    accumReconXm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Ym
 | 
					    // Ym
 | 
				
			||||||
    idx = (Ym+dag*4)%8;
 | 
					    offset = Stencil._offsets [Ym][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Ym][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Ym][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Ym];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjYm(tmp,in._odata[offset]);
 | 
					      spProjYm(tmp,in._odata[offset]);
 | 
				
			||||||
@@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
 | 
				
			||||||
    accumReconYm(result,Uchi);
 | 
					    accumReconYm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Zm
 | 
					    // Zm
 | 
				
			||||||
    idx = (Zm+dag*4)%8;
 | 
					    offset = Stencil._offsets [Zm][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Zm][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Zm][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Zm];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjZm(tmp,in._odata[offset]);
 | 
					      spProjZm(tmp,in._odata[offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
@@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
 | 
				
			||||||
    accumReconZm(result,Uchi);
 | 
					    accumReconZm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Tm
 | 
					    // Tm
 | 
				
			||||||
    idx = (Tm+dag*4)%8;
 | 
					    offset = Stencil._offsets [Tm][ss];
 | 
				
			||||||
    offset = Stencil._offsets [idx][ss];
 | 
					    local  = Stencil._is_local[Tm][ss];
 | 
				
			||||||
    local  = Stencil._is_local[idx][ss];
 | 
					    perm   = Stencil._permute[Tm][ss];
 | 
				
			||||||
    perm   = Stencil._permute[idx][ss];
 | 
					    ptype  = Stencil._permute_type[Tm];
 | 
				
			||||||
    ptype  = Stencil._permute_type[idx];
 | 
					 | 
				
			||||||
    if ( local && perm ) {
 | 
					    if ( local && perm ) {
 | 
				
			||||||
      spProjTm(tmp,in._odata[offset]);
 | 
					      spProjTm(tmp,in._odata[offset]);
 | 
				
			||||||
      permute(chi,tmp,ptype);
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
@@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      chi=comm_buf[offset];
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
 | 
				
			||||||
 | 
					    accumReconTm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    vstream(out._odata[ss],result);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    vHalfSpinColourVector  tmp;    
 | 
				
			||||||
 | 
					    vHalfSpinColourVector  chi;    
 | 
				
			||||||
 | 
					    vSpinColourVector result;
 | 
				
			||||||
 | 
					    vHalfSpinColourVector Uchi;
 | 
				
			||||||
 | 
					    int offset,local,perm, ptype;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int ssu= ss;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Xp
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Xm][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Xm][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Xm][ss];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Xm];
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjXp(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjXp(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
 | 
				
			||||||
 | 
					    spReconXp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Yp
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Ym][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Ym][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Ym][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Ym];
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjYp(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjYp(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
 | 
				
			||||||
 | 
					    accumReconYp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Zp
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Zm][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Zm][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Zm][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Zm];
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjZp(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjZp(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
 | 
				
			||||||
 | 
					    accumReconZp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Tp
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Tm][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Tm][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Tm][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Tm];
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjTp(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjTp(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
 | 
				
			||||||
 | 
					    accumReconTp(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Xm
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Xp][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Xp][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Xp][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Xp];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if ( local && perm ) 
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      spProjXm(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjXm(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
 | 
				
			||||||
 | 
					    accumReconXm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Ym
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Yp][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Yp][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Yp][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Yp];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjYm(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjYm(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
 | 
				
			||||||
 | 
					    accumReconYm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Zm
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Zp][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Zp][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Zp][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Zp];
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjZm(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjZm(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
 | 
				
			||||||
 | 
					    accumReconZm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Tm
 | 
				
			||||||
 | 
					    offset = Stencil._offsets [Tp][ss];
 | 
				
			||||||
 | 
					    local  = Stencil._is_local[Tp][ss];
 | 
				
			||||||
 | 
					    perm   = Stencil._permute[Tp][ss];
 | 
				
			||||||
 | 
					    ptype  = Stencil._permute_type[Tp];
 | 
				
			||||||
 | 
					    if ( local && perm ) {
 | 
				
			||||||
 | 
					      spProjTm(tmp,in._odata[offset]);
 | 
				
			||||||
 | 
					      permute(chi,tmp,ptype);
 | 
				
			||||||
 | 
					    } else if ( local ) {
 | 
				
			||||||
 | 
					      spProjTm(chi,in._odata[offset]);
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					      chi=comm_buf[offset];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
 | 
				
			||||||
    accumReconTm(result,Uchi);
 | 
					    accumReconTm(result,Uchi);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    vstream(out._odata[ss],result);
 | 
					    vstream(out._odata[ss],result);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  assert((dag==0) ||(dag==1));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  WilsonCompressor compressor(dag);
 | 
				
			||||||
 | 
					  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( dag ) {
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
 | 
					    for(int sss=0;sss<grid->oSites();sss++){
 | 
				
			||||||
 | 
					      DhopSiteDag(sss,in,out);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
 | 
					    for(int sss=0;sss<grid->oSites();sss++){
 | 
				
			||||||
 | 
					      DhopSite(sss,in,out);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -46,6 +46,8 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      // non-hermitian hopping term; half cb or both
 | 
					      // non-hermitian hopping term; half cb or both
 | 
				
			||||||
      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
					      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
				
			||||||
 | 
					      void DhopSite   (int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
				
			||||||
 | 
					      void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 | 
					      typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										0
									
								
								lib/simd/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/simd/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										194
									
								
								lib/simd/Grid_sse4.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										194
									
								
								lib/simd/Grid_sse4.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,194 @@
 | 
				
			|||||||
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					/*! @file Grid_sse4.h
 | 
				
			||||||
 | 
					  @brief Optimization libraries
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
 | 
					// Time-stamp: <2015-05-19 17:06:51 neo>
 | 
				
			||||||
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <pmmintrin.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Optimization {
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  struct Vsplat{
 | 
				
			||||||
 | 
					    //Complex float
 | 
				
			||||||
 | 
					    inline __m128 operator()(float a, float b){
 | 
				
			||||||
 | 
					      return _mm_set_ps(b,a,b,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real float
 | 
				
			||||||
 | 
					    inline __m128 operator()(float a){
 | 
				
			||||||
 | 
					      return _mm_set_ps(a,a,a,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex double
 | 
				
			||||||
 | 
					    inline __m128d operator()(double a, double b){
 | 
				
			||||||
 | 
					      return _mm_set_pd(b,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Real double
 | 
				
			||||||
 | 
					    inline __m128d operator()(double a){
 | 
				
			||||||
 | 
					      return _mm_set_pd(a,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline __m128i operator()(Integer a){
 | 
				
			||||||
 | 
					      return _mm_set1_epi32(a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Vstore{
 | 
				
			||||||
 | 
					    //Float 
 | 
				
			||||||
 | 
					    inline void operator()(__m128 a, float* F){
 | 
				
			||||||
 | 
					      _mm_store_ps(F,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Double
 | 
				
			||||||
 | 
					    inline void operator()(__m128d a, double* D){
 | 
				
			||||||
 | 
					      _mm_store_pd(D,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline void operator()(__m128i a, Integer* I){
 | 
				
			||||||
 | 
					      _mm_store_si128((__m128i *)I,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Vset{
 | 
				
			||||||
 | 
					    // Complex float 
 | 
				
			||||||
 | 
					    inline __m128 operator()(Grid::ComplexF *a){
 | 
				
			||||||
 | 
					      return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Complex double 
 | 
				
			||||||
 | 
					    inline __m128d operator()(Grid::ComplexD *a){
 | 
				
			||||||
 | 
					      return _mm_set_pd(a[0].imag(),a[0].real());
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real float 
 | 
				
			||||||
 | 
					    inline __m128 operator()(float *a){
 | 
				
			||||||
 | 
					      return _mm_set_ps(a[3],a[2],a[1],a[0]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real double
 | 
				
			||||||
 | 
					    inline __m128d operator()(double *a){
 | 
				
			||||||
 | 
					      return _mm_set_pd(a[1],a[0]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Integer
 | 
				
			||||||
 | 
					    inline __m128i operator()(Integer *a){
 | 
				
			||||||
 | 
					      return _mm_set_epi32(a[0],a[1],a[2],a[3]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Reduce{
 | 
				
			||||||
 | 
					    //Complex float
 | 
				
			||||||
 | 
					    inline Grid::ComplexF operator()(__m128 in){
 | 
				
			||||||
 | 
					      union {
 | 
				
			||||||
 | 
						__m128 v1;  
 | 
				
			||||||
 | 
						float f[4]; 
 | 
				
			||||||
 | 
					      } u128;
 | 
				
			||||||
 | 
					      u128.v1 = _mm_add_ps(in, _mm_shuffle_ps(in,in, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
 | 
				
			||||||
 | 
					      return Grid::ComplexF(u128.f[0], u128.f[1]);   
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex double
 | 
				
			||||||
 | 
					    inline Grid::ComplexD operator()(__m128d in){
 | 
				
			||||||
 | 
					      printf("Missing complex double implementation -> FIX\n");
 | 
				
			||||||
 | 
					      return Grid::ComplexD(0,0); // FIXME wrong
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // Arithmetic operations
 | 
				
			||||||
 | 
					  /////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  struct Sum{
 | 
				
			||||||
 | 
					    //Complex/Real float
 | 
				
			||||||
 | 
					    inline __m128 operator()(__m128 a, __m128 b){
 | 
				
			||||||
 | 
					      return _mm_add_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex/Real double
 | 
				
			||||||
 | 
					    inline __m128d operator()(__m128d a, __m128d b){
 | 
				
			||||||
 | 
					      return _mm_add_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline __m128i operator()(__m128i a, __m128i b){
 | 
				
			||||||
 | 
					      return _mm_add_epi32(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Sub{
 | 
				
			||||||
 | 
					    //Complex/Real float
 | 
				
			||||||
 | 
					    inline __m128 operator()(__m128 a, __m128 b){
 | 
				
			||||||
 | 
					      return _mm_sub_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex/Real double
 | 
				
			||||||
 | 
					    inline __m128d operator()(__m128d a, __m128d b){
 | 
				
			||||||
 | 
					      return _mm_sub_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline __m128i operator()(__m128i a, __m128i b){
 | 
				
			||||||
 | 
					      return _mm_sub_epi32(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct MultComplex{
 | 
				
			||||||
 | 
					    // Complex float
 | 
				
			||||||
 | 
					    inline __m128 operator()(__m128 a, __m128 b){
 | 
				
			||||||
 | 
					      __m128 ymm0,ymm1,ymm2;
 | 
				
			||||||
 | 
					      ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
 | 
				
			||||||
 | 
					      ymm0 = _mm_mul_ps(ymm0,b);        // ymm0 <- ar bi, ar br
 | 
				
			||||||
 | 
					      ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
 | 
				
			||||||
 | 
					      ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
 | 
				
			||||||
 | 
					      ymm1 = _mm_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
 | 
				
			||||||
 | 
					      return _mm_addsub_ps(ymm0,ymm1);    
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Complex double
 | 
				
			||||||
 | 
					    inline __m128d operator()(__m128d a, __m128d b){
 | 
				
			||||||
 | 
					      __m128d ymm0,ymm1,ymm2;
 | 
				
			||||||
 | 
					      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
 | 
				
			||||||
 | 
					      ymm0 = _mm_mul_pd(ymm0,b);        // ymm0 <- ar bi, ar br
 | 
				
			||||||
 | 
					      ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi   b01
 | 
				
			||||||
 | 
					      ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai   b11
 | 
				
			||||||
 | 
					      ymm1 = _mm_mul_pd(ymm1,ymm2);       // ymm1 <- br ai, ai bi
 | 
				
			||||||
 | 
					      return _mm_addsub_pd(ymm0,ymm1);  
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					    // Real float
 | 
				
			||||||
 | 
					    inline __m128 operator()(__m128 a, __m128 b){
 | 
				
			||||||
 | 
					      return _mm_mul_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real double
 | 
				
			||||||
 | 
					    inline __m128d operator()(__m128d a, __m128d b){
 | 
				
			||||||
 | 
					      return _mm_mul_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Integer
 | 
				
			||||||
 | 
					    inline __m128i operator()(__m128i a, __m128i b){
 | 
				
			||||||
 | 
					      return _mm_mul_epi32(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Here assign types 
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					  typedef __m128 SIMD_Ftype;  // Single precision type
 | 
				
			||||||
 | 
					  typedef __m128d SIMD_Dtype; // Double precision type
 | 
				
			||||||
 | 
					  typedef __m128i SIMD_Itype; // Integer type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Function names
 | 
				
			||||||
 | 
					  typedef Optimization::Vsplat VsplatSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Vstore VstoreSIMD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Arithmetic operations
 | 
				
			||||||
 | 
					  typedef Optimization::Sum         SumSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Sub         SubSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Mult        MultSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::MultComplex MultComplexSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Vset        VsetSIMD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -1,6 +1,14 @@
 | 
				
			|||||||
 | 
					//---------------------------------------------------------------------------
 | 
				
			||||||
 | 
					/*! @file Grid_vector_types.h
 | 
				
			||||||
 | 
					  @brief Defines templated class Grid_simd to deal with inner vector types
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
 | 
					// Time-stamp: <2015-05-19 17:20:36 neo>
 | 
				
			||||||
 | 
					//---------------------------------------------------------------------------
 | 
				
			||||||
#ifndef GRID_VECTOR_TYPES
 | 
					#ifndef GRID_VECTOR_TYPES
 | 
				
			||||||
#define GRID_VECTOR_TYPES
 | 
					#define GRID_VECTOR_TYPES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "Grid_sse4.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -13,31 +21,32 @@ namespace Grid {
 | 
				
			|||||||
    struct RealPart< std::complex<T> >{
 | 
					    struct RealPart< std::complex<T> >{
 | 
				
			||||||
    typedef T type;
 | 
					    typedef T type;
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  ////////////////////////////////////////////////////////
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////////////////////////////
 | 
				
			||||||
  // Check for complexity with type traits
 | 
					  // Check for complexity with type traits
 | 
				
			||||||
  template <typename T> 
 | 
					  template <typename T> 
 | 
				
			||||||
    struct is_complex : std::false_type {};
 | 
					    struct is_complex : std::false_type {};
 | 
				
			||||||
  template < typename T > 
 | 
					  template < typename T > 
 | 
				
			||||||
    struct is_complex< std::complex<T> >: std::true_type {};
 | 
					    struct is_complex< std::complex<T> >: std::true_type {};
 | 
				
			||||||
  ////////////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Define the operation templates functors
 | 
					  // Define the operation templates functors
 | 
				
			||||||
  template < class SIMD, class Operation > 
 | 
					  // general forms to allow for vsplat syntax
 | 
				
			||||||
    SIMD binary(SIMD src_1, SIMD src_2, Operation op){
 | 
					  // need explicit declaration of types when used since
 | 
				
			||||||
 | 
					  // clang cannot automatically determine the output type sometimes
 | 
				
			||||||
 | 
					  template < class Out, class Input1, class Input2, class Operation > 
 | 
				
			||||||
 | 
					    Out binary(Input1 src_1, Input2 src_2, Operation op){
 | 
				
			||||||
    return op(src_1, src_2);
 | 
					    return op(src_1, src_2);
 | 
				
			||||||
  } 
 | 
					  } 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template < class SIMD, class Operation > 
 | 
					  template < class SIMDout, class Input, class Operation > 
 | 
				
			||||||
    SIMD unary(SIMD src, Operation op){
 | 
					    SIMDout unary(Input src, Operation op){
 | 
				
			||||||
    return op(src);
 | 
					    return op(src);
 | 
				
			||||||
  } 
 | 
					  } 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ///////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /*
 | 
					  /*
 | 
				
			||||||
    @brief Grid_simd class for the SIMD vector type operations
 | 
					    @brief Grid_simd class for the SIMD vector type operations
 | 
				
			||||||
 | 
					 | 
				
			||||||
   */
 | 
					   */
 | 
				
			||||||
  template < class Scalar_type, class Vector_type > 
 | 
					  template < class Scalar_type, class Vector_type > 
 | 
				
			||||||
    class Grid_simd {
 | 
					    class Grid_simd {
 | 
				
			||||||
@@ -70,75 +79,90 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
       
 | 
					       
 | 
				
			||||||
       
 | 
					 | 
				
			||||||
    ///////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////
 | 
				
			||||||
    // mac, mult, sub, add, adj
 | 
					    // mac, mult, sub, add, adj
 | 
				
			||||||
    // Should do an AVX2 version with mac.
 | 
					 | 
				
			||||||
    ///////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////
 | 
				
			||||||
    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
 | 
					    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
 | 
				
			||||||
    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
 | 
					    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
 | 
				
			||||||
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
 | 
					    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
 | 
				
			||||||
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 | 
					    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    //not for integer types... FIXME
 | 
				
			||||||
    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
 | 
					    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
    //////////////////////////////////
 | 
					    ///////////////////////////////////////////////
 | 
				
			||||||
    // Initialise to 1,0,i
 | 
					    // Initialise to 1,0,i for the correct types
 | 
				
			||||||
    //////////////////////////////////
 | 
					    ///////////////////////////////////////////////
 | 
				
			||||||
    // if not complex overload here 
 | 
					    // if not complex overload here 
 | 
				
			||||||
 | 
					    template <  class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
 | 
					      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
 | 
				
			||||||
 | 
					    template <  class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
 | 
					      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    // overload for complex type
 | 
					    // overload for complex type
 | 
				
			||||||
    template <  class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
					    template <  class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0,0.0); }
 | 
					      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0,0.0); }
 | 
				
			||||||
    template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
					    template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
    friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0,0.0); }
 | 
					      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0,0.0); }// use xor?
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    // For integral type
 | 
					    // For integral type
 | 
				
			||||||
    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
 | 
					    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
 | 
				
			||||||
      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1); }
 | 
					      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1); }
 | 
				
			||||||
    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
 | 
					    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
 | 
				
			||||||
      friend inline void vzero(Grid_simd &ret)      { vsplat(ret,0); }
 | 
					      friend inline void vzero(Grid_simd &ret)      { vsplat(ret,0); }
 | 
				
			||||||
 | 
					    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
 | 
				
			||||||
 | 
					      friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
 | 
				
			||||||
 | 
					    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
 | 
				
			||||||
 | 
					      friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // do not compile if real or integer, send an error message from the compiler
 | 
					    // do not compile if real or integer, send an error message from the compiler
 | 
				
			||||||
    template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
					    template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
    friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
 | 
					    friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
 | 
				
			||||||
   
 | 
					   
 | 
				
			||||||
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
   
 | 
					 | 
				
			||||||
    ////////////////////////////////////
 | 
					    ////////////////////////////////////
 | 
				
			||||||
    // Arithmetic operator overloads +,-,*
 | 
					    // Arithmetic operator overloads +,-,*
 | 
				
			||||||
    ////////////////////////////////////
 | 
					    ////////////////////////////////////
 | 
				
			||||||
    friend inline Grid_simd operator + (Grid_simd a, Grid_simd b)
 | 
					    friend inline Grid_simd operator + (Grid_simd a, Grid_simd b)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      vComplexF ret;
 | 
					      Grid_simd ret;
 | 
				
			||||||
      // FIXME call the binary op
 | 
					      ret.v = binary<Vector_type>(a.v, b.v, SumSIMD());
 | 
				
			||||||
      return ret;
 | 
					      return ret;
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
    friend inline Grid_simd operator - (Grid_simd a, Grid_simd b)
 | 
					    friend inline Grid_simd operator - (Grid_simd a, Grid_simd b)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      vComplexF ret;
 | 
					      Grid_simd ret;
 | 
				
			||||||
      // FIXME call the binary op
 | 
					      ret.v = binary<Vector_type>(a.v, b.v, SubSIMD());
 | 
				
			||||||
      return ret;
 | 
					      return ret;
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
 | 
					    // Distinguish between complex types and others
 | 
				
			||||||
 | 
					    template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 >
 | 
				
			||||||
      friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
 | 
					      friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
	vComplexF ret;
 | 
						Grid_simd ret;
 | 
				
			||||||
	// FIXME call the binary op
 | 
						ret.v = binary<Vector_type>(a.v,b.v, MultComplexSIMD());
 | 
				
			||||||
	return ret;
 | 
						return ret;
 | 
				
			||||||
      };
 | 
					      };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Real/Integer types
 | 
				
			||||||
 | 
					    template <  class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
 | 
					    friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
						Grid_simd ret;
 | 
				
			||||||
 | 
						ret.v = binary<Vector_type>(a.v,b.v, MultSIMD());
 | 
				
			||||||
 | 
						return ret;
 | 
				
			||||||
 | 
					      };
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
    // FIXME:  gonna remove these load/store, get, set, prefetch
 | 
					    // FIXME:  gonna remove these load/store, get, set, prefetch
 | 
				
			||||||
    ////////////////////////////////////////////////////////////////////////
 | 
					    ////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
    friend inline void vset(Grid_simd &ret, Scalar_type *a){
 | 
					    friend inline void vset(Grid_simd &ret, Scalar_type *a){
 | 
				
			||||||
      // FIXME set 
 | 
					      ret.v = unary<Vector_type>(a, VsetSIMD());
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
    ///////////////////////
 | 
					    ///////////////////////
 | 
				
			||||||
@@ -155,26 +179,25 @@ namespace Grid {
 | 
				
			|||||||
    // this only for the complex version
 | 
					    // this only for the complex version
 | 
				
			||||||
    template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
					    template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
 | 
				
			||||||
    friend inline void vsplat(Grid_simd &ret,Real a, Real b){
 | 
					    friend inline void vsplat(Grid_simd &ret,Real a, Real b){
 | 
				
			||||||
      // FIXME add operator
 | 
					      ret.v = binary<Vector_type>(a, b, VsplatSIMD());
 | 
				
			||||||
    }    
 | 
					    }    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    //if real fill with a, if complex fill with a in the real part
 | 
					    //if real fill with a, if complex fill with a in the real part (first function above)
 | 
				
			||||||
    friend inline void vsplat(Grid_simd &ret,Real a){
 | 
					    friend inline void vsplat(Grid_simd &ret,Real a){
 | 
				
			||||||
      // FIXME add operator
 | 
					      ret.v = unary<Vector_type>(a, VsplatSIMD());
 | 
				
			||||||
    }    
 | 
					    }    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
    friend inline void vstore(const Grid_simd &ret, Scalar_type *a){
 | 
					    friend inline void vstore(const Grid_simd &ret, Scalar_type *a){
 | 
				
			||||||
      //FIXME
 | 
					      binary<void>(ret.v, (Real*)a, VstoreSIMD());
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    friend inline void vprefetch(const Grid_simd &v)
 | 
					    friend inline void vprefetch(const Grid_simd &v)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
					      _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
    friend inline Scalar_type Reduce(const Grid_simd & in)
 | 
					    friend inline Scalar_type Reduce(const Grid_simd & in)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      // FIXME add operator
 | 
					      // FIXME add operator
 | 
				
			||||||
@@ -221,6 +244,7 @@ namespace Grid {
 | 
				
			|||||||
    inline Grid_simd &operator *=(const Grid_simd &r) {
 | 
					    inline Grid_simd &operator *=(const Grid_simd &r) {
 | 
				
			||||||
      *this = (*this)*r;
 | 
					      *this = (*this)*r;
 | 
				
			||||||
      return *this;
 | 
					      return *this;
 | 
				
			||||||
 | 
					      // return (*this)*r; ?
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    inline Grid_simd &operator +=(const Grid_simd &r) {
 | 
					    inline Grid_simd &operator +=(const Grid_simd &r) {
 | 
				
			||||||
      *this = *this+r;
 | 
					      *this = *this+r;
 | 
				
			||||||
@@ -233,6 +257,12 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        Gpermute<Grid_simd>(y,b,perm);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					     /*
 | 
				
			||||||
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
 | 
					    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      Gpermute<Grid_simd>(y,b,perm);
 | 
					      Gpermute<Grid_simd>(y,b,perm);
 | 
				
			||||||
@@ -253,7 +283,7 @@ namespace Grid {
 | 
				
			|||||||
    {
 | 
					    {
 | 
				
			||||||
      Gextract<Grid_simd,Scalar_type>(y,extracted);
 | 
					      Gextract<Grid_simd,Scalar_type>(y,extracted);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					     */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  };// end of Grid_simd class definition 
 | 
					  };// end of Grid_simd class definition 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -286,11 +316,11 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Define available types (now change names to avoid clashing)
 | 
					  // Define available types (now change names to avoid clashing)
 | 
				
			||||||
  typedef __m128 SIMD_type;// decided at compilation time
 | 
					
 | 
				
			||||||
  typedef Grid_simd< float                 , SIMD_type > MyRealF;
 | 
					  typedef Grid_simd< float                 , SIMD_Ftype > MyRealF;
 | 
				
			||||||
  typedef Grid_simd< double                , SIMD_type > MyRealD;
 | 
					  typedef Grid_simd< double                , SIMD_Dtype > MyRealD;
 | 
				
			||||||
  typedef Grid_simd< std::complex< float > , SIMD_type > MyComplexF;
 | 
					  typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
 | 
				
			||||||
  typedef Grid_simd< std::complex< double >, SIMD_type > MyComplexD;
 | 
					  typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										9
									
								
								scripts/bench_wilson.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										9
									
								
								scripts/bench_wilson.sh
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,9 @@
 | 
				
			|||||||
 | 
					for omp in 1 2 4
 | 
				
			||||||
 | 
					do
 | 
				
			||||||
 | 
					echo > wilson.t$omp
 | 
				
			||||||
 | 
					for vol in 4.4.4.4 4.4.4.8 4.4.8.8  4.8.8.8  8.8.8.8   8.8.8.16 8.8.16.16  8.16.16.16
 | 
				
			||||||
 | 
					do   
 | 
				
			||||||
 | 
					perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp  | grep mflop | awk '{print $3}'`
 | 
				
			||||||
 | 
					echo $vol $perf >> wilson.t$omp
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
							
								
								
									
										7
									
								
								scripts/wilson.gnu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								scripts/wilson.gnu
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,7 @@
 | 
				
			|||||||
 | 
					plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
 | 
				
			||||||
 | 
					replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
 | 
				
			||||||
 | 
					replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
 | 
				
			||||||
 | 
					set terminal 'pdf'
 | 
				
			||||||
 | 
					set output 'wilson_clang.pdf'
 | 
				
			||||||
 | 
					replot
 | 
				
			||||||
 | 
					quit
 | 
				
			||||||
@@ -1,5 +1,9 @@
 | 
				
			|||||||
#include "Grid.h"
 | 
					#include "Grid.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//DEBUG
 | 
				
			||||||
 | 
					#include "simd/Grid_vector_types.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
using namespace std;
 | 
					using namespace std;
 | 
				
			||||||
using namespace Grid;
 | 
					using namespace Grid;
 | 
				
			||||||
using namespace Grid::QCD;
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
@@ -151,6 +155,39 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    scMat = sMat*scMat;  // LatticeSpinColourMatrix = LatticeSpinMatrix       * LatticeSpinColourMatrix
 | 
					    scMat = sMat*scMat;  // LatticeSpinColourMatrix = LatticeSpinMatrix       * LatticeSpinColourMatrix
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef SSE4
 | 
				
			||||||
 | 
					    ///////// Tests the new class Grid_simd 
 | 
				
			||||||
 | 
					    std::complex<double> ctest(3.0,2.0);
 | 
				
			||||||
 | 
					    std::complex<float> ctestf(3.0,2.0);
 | 
				
			||||||
 | 
					    MyComplexF TestMe1(1.0); // fill real part
 | 
				
			||||||
 | 
					    MyComplexD TestMe2(ctest);
 | 
				
			||||||
 | 
					    MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
 | 
				
			||||||
 | 
					    //MyRealF TestMe5(ctest);// Must generate compiler error
 | 
				
			||||||
 | 
					    MyRealD TestMe4(2.0); 
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					    MyComplexF TestMe6(ctestf);
 | 
				
			||||||
 | 
					    MyComplexF TestMe7(ctestf);  
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    MyComplexD TheSum= TestMe2*TestMe3;
 | 
				
			||||||
 | 
					    MyComplexF TheSumF= TestMe6*TestMe7;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double dsum[2];
 | 
				
			||||||
 | 
					    _mm_store_pd(dsum, TheSum.v);
 | 
				
			||||||
 | 
					    for (int i =0; i< 2; i++)
 | 
				
			||||||
 | 
					      printf("%f\n", dsum[i]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    float fsum[4];
 | 
				
			||||||
 | 
					    _mm_store_ps(fsum, TheSumF.v);
 | 
				
			||||||
 | 
					    for (int i =0; i< 4; i++)
 | 
				
			||||||
 | 
					      printf("%f\n", fsum[i]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    vstore(TheSum, &ctest);
 | 
				
			||||||
 | 
					    std::cout << ctest<< std::endl;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					    ///////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Non-lattice (const objects) * Lattice
 | 
					    // Non-lattice (const objects) * Lattice
 | 
				
			||||||
    ColourMatrix cm;
 | 
					    ColourMatrix cm;
 | 
				
			||||||
    SpinColourMatrix scm;
 | 
					    SpinColourMatrix scm;
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user