mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Got unpreconditioned conjugate gradient to run and converge on a random (uniform random,
not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop.
This commit is contained in:
		
							
								
								
									
										44
									
								
								Makefile.in
									
									
									
									
									
								
							
							
						
						
									
										44
									
								
								Makefile.in
									
									
									
									
									
								
							@@ -1,7 +1,7 @@
 | 
			
		||||
# Makefile.in generated by automake 1.14.1 from Makefile.am.
 | 
			
		||||
# Makefile.in generated by automake 1.15 from Makefile.am.
 | 
			
		||||
# @configure_input@
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1994-2014 Free Software Foundation, Inc.
 | 
			
		||||
 | 
			
		||||
# This Makefile.in is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -14,7 +14,17 @@
 | 
			
		||||
 | 
			
		||||
@SET_MAKE@
 | 
			
		||||
VPATH = @srcdir@
 | 
			
		||||
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
 | 
			
		||||
am__is_gnu_make = { \
 | 
			
		||||
  if test -z '$(MAKELEVEL)'; then \
 | 
			
		||||
    false; \
 | 
			
		||||
  elif test -n '$(MAKE_HOST)'; then \
 | 
			
		||||
    true; \
 | 
			
		||||
  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
 | 
			
		||||
    true; \
 | 
			
		||||
  else \
 | 
			
		||||
    false; \
 | 
			
		||||
  fi; \
 | 
			
		||||
}
 | 
			
		||||
am__make_running_with_option = \
 | 
			
		||||
  case $${target_option-} in \
 | 
			
		||||
      ?) ;; \
 | 
			
		||||
@@ -79,14 +89,12 @@ build_triplet = @build@
 | 
			
		||||
host_triplet = @host@
 | 
			
		||||
target_triplet = @target@
 | 
			
		||||
subdir = .
 | 
			
		||||
DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
 | 
			
		||||
	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 | 
			
		||||
	$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
 | 
			
		||||
	compile config.guess config.sub depcomp install-sh missing
 | 
			
		||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 | 
			
		||||
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
 | 
			
		||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 | 
			
		||||
	$(ACLOCAL_M4)
 | 
			
		||||
DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
 | 
			
		||||
	$(am__configure_deps) $(am__DIST_COMMON)
 | 
			
		||||
am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
 | 
			
		||||
 configure.lineno config.status.lineno
 | 
			
		||||
mkinstalldirs = $(install_sh) -d
 | 
			
		||||
@@ -149,6 +157,9 @@ ETAGS = etags
 | 
			
		||||
CTAGS = ctags
 | 
			
		||||
CSCOPE = cscope
 | 
			
		||||
DIST_SUBDIRS = $(SUBDIRS)
 | 
			
		||||
am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
 | 
			
		||||
	INSTALL NEWS README TODO compile config.guess config.sub \
 | 
			
		||||
	depcomp install-sh missing
 | 
			
		||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 | 
			
		||||
distdir = $(PACKAGE)-$(VERSION)
 | 
			
		||||
top_distdir = $(distdir)
 | 
			
		||||
@@ -314,7 +325,6 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 | 
			
		||||
	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
 | 
			
		||||
	$(am__cd) $(top_srcdir) && \
 | 
			
		||||
	  $(AUTOMAKE) --gnu Makefile
 | 
			
		||||
.PRECIOUS: Makefile
 | 
			
		||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 | 
			
		||||
	@case '$?' in \
 | 
			
		||||
	  *config.status*) \
 | 
			
		||||
@@ -521,15 +531,15 @@ dist-xz: distdir
 | 
			
		||||
	$(am__post_remove_distdir)
 | 
			
		||||
 | 
			
		||||
dist-tarZ: distdir
 | 
			
		||||
	@echo WARNING: "Support for shar distribution archives is" \
 | 
			
		||||
	               "deprecated." >&2
 | 
			
		||||
	@echo WARNING: "Support for distribution archives compressed with" \
 | 
			
		||||
		       "legacy program 'compress' is deprecated." >&2
 | 
			
		||||
	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 | 
			
		||||
	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
 | 
			
		||||
	$(am__post_remove_distdir)
 | 
			
		||||
 | 
			
		||||
dist-shar: distdir
 | 
			
		||||
	@echo WARNING: "Support for distribution archives compressed with" \
 | 
			
		||||
		       "legacy program 'compress' is deprecated." >&2
 | 
			
		||||
	@echo WARNING: "Support for shar distribution archives is" \
 | 
			
		||||
	               "deprecated." >&2
 | 
			
		||||
	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 | 
			
		||||
	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
 | 
			
		||||
	$(am__post_remove_distdir)
 | 
			
		||||
@@ -565,17 +575,17 @@ distcheck: dist
 | 
			
		||||
	esac
 | 
			
		||||
	chmod -R a-w $(distdir)
 | 
			
		||||
	chmod u+w $(distdir)
 | 
			
		||||
	mkdir $(distdir)/_build $(distdir)/_inst
 | 
			
		||||
	mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
 | 
			
		||||
	chmod a-w $(distdir)
 | 
			
		||||
	test -d $(distdir)/_build || exit 0; \
 | 
			
		||||
	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
 | 
			
		||||
	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
 | 
			
		||||
	  && am__cwd=`pwd` \
 | 
			
		||||
	  && $(am__cd) $(distdir)/_build \
 | 
			
		||||
	  && ../configure \
 | 
			
		||||
	  && $(am__cd) $(distdir)/_build/sub \
 | 
			
		||||
	  && ../../configure \
 | 
			
		||||
	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 | 
			
		||||
	    $(DISTCHECK_CONFIGURE_FLAGS) \
 | 
			
		||||
	    --srcdir=.. --prefix="$$dc_install_base" \
 | 
			
		||||
	    --srcdir=../.. --prefix="$$dc_install_base" \
 | 
			
		||||
	  && $(MAKE) $(AM_MAKEFLAGS) \
 | 
			
		||||
	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
 | 
			
		||||
	  && $(MAKE) $(AM_MAKEFLAGS) check \
 | 
			
		||||
@@ -749,6 +759,8 @@ uninstall-am:
 | 
			
		||||
	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
 | 
			
		||||
	pdf-am ps ps-am tags tags-am uninstall uninstall-am
 | 
			
		||||
 | 
			
		||||
.PRECIOUS: Makefile
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
 | 
			
		||||
# Otherwise a system limit (for SysV at least) may be exceeded.
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										61
									
								
								aclocal.m4
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										61
									
								
								aclocal.m4
									
									
									
									
										vendored
									
									
								
							@@ -1,6 +1,6 @@
 | 
			
		||||
# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
 | 
			
		||||
# generated automatically by aclocal 1.15 -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 | 
			
		||||
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 | 
			
		||||
If you have problems, you may need to regenerate the build system entirely.
 | 
			
		||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2002-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
 | 
			
		||||
# generated from the m4 files accompanying Automake X.Y.
 | 
			
		||||
# (This private macro should not be called outside this file.)
 | 
			
		||||
AC_DEFUN([AM_AUTOMAKE_VERSION],
 | 
			
		||||
[am__api_version='1.14'
 | 
			
		||||
[am__api_version='1.15'
 | 
			
		||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 | 
			
		||||
dnl require some minimum version.  Point them to the right macro.
 | 
			
		||||
m4_if([$1], [1.14.1], [],
 | 
			
		||||
m4_if([$1], [1.15], [],
 | 
			
		||||
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 | 
			
		||||
])
 | 
			
		||||
 | 
			
		||||
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 | 
			
		||||
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 | 
			
		||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 | 
			
		||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
 | 
			
		||||
[AM_AUTOMAKE_VERSION([1.14.1])dnl
 | 
			
		||||
[AM_AUTOMAKE_VERSION([1.15])dnl
 | 
			
		||||
m4_ifndef([AC_AUTOCONF_VERSION],
 | 
			
		||||
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 | 
			
		||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 | 
			
		||||
 | 
			
		||||
# AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 | 
			
		||||
# configured tree to be moved without reconfiguration.
 | 
			
		||||
 | 
			
		||||
AC_DEFUN([AM_AUX_DIR_EXPAND],
 | 
			
		||||
[dnl Rely on autoconf to set up CDPATH properly.
 | 
			
		||||
AC_PREREQ([2.50])dnl
 | 
			
		||||
# expand $ac_aux_dir to an absolute path
 | 
			
		||||
am_aux_dir=`cd $ac_aux_dir && pwd`
 | 
			
		||||
[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
 | 
			
		||||
# Expand $ac_aux_dir to an absolute path.
 | 
			
		||||
am_aux_dir=`cd "$ac_aux_dir" && pwd`
 | 
			
		||||
])
 | 
			
		||||
 | 
			
		||||
# AM_CONDITIONAL                                            -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
 | 
			
		||||
Usually this means the macro was only invoked conditionally.]])
 | 
			
		||||
fi])])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 | 
			
		||||
 | 
			
		||||
# Generate code to set up dependency tracking.              -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 | 
			
		||||
 | 
			
		||||
# Do all the work for Automake.                             -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
 | 
			
		||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
 | 
			
		||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 | 
			
		||||
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
 | 
			
		||||
# We need awk for the "check" target.  The system "awk" is bad on
 | 
			
		||||
# some platforms.
 | 
			
		||||
# We need awk for the "check" target (and possibly the TAP driver).  The
 | 
			
		||||
# system "awk" is bad on some platforms.
 | 
			
		||||
AC_REQUIRE([AC_PROG_AWK])dnl
 | 
			
		||||
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 | 
			
		||||
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
 | 
			
		||||
@@ -573,7 +572,11 @@ to "yes", and re-run configure.
 | 
			
		||||
END
 | 
			
		||||
    AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
 | 
			
		||||
  fi
 | 
			
		||||
fi])
 | 
			
		||||
fi
 | 
			
		||||
dnl The trailing newline in this macro's definition is deliberate, for
 | 
			
		||||
dnl backward compatibility and to allow trailing 'dnl'-style comments
 | 
			
		||||
dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
 | 
			
		||||
])
 | 
			
		||||
 | 
			
		||||
dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 | 
			
		||||
dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
 | 
			
		||||
@@ -602,7 +605,7 @@ for _am_header in $config_headers :; do
 | 
			
		||||
done
 | 
			
		||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
 | 
			
		||||
# Define $install_sh.
 | 
			
		||||
AC_DEFUN([AM_PROG_INSTALL_SH],
 | 
			
		||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
 | 
			
		||||
if test x"${install_sh}" != xset; then
 | 
			
		||||
if test x"${install_sh+set}" != xset; then
 | 
			
		||||
  case $am_aux_dir in
 | 
			
		||||
  *\ * | *\	*)
 | 
			
		||||
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
 | 
			
		||||
@@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then
 | 
			
		||||
fi
 | 
			
		||||
AC_SUBST([install_sh])])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2003-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])])
 | 
			
		||||
 | 
			
		||||
# Check to see how 'make' treats includes.	            -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -694,7 +697,7 @@ rm -f confinc confmf
 | 
			
		||||
 | 
			
		||||
# Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -733,7 +736,7 @@ fi
 | 
			
		||||
 | 
			
		||||
# Helper functions for option handling.                     -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION],
 | 
			
		||||
 | 
			
		||||
# Check to make sure that the build environment is sane.    -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE(
 | 
			
		||||
rm -f conftest.file
 | 
			
		||||
])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2009-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 | 
			
		||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 | 
			
		||||
])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -933,7 +936,7 @@ fi
 | 
			
		||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 | 
			
		||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2006-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
@@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 | 
			
		||||
 | 
			
		||||
# Check how to create a tarball.                            -*- Autoconf -*-
 | 
			
		||||
 | 
			
		||||
# Copyright (C) 2004-2013 Free Software Foundation, Inc.
 | 
			
		||||
# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 | 
			
		||||
#
 | 
			
		||||
# This file is free software; the Free Software Foundation
 | 
			
		||||
# gives unlimited permission to copy and/or distribute it,
 | 
			
		||||
 
 | 
			
		||||
@@ -46,6 +46,13 @@ int main (int argc, char ** argv)
 | 
			
		||||
    volume=volume*latt_size[mu];
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  // Only one non-zero (y)
 | 
			
		||||
  Umu=zero;
 | 
			
		||||
  for(int nn=0;nn<Nd;nn++){
 | 
			
		||||
    random(pRNG,U[nn]);
 | 
			
		||||
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
  }
 | 
			
		||||
@@ -53,7 +60,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
  { // Naive wilson implementation
 | 
			
		||||
    ref = zero;
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 | 
			
		||||
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
 | 
			
		||||
      tmp = U[mu]*Cshift(src,mu,1);
 | 
			
		||||
      for(int i=0;i<ref._odata.size();i++){
 | 
			
		||||
@@ -75,7 +81,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  int ncall=1000;
 | 
			
		||||
  double t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.M(src,result);
 | 
			
		||||
    Dw.Dhop(src,result,0);
 | 
			
		||||
  }
 | 
			
		||||
  double t1=usecond();
 | 
			
		||||
  double flops=1320*volume*ncall;
 | 
			
		||||
@@ -99,5 +105,30 @@ int main (int argc, char ** argv)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  { // Naive wilson dag implementation
 | 
			
		||||
    ref = zero;
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 | 
			
		||||
      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
 | 
			
		||||
      tmp = U[mu]*Cshift(src,mu,1);
 | 
			
		||||
      for(int i=0;i<ref._odata.size();i++){
 | 
			
		||||
	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      tmp =adj(U[mu])*src;
 | 
			
		||||
      tmp =Cshift(tmp,mu,-1);
 | 
			
		||||
      for(int i=0;i<ref._odata.size();i++){
 | 
			
		||||
	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  Dw.Dhop(src,result,1);
 | 
			
		||||
  std::cout << "Called DwDag"<<std::endl;
 | 
			
		||||
  std::cout << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
  err = ref -result;
 | 
			
		||||
  std::cout << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										56
									
								
								benchmarks/Grid_wilson_cg_unprec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								benchmarks/Grid_wilson_cg_unprec.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,56 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::GammaMatrix Gmu [] = {
 | 
			
		||||
    Gamma::GammaX,
 | 
			
		||||
    Gamma::GammaY,
 | 
			
		||||
    Gamma::GammaZ,
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src(&Grid); random(pRNG,src);
 | 
			
		||||
  std::cout << "src norm" << norm2(src)<<std::endl;
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,&Grid);
 | 
			
		||||
 | 
			
		||||
  double volume=1;
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    volume=volume*latt_size[mu];
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  RealD mass=0.5;
 | 
			
		||||
  WilsonMatrix Dw(Umu,mass);
 | 
			
		||||
  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
 | 
			
		||||
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
 | 
			
		||||
  CG(HermOp,src,result);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -5,12 +5,14 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 | 
			
		||||
#
 | 
			
		||||
# Test code
 | 
			
		||||
#
 | 
			
		||||
bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3
 | 
			
		||||
 | 
			
		||||
bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec
 | 
			
		||||
 | 
			
		||||
Grid_wilson_SOURCES = Grid_wilson.cc
 | 
			
		||||
Grid_wilson_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
 | 
			
		||||
Grid_wilson_cg_unprec_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_comms_SOURCES = Grid_comms.cc
 | 
			
		||||
Grid_comms_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										2
									
								
								config.guess
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								config.guess
									
									
									
									
										vendored
									
									
								
							@@ -1 +1 @@
 | 
			
		||||
/usr/share/automake-1.14/config.guess
 | 
			
		||||
/opt/local/share/automake-1.15/config.guess
 | 
			
		||||
							
								
								
									
										2
									
								
								config.sub
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								config.sub
									
									
									
									
										vendored
									
									
								
							@@ -1 +1 @@
 | 
			
		||||
/usr/share/automake-1.14/config.sub
 | 
			
		||||
/opt/local/share/automake-1.15/config.sub
 | 
			
		||||
							
								
								
									
										13
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										13
									
								
								configure
									
									
									
									
										vendored
									
									
								
							@@ -2466,7 +2466,7 @@ test -n "$target_alias" &&
 | 
			
		||||
    NONENONEs,x,x, &&
 | 
			
		||||
  program_prefix=${target_alias}-
 | 
			
		||||
 | 
			
		||||
am__api_version='1.14'
 | 
			
		||||
am__api_version='1.15'
 | 
			
		||||
 | 
			
		||||
# Find a good install program.  We prefer a C program (faster),
 | 
			
		||||
# so one script is as good as another.  But avoid the broken or
 | 
			
		||||
@@ -2638,8 +2638,8 @@ test "$program_suffix" != NONE &&
 | 
			
		||||
ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 | 
			
		||||
program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 | 
			
		||||
 | 
			
		||||
# expand $ac_aux_dir to an absolute path
 | 
			
		||||
am_aux_dir=`cd $ac_aux_dir && pwd`
 | 
			
		||||
# Expand $ac_aux_dir to an absolute path.
 | 
			
		||||
am_aux_dir=`cd "$ac_aux_dir" && pwd`
 | 
			
		||||
 | 
			
		||||
if test x"${MISSING+set}" != xset; then
 | 
			
		||||
  case $am_aux_dir in
 | 
			
		||||
@@ -2658,7 +2658,7 @@ else
 | 
			
		||||
$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if test x"${install_sh}" != xset; then
 | 
			
		||||
if test x"${install_sh+set}" != xset; then
 | 
			
		||||
  case $am_aux_dir in
 | 
			
		||||
  *\ * | *\	*)
 | 
			
		||||
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
 | 
			
		||||
@@ -2986,8 +2986,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 | 
			
		||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 | 
			
		||||
mkdir_p='$(MKDIR_P)'
 | 
			
		||||
 | 
			
		||||
# We need awk for the "check" target.  The system "awk" is bad on
 | 
			
		||||
# some platforms.
 | 
			
		||||
# We need awk for the "check" target (and possibly the TAP driver).  The
 | 
			
		||||
# system "awk" is bad on some platforms.
 | 
			
		||||
# Always define AMTAR for backward compatibility.  Yes, it's still used
 | 
			
		||||
# in the wild :-(  We should find a proper way to deprecate it ...
 | 
			
		||||
AMTAR='$${TAR-tar}'
 | 
			
		||||
@@ -3046,6 +3046,7 @@ END
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
ac_config_headers="$ac_config_headers lib/Grid_config.h"
 | 
			
		||||
 | 
			
		||||
# Check whether --enable-silent-rules was given.
 | 
			
		||||
 
 | 
			
		||||
@@ -50,15 +50,20 @@ namespace Grid {
 | 
			
		||||
  typedef std::complex<Real>  Complex;
 | 
			
		||||
 | 
			
		||||
  inline RealF adj(const RealF  & r){ return r; }
 | 
			
		||||
  inline RealF conj(const RealF  & r){ return r; }
 | 
			
		||||
  inline RealF conjugate(const RealF  & r){ return r; }
 | 
			
		||||
  inline RealF real(const RealF  & r){ return r; }
 | 
			
		||||
 | 
			
		||||
  inline RealD adj(const RealD  & r){ return r; }
 | 
			
		||||
  inline RealD conj(const RealD  & r){ return r; }
 | 
			
		||||
  inline RealD conjugate(const RealD  & r){ return r; }
 | 
			
		||||
  inline RealD real(const RealD  & r){ return r; }
 | 
			
		||||
 | 
			
		||||
  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
 | 
			
		||||
  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
 | 
			
		||||
  inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
 | 
			
		||||
  inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
 | 
			
		||||
  inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
 | 
			
		||||
  inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
 | 
			
		||||
 | 
			
		||||
  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
 | 
			
		||||
  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
 | 
			
		||||
  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
 | 
			
		||||
  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
 | 
			
		||||
 | 
			
		||||
@@ -70,15 +75,14 @@ namespace Grid {
 | 
			
		||||
    inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
 | 
			
		||||
    inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
 | 
			
		||||
    inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
 | 
			
		||||
    inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
 | 
			
		||||
    // conj already supported for complex
 | 
			
		||||
    // conjugate already supported for complex
 | 
			
		||||
    
 | 
			
		||||
    inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
 | 
			
		||||
    inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
 | 
			
		||||
    inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
 | 
			
		||||
    inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
 | 
			
		||||
    inline ComplexF  adj(const ComplexF& r ){ return(conj(r)); }
 | 
			
		||||
    //conj already supported for complex
 | 
			
		||||
 | 
			
		||||
    //conjugate already supported for complex
 | 
			
		||||
 | 
			
		||||
    inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
 | 
			
		||||
    inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
 | 
			
		||||
 
 | 
			
		||||
@@ -7,8 +7,8 @@ namespace Grid {
 | 
			
		||||
  // LinearOperators Take a something and return a something.
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  //
 | 
			
		||||
  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
 | 
			
		||||
  //
 | 
			
		||||
  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
 | 
			
		||||
  //SBase
 | 
			
		||||
  //   i)  F(a x + b y) = aF(x) + b F(y).
 | 
			
		||||
  //  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
 | 
			
		||||
  //
 | 
			
		||||
@@ -25,12 +25,13 @@ namespace Grid {
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
 | 
			
		||||
    public:
 | 
			
		||||
      virtual RealD OpAndNorm(const Field &in, Field &out);
 | 
			
		||||
      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
 | 
			
		||||
      void AdjOp(const Field &in, Field &out) {
 | 
			
		||||
	Op(in,out);
 | 
			
		||||
      };
 | 
			
		||||
      void Op(const Field &in, Field &out) {
 | 
			
		||||
	OpAndNorm(in,out);
 | 
			
		||||
	double n1,n2;
 | 
			
		||||
	OpAndNorm(in,out,n1,n2);
 | 
			
		||||
      };
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
@@ -80,8 +81,8 @@ namespace Grid {
 | 
			
		||||
      Matrix &_Mat;
 | 
			
		||||
    public:
 | 
			
		||||
      HermitianOperator(Matrix &Mat): _Mat(Mat) {};
 | 
			
		||||
      RealD OpAndNorm(const Field &in, Field &out){
 | 
			
		||||
	return _Mat.MdagM(in,out);
 | 
			
		||||
      void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){
 | 
			
		||||
	return _Mat.MdagM(in,out,n1,n2);
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -18,6 +18,7 @@ public:
 | 
			
		||||
      std::cout << Tolerance<<std::endl;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
 | 
			
		||||
    void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){
 | 
			
		||||
 | 
			
		||||
      RealD cp,c,a,d,b,ssq,qq,b_pred;
 | 
			
		||||
@@ -61,21 +62,27 @@ public:
 | 
			
		||||
	
 | 
			
		||||
	Linop.OpAndNorm(p,mmp,d,qq);
 | 
			
		||||
	  
 | 
			
		||||
	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<std::endl;
 | 
			
		||||
      
 | 
			
		||||
	a      = c/d;
 | 
			
		||||
	b_pred = a*(a*qq-d)/c;
 | 
			
		||||
	
 | 
			
		||||
	cp = axpy_norm(r,mmp,r,-a);
 | 
			
		||||
	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 | 
			
		||||
 | 
			
		||||
	cp = axpy_norm(r,-a,mmp,r);
 | 
			
		||||
	b = cp/c;
 | 
			
		||||
	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
 | 
			
		||||
	
 | 
			
		||||
	// Fuse these loops ; should be really easy
 | 
			
		||||
	psi= a*p+psi;
 | 
			
		||||
	p  = p*b+r;
 | 
			
		||||
	  
 | 
			
		||||
	std::cout << "Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
			
		||||
	std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
			
		||||
 | 
			
		||||
	// Stopping condition
 | 
			
		||||
	if ( cp <= rsq ) { 
 | 
			
		||||
	  
 | 
			
		||||
	  Linop.Op(p,mmp);
 | 
			
		||||
	  Linop.Op(psi,mmp);
 | 
			
		||||
	  p=mmp-src;
 | 
			
		||||
	  
 | 
			
		||||
	  RealD mmpnorm = sqrt(norm2(mmp));
 | 
			
		||||
@@ -83,8 +90,11 @@ public:
 | 
			
		||||
	  RealD srcnorm = sqrt(norm2(src));
 | 
			
		||||
	  RealD resnorm = sqrt(norm2(p));
 | 
			
		||||
	  RealD true_residual = resnorm/srcnorm;
 | 
			
		||||
 | 
			
		||||
	  std::cout<<"ConjugateGradient: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
			
		||||
	  std::cout<<"ConjugateGradient: true   residual  is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl;
 | 
			
		||||
	  std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl;
 | 
			
		||||
	  return;
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -102,7 +102,7 @@ template <class arg> struct name\
 | 
			
		||||
 | 
			
		||||
GridUnopClass(UnarySub,-a);
 | 
			
		||||
GridUnopClass(UnaryAdj,adj(a));
 | 
			
		||||
GridUnopClass(UnaryConj,conj(a));
 | 
			
		||||
GridUnopClass(UnaryConj,conjugate(a));
 | 
			
		||||
GridUnopClass(UnaryTrace,trace(a));
 | 
			
		||||
GridUnopClass(UnaryTranspose,transpose(a));
 | 
			
		||||
 | 
			
		||||
@@ -178,7 +178,7 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con
 | 
			
		||||
 | 
			
		||||
GRID_DEF_UNOP(operator -,UnarySub);
 | 
			
		||||
GRID_DEF_UNOP(adj,UnaryAdj);
 | 
			
		||||
GRID_DEF_UNOP(conj,UnaryConj);
 | 
			
		||||
GRID_DEF_UNOP(conjugate,UnaryConj);
 | 
			
		||||
GRID_DEF_UNOP(trace,UnaryTrace);
 | 
			
		||||
GRID_DEF_UNOP(transpose,UnaryTranspose);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*lhs._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp+rhs._odata[ss]);
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
    }
 | 
			
		||||
    return norm2(ret);
 | 
			
		||||
  }
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
    }
 | 
			
		||||
    return norm2(ret); // FIXME implement parallel norm in ss loop
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -9,7 +9,7 @@ namespace Grid {
 | 
			
		||||
// Functionality:
 | 
			
		||||
//     -=,+=,*=,()
 | 
			
		||||
//     add,+,sub,-,mult,mac,*
 | 
			
		||||
//     adj,conj
 | 
			
		||||
//     adj,conjugate
 | 
			
		||||
//     real,imag
 | 
			
		||||
//     transpose,transposeIndex  
 | 
			
		||||
//     trace,traceIndex
 | 
			
		||||
 
 | 
			
		||||
@@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
        return ret;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){
 | 
			
		||||
    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
 | 
			
		||||
        Lattice<vobj> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
            ret._odata[ss] = conj(lhs._odata[ss]);
 | 
			
		||||
            ret._odata[ss] = conjugate(lhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
    };
 | 
			
		||||
 
 | 
			
		||||
@@ -98,26 +98,26 @@ template<class vtype,int N> inline void  timesMinusI(iMatrix<vtype,N> &ret,const
 | 
			
		||||
/////////////////////////////////////////////// 
 | 
			
		||||
// Conj function for scalar, vector, matrix
 | 
			
		||||
/////////////////////////////////////////////// 
 | 
			
		||||
template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
 | 
			
		||||
template<class vtype> inline iScalar<vtype> conjugate(const iScalar<vtype>&r)
 | 
			
		||||
{
 | 
			
		||||
    iScalar<vtype> ret;
 | 
			
		||||
    ret._internal = conj(r._internal);
 | 
			
		||||
    ret._internal = conjugate(r._internal);
 | 
			
		||||
    return ret;
 | 
			
		||||
}
 | 
			
		||||
template<class vtype,int N> inline iVector<vtype,N> conj(const iVector<vtype,N>&r)
 | 
			
		||||
template<class vtype,int N> inline iVector<vtype,N> conjugate(const iVector<vtype,N>&r)
 | 
			
		||||
{
 | 
			
		||||
  iVector<vtype,N> ret;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    ret._internal[i] = conj(r._internal[i]);
 | 
			
		||||
    ret._internal[i] = conjugate(r._internal[i]);
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&r)
 | 
			
		||||
template<class vtype,int N> inline iMatrix<vtype,N> conjugate(const iMatrix<vtype,N>&r)
 | 
			
		||||
{
 | 
			
		||||
  iMatrix<vtype,N> ret;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
  for(int j=0;j<N;j++){
 | 
			
		||||
    ret._internal[i][j] = conj(r._internal[i][j]);
 | 
			
		||||
    ret._internal[i][j] = conjugate(r._internal[i][j]);
 | 
			
		||||
  }}
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -21,7 +21,13 @@ const int WilsonMatrix::Tm = 7;
 | 
			
		||||
  class WilsonCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    int mu;
 | 
			
		||||
    
 | 
			
		||||
    int dag;
 | 
			
		||||
 | 
			
		||||
    WilsonCompressor(int _dag){
 | 
			
		||||
      mu=0;
 | 
			
		||||
      dag=_dag;
 | 
			
		||||
      assert((dag==0)||(dag==1));
 | 
			
		||||
    }
 | 
			
		||||
    void Point(int p) { 
 | 
			
		||||
      mu=p;
 | 
			
		||||
    };
 | 
			
		||||
@@ -29,7 +35,11 @@ const int WilsonMatrix::Tm = 7;
 | 
			
		||||
    vHalfSpinColourVector operator () (const vSpinColourVector &in)
 | 
			
		||||
    {
 | 
			
		||||
      vHalfSpinColourVector ret;
 | 
			
		||||
      switch(mu) {
 | 
			
		||||
      int mudag=mu;
 | 
			
		||||
      if (dag) {
 | 
			
		||||
	mudag=(mu+Nd)%(2*Nd);
 | 
			
		||||
      }
 | 
			
		||||
      switch(mudag) {
 | 
			
		||||
      case WilsonMatrix::Xp:
 | 
			
		||||
	spProjXp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
@@ -87,43 +97,52 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
 | 
			
		||||
 | 
			
		||||
RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out);
 | 
			
		||||
  return 0.0;
 | 
			
		||||
  Dhop(in,out,0);
 | 
			
		||||
  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
 | 
			
		||||
  return norm2(out);
 | 
			
		||||
}
 | 
			
		||||
RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out);
 | 
			
		||||
  return 0.0;
 | 
			
		||||
  Dhop(in,out,1);
 | 
			
		||||
  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
 | 
			
		||||
  return norm2(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out);
 | 
			
		||||
  Dhop(in,out,0);
 | 
			
		||||
  out = 0.5*out; // FIXME : scale factor in Dhop
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out);
 | 
			
		||||
  Dhop(in,out,1);
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out = (4.0+mass)*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  WilsonCompressor compressor;
 | 
			
		||||
  assert((dag==0) ||(dag==1));
 | 
			
		||||
 | 
			
		||||
  WilsonCompressor compressor(dag);
 | 
			
		||||
  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
@@ -140,13 +159,13 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    int ssu= ss;
 | 
			
		||||
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = Stencil._offsets [Xp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Xp];
 | 
			
		||||
    int idx = (Xp+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -154,18 +173,17 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
 | 
			
		||||
    //prefetch(Umu._odata[ssu](Yp));
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = Stencil._offsets [Yp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Yp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Yp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Yp];
 | 
			
		||||
    idx = (Yp+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -173,18 +191,16 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
 | 
			
		||||
    //    prefetch(Umu._odata[ssu](Zp));
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = Stencil._offsets [Zp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zp];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    idx = (Zp+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -192,18 +208,16 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
 | 
			
		||||
    //    prefetch(Umu._odata[ssu](Tp));
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = Stencil._offsets [Tp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tp];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    idx = (Tp+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -211,15 +225,15 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
 | 
			
		||||
    //    prefetch(Umu._odata[ssu](Xm));
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = Stencil._offsets [Xm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Xm];
 | 
			
		||||
    idx = (Xm+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
@@ -230,18 +244,18 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = Stencil._offsets [Ym][ss];
 | 
			
		||||
    local  = Stencil._is_local[Ym][ss];
 | 
			
		||||
    perm   = Stencil._permute[Ym][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Ym];
 | 
			
		||||
    idx = (Ym+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -249,17 +263,16 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = Stencil._offsets [Zm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zm];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    idx = (Zm+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -267,17 +280,16 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = Stencil._offsets [Tm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tm];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
    idx = (Tm+dag*4)%8;
 | 
			
		||||
    offset = Stencil._offsets [idx][ss];
 | 
			
		||||
    local  = Stencil._is_local[idx][ss];
 | 
			
		||||
    perm   = Stencil._permute[idx][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[idx];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
@@ -285,7 +297,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    vstream(out._odata[ss],result);
 | 
			
		||||
@@ -294,10 +306,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -45,10 +45,7 @@ namespace Grid {
 | 
			
		||||
      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
      // non-hermitian hopping term; half cb or both
 | 
			
		||||
      void Dhop(const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
      // m+4r -1/2 Dhop; both cb's
 | 
			
		||||
      void Dw(const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
 | 
			
		||||
      typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -32,7 +32,7 @@ namespace Grid {
 | 
			
		||||
        friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
 | 
			
		||||
        friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
 | 
			
		||||
        friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
 | 
			
		||||
        friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
 | 
			
		||||
        friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }
 | 
			
		||||
 | 
			
		||||
        //////////////////////////////////
 | 
			
		||||
        // Initialise to 1,0,i
 | 
			
		||||
@@ -166,11 +166,11 @@ namespace Grid {
 | 
			
		||||
	// all subtypes; may not be a good assumption, but could
 | 
			
		||||
	// add the vector width as a template param for BG/Q for example
 | 
			
		||||
	////////////////////////////////////////////////////////////////////
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void permute(vComplexD &y,vComplexD b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vComplexD>(y,b,perm);
 | 
			
		||||
	}
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
 | 
			
		||||
	{
 | 
			
		||||
	  Gmerge<vComplexD,ComplexD >(y,extracted);
 | 
			
		||||
@@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
        ////////////////////////
 | 
			
		||||
        // Conjugate
 | 
			
		||||
        ////////////////////////
 | 
			
		||||
        friend inline vComplexD conj(const vComplexD &in){
 | 
			
		||||
        friend inline vComplexD conjugate(const vComplexD &in){
 | 
			
		||||
            vComplexD ret ; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	    //	    addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
 | 
			
		||||
@@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
// REDUCE FIXME must be a cleaner implementation
 | 
			
		||||
       friend inline ComplexD Reduce(const vComplexD & in)
 | 
			
		||||
       { 
 | 
			
		||||
#if defined SSE4
 | 
			
		||||
         return ComplexD(in.v[0], in.v[1]); // inefficient
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 return ComplexD(in.v[0],in.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vComplexD v1;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 return ComplexD(v1.v[0],v1.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
	 //            return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
 | 
			
		||||
	 __attribute__ ((aligned(32))) double c_[4];
 | 
			
		||||
         _mm256_store_pd(c_,in.v);
 | 
			
		||||
	 return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
 | 
			
		||||
#endif 
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
 | 
			
		||||
	 return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
 | 
			
		||||
#endif 
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#endif
 | 
			
		||||
@@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
 | 
			
		||||
    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    typedef  vComplexD vDComplex;
 | 
			
		||||
 
 | 
			
		||||
@@ -47,7 +47,7 @@ namespace Grid {
 | 
			
		||||
        friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
 | 
			
		||||
        friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
 | 
			
		||||
        friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
 | 
			
		||||
        friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
 | 
			
		||||
        friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
 | 
			
		||||
        
 | 
			
		||||
        //////////////////////////////////
 | 
			
		||||
        // Initialise to 1,0,i
 | 
			
		||||
@@ -228,42 +228,25 @@ namespace Grid {
 | 
			
		||||
            ret.v = {a,b,a,b};
 | 
			
		||||
#endif
 | 
			
		||||
        }
 | 
			
		||||
       friend inline ComplexF Reduce(const vComplexF & in)
 | 
			
		||||
       {
 | 
			
		||||
	friend inline void permute(vComplexF &y,vComplexF b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vComplexF>(y,b,perm);
 | 
			
		||||
	}
 | 
			
		||||
	friend inline ComplexF Reduce(const vComplexF & in)
 | 
			
		||||
	{
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	   union {
 | 
			
		||||
	     cvec v1;    // SSE 4 x float vector
 | 
			
		||||
	     float f[4];  // scalar array of 4 floats
 | 
			
		||||
	   } u128;
 | 
			
		||||
	   u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
 | 
			
		||||
	   return ComplexF(u128.f[0], u128.f[1]);
 | 
			
		||||
	 vComplexF v1;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 return ComplexF(v1.v[0],v1.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX1
 | 
			
		||||
	   //it would be better passing 2 arguments to saturate the vector lanes
 | 
			
		||||
	   union {
 | 
			
		||||
	     __m256 v1;    
 | 
			
		||||
	     float f[8];  
 | 
			
		||||
	   } u256;
 | 
			
		||||
	   //SWAP lanes
 | 
			
		||||
	   // FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large
 | 
			
		||||
	   __m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1);
 | 
			
		||||
	   __m256 t1 = _mm256_permute_ps(in.v  , 0b11011000);//real (0,2,1,3)
 | 
			
		||||
	   __m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2)
 | 
			
		||||
	   t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1)
 | 
			
		||||
	   t1 = _mm256_hadd_ps(t0,t0);
 | 
			
		||||
	   u256.v1 = _mm256_hadd_ps(t1, t1);
 | 
			
		||||
	   return ComplexF(u256.f[0], u256.f[4]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX2
 | 
			
		||||
	   union {
 | 
			
		||||
	     __m256 v1;    
 | 
			
		||||
	     float f[8];  
 | 
			
		||||
	   } u256;
 | 
			
		||||
	   const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0);
 | 
			
		||||
	   __m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask);
 | 
			
		||||
	   __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
 | 
			
		||||
	   u256.v1 = _mm256_hadd_ps(tmp2, tmp2);
 | 
			
		||||
	   return ComplexF(u256.f[0], u256.f[4]);
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vComplexF v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); // avx 256; quad complex single
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return ComplexF(v1.v[0],v1.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
 | 
			
		||||
@@ -345,13 +328,10 @@ namespace Grid {
 | 
			
		||||
        // Conjugate
 | 
			
		||||
        ///////////////////////
 | 
			
		||||
								     
 | 
			
		||||
        friend inline vComplexF conj(const vComplexF &in){
 | 
			
		||||
        friend inline vComplexF conjugate(const vComplexF &in){
 | 
			
		||||
            vComplexF ret ; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	    cvec tmp;
 | 
			
		||||
	    tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
 | 
			
		||||
	    ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	    ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
 | 
			
		||||
@@ -433,10 +413,6 @@ namespace Grid {
 | 
			
		||||
            return *this;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      friend inline void permute(vComplexF &y,vComplexF b,int perm)
 | 
			
		||||
      {
 | 
			
		||||
	Gpermute<vComplexF>(y,b,perm);
 | 
			
		||||
      }
 | 
			
		||||
      /*
 | 
			
		||||
      friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
 | 
			
		||||
      {
 | 
			
		||||
@@ -460,7 +436,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) 
 | 
			
		||||
    {
 | 
			
		||||
      return conj(l)*r; 
 | 
			
		||||
      return conjugate(l)*r; 
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    inline void zeroit(vComplexF &z){ vzero(z);}
 | 
			
		||||
 
 | 
			
		||||
@@ -117,7 +117,7 @@ namespace Grid {
 | 
			
		||||
        };
 | 
			
		||||
        
 | 
			
		||||
        ///////////////////////////////////////////////
 | 
			
		||||
        // mult, sub, add, adj,conj, mac functions
 | 
			
		||||
        // mult, sub, add, adj,conjugate, mac functions
 | 
			
		||||
        ///////////////////////////////////////////////
 | 
			
		||||
        friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
 | 
			
		||||
        friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
 | 
			
		||||
 
 | 
			
		||||
@@ -26,7 +26,7 @@ namespace Grid {
 | 
			
		||||
        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
 | 
			
		||||
        friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
 | 
			
		||||
        friend inline vRealD adj(const vRealD &in) { return in; }
 | 
			
		||||
        friend inline vRealD conj(const vRealD &in){ return in; }
 | 
			
		||||
        friend inline vRealD conjugate(const vRealD &in){ return in; }
 | 
			
		||||
        
 | 
			
		||||
        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
 | 
			
		||||
#if defined (AVX1) || defined (SSE4)
 | 
			
		||||
@@ -112,11 +112,12 @@ namespace Grid {
 | 
			
		||||
	// all subtypes; may not be a good assumption, but could
 | 
			
		||||
	// add the vector width as a template param for BG/Q for example
 | 
			
		||||
	////////////////////////////////////////////////////////////////////
 | 
			
		||||
	/*
 | 
			
		||||
 | 
			
		||||
	friend inline void permute(vRealD &y,vRealD b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vRealD>(y,b,perm);
 | 
			
		||||
	}
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
 | 
			
		||||
	{
 | 
			
		||||
	  Gmerge<vRealD,RealD >(y,extracted);
 | 
			
		||||
@@ -209,48 +210,26 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
       friend inline RealD Reduce(const vRealD & in)
 | 
			
		||||
       {
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
	 // FIXME Hack
 | 
			
		||||
	 const RealD * ptr =(const RealD *)  ∈
 | 
			
		||||
	 RealD ret = 0; 
 | 
			
		||||
	 for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
	   ret = ret+ptr[i];
 | 
			
		||||
	 }
 | 
			
		||||
	 return ret;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 vRealD v1;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired real double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 return RealD(v1.v[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
	 typedef union  {
 | 
			
		||||
	   uint64_t l;
 | 
			
		||||
	   double   d;
 | 
			
		||||
	 } my_conv_t;
 | 
			
		||||
	 my_conv_t converter;
 | 
			
		||||
// more reduce_add
 | 
			
		||||
/*
 | 
			
		||||
            __attribute__ ((aligned(32))) double c_[16];
 | 
			
		||||
	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
 | 
			
		||||
            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
 | 
			
		||||
  	             tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp  = 3+2,1+0,1+0,3+2
 | 
			
		||||
                    hadd = _mm256_hadd_pd(tmp,tmp);               // tmp  = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
 | 
			
		||||
                    _mm256_store_pd(c_,hadd);<3B>
 | 
			
		||||
             return c[0]
 | 
			
		||||
*/
 | 
			
		||||
	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
 | 
			
		||||
            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
 | 
			
		||||
                    hadd = _mm256_hadd_pd(hadd,hadd);             // hadd = 1+0+3+2...
 | 
			
		||||
		    converter.l = _mm256_extract_epi64((ivec)hadd,0);
 | 
			
		||||
            return converter.d;
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vRealD v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // avx 256; quad double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return v1.v[0];
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return _mm512_reduce_add_pd(in.v);
 | 
			
		||||
/*
 | 
			
		||||
            __attribute__ ((aligned(32))) double c_[8];
 | 
			
		||||
           _mm512_store_pd(c_,in.v);
 | 
			
		||||
            return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
 | 
			
		||||
*/
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#endif
 | 
			
		||||
        }
 | 
			
		||||
       }
 | 
			
		||||
 | 
			
		||||
        // *=,+=,-= operators
 | 
			
		||||
        inline vRealD &operator *=(const vRealD &r) {
 | 
			
		||||
@@ -270,7 +249,7 @@ namespace Grid {
 | 
			
		||||
        static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
 | 
			
		||||
    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
 | 
			
		||||
    inline void zeroit(vRealD &z){ vzero(z);}
 | 
			
		||||
 | 
			
		||||
    inline vRealD outerProduct(const vRealD &l, const vRealD& r)
 | 
			
		||||
 
 | 
			
		||||
@@ -92,13 +92,13 @@ namespace Grid {
 | 
			
		||||
        };
 | 
			
		||||
        
 | 
			
		||||
        ///////////////////////////////////////////////
 | 
			
		||||
        // mult, sub, add, adj,conj, mac functions
 | 
			
		||||
        // mult, sub, add, adj,conjugate, mac functions
 | 
			
		||||
        ///////////////////////////////////////////////
 | 
			
		||||
        friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
 | 
			
		||||
        friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
 | 
			
		||||
        friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
 | 
			
		||||
        friend inline vRealF adj(const vRealF &in) { return in; }
 | 
			
		||||
        friend inline vRealF conj(const vRealF &in){ return in; }
 | 
			
		||||
        friend inline vRealF conjugate(const vRealF &in){ return in; }
 | 
			
		||||
 | 
			
		||||
        friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
 | 
			
		||||
#if defined (AVX1) || defined (SSE4)
 | 
			
		||||
@@ -133,11 +133,12 @@ namespace Grid {
 | 
			
		||||
	// all subtypes; may not be a good assumption, but could
 | 
			
		||||
	// add the vector width as a template param for BG/Q for example
 | 
			
		||||
	////////////////////////////////////////////////////////////////////
 | 
			
		||||
	/*
 | 
			
		||||
 | 
			
		||||
	friend inline void permute(vRealF &y,vRealF b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vRealF>(y,b,perm);
 | 
			
		||||
	}
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
 | 
			
		||||
	{
 | 
			
		||||
	  Gmerge<vRealF,RealF >(y,extracted);
 | 
			
		||||
@@ -155,7 +156,6 @@ namespace Grid {
 | 
			
		||||
	  Gextract<vRealF,RealF>(y,extracted);
 | 
			
		||||
	}
 | 
			
		||||
	*/
 | 
			
		||||
 | 
			
		||||
        
 | 
			
		||||
        /////////////////////////////////////////////////////
 | 
			
		||||
        // Broadcast a value across Nsimd copies.
 | 
			
		||||
@@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){
 | 
			
		||||
        }
 | 
			
		||||
       friend inline RealF Reduce(const vRealF & in)
 | 
			
		||||
       {
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
	 // FIXME Hack
 | 
			
		||||
	 const RealF * ptr = (const RealF *) ∈
 | 
			
		||||
	 RealF ret = 0; 
 | 
			
		||||
	 for(int i=0;i<vRealF::Nsimd();i++){
 | 
			
		||||
	   ret = ret+ptr[i];
 | 
			
		||||
	 }
 | 
			
		||||
	 return ret;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 vRealF v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; quad single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return v1.v[0];
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
            __attribute__ ((aligned(32))) float c_[16];
 | 
			
		||||
            __m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
 | 
			
		||||
            __m256 hadd = _mm256_hadd_ps(in.v,tmp);
 | 
			
		||||
                   tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
 | 
			
		||||
                   hadd = _mm256_hadd_ps(tmp,tmp);
 | 
			
		||||
                  _mm256_store_ps(c_,hadd);
 | 
			
		||||
         return (float)c_[0];
 | 
			
		||||
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vRealF v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // avx 256; octo-double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 permute(v2,v1,2); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return v1.v[0];
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return _mm512_reduce_add_ps(in.v);
 | 
			
		||||
/*
 | 
			
		||||
             __attribute__ ((aligned(64))) float c_[16];
 | 
			
		||||
             _mm512_store_ps(c_,in.v);
 | 
			
		||||
             return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
 | 
			
		||||
                    +c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
 | 
			
		||||
*/
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#endif
 | 
			
		||||
@@ -291,7 +284,7 @@ friend inline void vstore(const vRealF &ret, float *a){
 | 
			
		||||
    public:
 | 
			
		||||
        static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
 | 
			
		||||
    };
 | 
			
		||||
    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
 | 
			
		||||
    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
 | 
			
		||||
    inline void  zeroit(vRealF &z){ vzero(z);}
 | 
			
		||||
 | 
			
		||||
    inline vRealF outerProduct(const vRealF &l, const vRealF& r)
 | 
			
		||||
 
 | 
			
		||||
@@ -79,7 +79,7 @@ namespace Grid {
 | 
			
		||||
    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
 | 
			
		||||
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
 | 
			
		||||
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 | 
			
		||||
    friend inline Grid_simd adj(const Grid_simd &in){ return conj(in); }
 | 
			
		||||
    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
 | 
			
		||||
        
 | 
			
		||||
    //////////////////////////////////
 | 
			
		||||
    // Initialise to 1,0,i
 | 
			
		||||
@@ -193,7 +193,7 @@ namespace Grid {
 | 
			
		||||
    // Conjugate
 | 
			
		||||
    ///////////////////////
 | 
			
		||||
								     
 | 
			
		||||
    friend inline Grid_simd  conj(const Grid_simd  &in){
 | 
			
		||||
    friend inline Grid_simd  conjugate(const Grid_simd  &in){
 | 
			
		||||
      Grid_simd  ret ; vzero(ret);
 | 
			
		||||
      // FIXME add operator
 | 
			
		||||
      return ret;
 | 
			
		||||
@@ -265,7 +265,7 @@ namespace Grid {
 | 
			
		||||
  template<class scalar_type, class vector_type > 
 | 
			
		||||
    inline Grid_simd< scalar_type, vector_type>  innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r) 
 | 
			
		||||
  {
 | 
			
		||||
    return conj(l)*r; 
 | 
			
		||||
    return conjugate(l)*r; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class scalar_type, class vector_type >
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
#!/bin/bash 
 | 
			
		||||
#!/bin/bash  -e
 | 
			
		||||
 | 
			
		||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi"
 | 
			
		||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
 | 
			
		||||
EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
 | 
			
		||||
BLACK="\033[30m"
 | 
			
		||||
RED="\033[31m"
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx"
 | 
			
		||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"
 | 
			
		||||
 | 
			
		||||
for D in $DIRS
 | 
			
		||||
do
 | 
			
		||||
 
 | 
			
		||||
@@ -34,6 +34,9 @@ icpc-avx512)
 | 
			
		||||
icpc-mic)
 | 
			
		||||
  CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3  -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
			
		||||
  ;;
 | 
			
		||||
clang-sse)
 | 
			
		||||
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
			
		||||
  ;;
 | 
			
		||||
clang-avx)
 | 
			
		||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
			
		||||
  ;;
 | 
			
		||||
@@ -47,16 +50,16 @@ clang-avx2-openmp)
 | 
			
		||||
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
			
		||||
  ;;
 | 
			
		||||
clang-avx-openmp-mpi)
 | 
			
		||||
CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
;;
 | 
			
		||||
clang-avx2-openmp-mpi)
 | 
			
		||||
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
;;
 | 
			
		||||
clang-avx-mpi)
 | 
			
		||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx "  LIBS="-lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
;;
 | 
			
		||||
clang-avx2-mpi)
 | 
			
		||||
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx "  LIBS="-lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
;;
 | 
			
		||||
esac
 | 
			
		||||
echo -e $NORMAL
 | 
			
		||||
 
 | 
			
		||||
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
//     -=,+=,*=,()
 | 
			
		||||
//     add,+,sub,-,mult,mac,*
 | 
			
		||||
//     adj,conj
 | 
			
		||||
//     adj,conjugate
 | 
			
		||||
//     real,imag
 | 
			
		||||
//     transpose,transposeIndex  
 | 
			
		||||
//     trace,traceIndex
 | 
			
		||||
@@ -437,7 +437,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
        for(int r=0;r<3;r++){
 | 
			
		||||
        for(int c=0;c<3;c++){
 | 
			
		||||
            diff =shifted1()()(r,c)-shifted2()()(r,c);
 | 
			
		||||
            nn=real(conj(diff)*diff);
 | 
			
		||||
            nn=real(conjugate(diff)*diff);
 | 
			
		||||
            if ( nn > 0 )
 | 
			
		||||
                cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
 | 
			
		||||
                    <<shifted1()()(r,c)<<" "<<shifted2()()(r,c)
 | 
			
		||||
@@ -451,7 +451,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
        for(int r=0;r<3;r++){
 | 
			
		||||
        for(int c=0;c<3;c++){
 | 
			
		||||
            diff =shifted3()()(r,c)-shifted2()()(r,c);
 | 
			
		||||
            nn=real(conj(diff)*diff);
 | 
			
		||||
            nn=real(conjugate(diff)*diff);
 | 
			
		||||
            if ( nn > 0 )
 | 
			
		||||
                cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
 | 
			
		||||
                <<shifted3()()(r,c)<<" "<<shifted2()()(r,c)
 | 
			
		||||
@@ -468,7 +468,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
        for(int r=0;r<Nc;r++){
 | 
			
		||||
        for(int c=0;c<Nc;c++){
 | 
			
		||||
            diff =foobar2()()(r,c)-foobar1()()(r,c);
 | 
			
		||||
            nrm = nrm + real(conj(diff)*diff);
 | 
			
		||||
            nrm = nrm + real(conjugate(diff)*diff);
 | 
			
		||||
        }}
 | 
			
		||||
    }}}}
 | 
			
		||||
	if( Fine.IsBoss() ){
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										52
									
								
								tests/Grid_rng_fixed.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								tests/Grid_rng_fixed.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,52 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <parallelIO/GridNerscIO.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
     
 | 
			
		||||
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
 | 
			
		||||
  GridSerialRNG            fsRNG;  fsRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  GridParallelRNG          fpRNG(&Grid);  fpRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  vComplexF tmp; random(fsRNG,tmp);
 | 
			
		||||
  std::cout<<"Random vComplexF (fixed seed)\n"<< tmp<<std::endl;
 | 
			
		||||
  std::cout<<"conjugate(tmp)\n"<< conjugate(tmp)<<std::endl;
 | 
			
		||||
  std::cout<<"conjugate(tmp)*tmp\n"<< conjugate(tmp)*tmp<<std::endl;
 | 
			
		||||
  std::cout<<"innerProduct"<< innerProduct(tmp,tmp)<<std::endl;
 | 
			
		||||
  std::cout<<"Reduce(innerProduct)"<< Reduce(innerProduct(tmp,tmp))<<std::endl;
 | 
			
		||||
 | 
			
		||||
  SpinMatrix rnd  ; 
 | 
			
		||||
  random(fsRNG,rnd);
 | 
			
		||||
  std::cout<<"Random Spin Matrix (fixed seed)\n"<< rnd<<std::endl;
 | 
			
		||||
 | 
			
		||||
  SpinVector rv; 
 | 
			
		||||
  random(fsRNG,rv);
 | 
			
		||||
  std::cout<<"Random Spin Vector (fixed seed)\n"<< rv<<std::endl;
 | 
			
		||||
 | 
			
		||||
  gaussian(fsRNG,rv);
 | 
			
		||||
  std::cout<<"Gaussian Spin Vector (fixed seed)\n"<< rv<<std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeColourVector lcv(&Grid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src(&Grid); random(fpRNG,src);
 | 
			
		||||
  std::cout << "src norm : " << norm2(src)<<std::endl;
 | 
			
		||||
  std::cout << "src " << src<<std::endl;
 | 
			
		||||
 | 
			
		||||
  random(fpRNG,lcv);
 | 
			
		||||
  std::cout<<"Random Lattice Colour Vector (fixed seed)\n"<< lcv<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -26,7 +26,7 @@ public:
 | 
			
		||||
class funcConj {
 | 
			
		||||
public:
 | 
			
		||||
  funcConj() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);}
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);}
 | 
			
		||||
  std::string name(void) const { return std::string("Conj"); }
 | 
			
		||||
};
 | 
			
		||||
class funcAdj {
 | 
			
		||||
@@ -49,6 +49,34 @@ public:
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);}
 | 
			
		||||
  std::string name(void) const { return std::string("timesMinusI"); }
 | 
			
		||||
};
 | 
			
		||||
class funcInnerProduct {
 | 
			
		||||
public:
 | 
			
		||||
  funcInnerProduct() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = innerProduct(i1,i2);}
 | 
			
		||||
  std::string name(void) const { return std::string("innerProduct"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// FIXME still to test:
 | 
			
		||||
//
 | 
			
		||||
//  innerProduct,
 | 
			
		||||
//  norm2, 
 | 
			
		||||
//  Reduce,
 | 
			
		||||
//
 | 
			
		||||
//  mac,mult,sub,add, vone,vzero,vcomplex_i, =zero,
 | 
			
		||||
//  vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar
 | 
			
		||||
//  unary -,
 | 
			
		||||
//  *= , -=, +=
 | 
			
		||||
//  outerproduct, 
 | 
			
		||||
//  zeroit
 | 
			
		||||
//  permute
 | 
			
		||||
 | 
			
		||||
class funcReduce {
 | 
			
		||||
public:
 | 
			
		||||
  funcReduce() {};
 | 
			
		||||
template<class reduce,class vec>    void vfunc(reduce &rr,vec &i1,vec &i2)   const { rr = Reduce(i1);}
 | 
			
		||||
template<class reduce,class scal>   void sfunc(reduce &rr,scal &i1,scal &i2) const { rr = i1;}
 | 
			
		||||
  std::string name(void) const { return std::string("Reduce"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class scal, class vec,class functor > 
 | 
			
		||||
void Tester(const functor &func)
 | 
			
		||||
@@ -96,8 +124,59 @@ void Tester(const functor &func)
 | 
			
		||||
      ok++;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( ok==0 ) std::cout << " OK!" <<std::endl;
 | 
			
		||||
  if ( ok==0 ) {
 | 
			
		||||
    std::cout << " OK!" <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  assert(ok==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class reduced,class scal, class vec,class functor > 
 | 
			
		||||
void ReductionTester(const functor &func)
 | 
			
		||||
{
 | 
			
		||||
  GridSerialRNG          sRNG;
 | 
			
		||||
  sRNG.SeedRandomDevice();
 | 
			
		||||
  
 | 
			
		||||
  int Nsimd = vec::Nsimd();
 | 
			
		||||
 | 
			
		||||
  std::vector<scal> input1(Nsimd);
 | 
			
		||||
  std::vector<scal> input2(Nsimd);
 | 
			
		||||
  reduced result(0);
 | 
			
		||||
  reduced reference(0);
 | 
			
		||||
  reduced tmp;
 | 
			
		||||
 | 
			
		||||
  std::vector<vec,alignedAllocator<vec> > buf(3);
 | 
			
		||||
  vec & v_input1 = buf[0];
 | 
			
		||||
  vec & v_input2 = buf[1];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    random(sRNG,input1[i]);
 | 
			
		||||
    random(sRNG,input2[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  merge<vec,scal>(v_input1,input1);
 | 
			
		||||
  merge<vec,scal>(v_input2,input2);
 | 
			
		||||
 | 
			
		||||
  func.template vfunc<reduced,vec>(result,v_input1,v_input2);
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++) {
 | 
			
		||||
    func.template sfunc<reduced,scal>(tmp,input1[i],input2[i]);
 | 
			
		||||
    reference+=tmp;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout << " " << func.name()<<std::endl;
 | 
			
		||||
 | 
			
		||||
  int ok=0;
 | 
			
		||||
  if ( abs(reference-result)/abs(reference) > 1.0e-6 ){ // rounding is possible for reduce order
 | 
			
		||||
    std::cout<< "*****" << std::endl;
 | 
			
		||||
    std::cout<< abs(reference-result) << " " <<reference<< " " << result<<std::endl;
 | 
			
		||||
    ok++;
 | 
			
		||||
  }
 | 
			
		||||
  if ( ok==0 ) {
 | 
			
		||||
    std::cout << " OK!" <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  assert(ok==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -127,6 +206,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcTimes());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcConj());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcAdj());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vComplexD "<<std::endl;
 | 
			
		||||
@@ -140,6 +221,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcTimes());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcConj());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcAdj());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealF "<<std::endl;
 | 
			
		||||
@@ -150,6 +233,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Tester<RealF,vRealF>(funcMinus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcTimes());
 | 
			
		||||
  Tester<RealF,vRealF>(funcAdj());
 | 
			
		||||
  Tester<RealF,vRealF>(funcConj());
 | 
			
		||||
  Tester<RealF,vRealF>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<RealF,RealF,vRealF>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealD "<<std::endl;
 | 
			
		||||
@@ -159,6 +245,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Tester<RealD,vRealD>(funcMinus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcTimes());
 | 
			
		||||
  Tester<RealD,vRealD>(funcAdj());
 | 
			
		||||
  Tester<RealD,vRealD>(funcConj());
 | 
			
		||||
  Tester<RealD,vRealD>(funcInnerProduct());
 | 
			
		||||
  ReductionTester<RealD,RealD,vRealD>(funcReduce());
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
	  for(int r=0;r<3;r++){
 | 
			
		||||
	  for(int c=0;c<3;c++){
 | 
			
		||||
            diff =check()()(r,c)-bar()()(r,c);
 | 
			
		||||
            double nn=real(conj(diff)*diff);
 | 
			
		||||
            double nn=real(conjugate(diff)*diff);
 | 
			
		||||
            if ( nn > 0){
 | 
			
		||||
	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
 | 
			
		||||
		     coor[0],coor[1],coor[2],coor[3],r,c,
 | 
			
		||||
@@ -103,8 +103,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
		     real(bar()()(r,c))
 | 
			
		||||
		     );
 | 
			
		||||
	    }
 | 
			
		||||
	    snrmC=snrmC+real(conj(check()()(r,c))*check()()(r,c));
 | 
			
		||||
	    snrmB=snrmB+real(conj(bar()()(r,c))*bar()()(r,c));
 | 
			
		||||
	    snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
 | 
			
		||||
	    snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
 | 
			
		||||
	    snrm=snrm+nn;
 | 
			
		||||
	  }}
 | 
			
		||||
	 
 | 
			
		||||
 
 | 
			
		||||
@@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 | 
			
		||||
#
 | 
			
		||||
# Test code
 | 
			
		||||
#
 | 
			
		||||
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez
 | 
			
		||||
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez Grid_rng_fixed
 | 
			
		||||
 | 
			
		||||
Grid_main_SOURCES = Grid_main.cc
 | 
			
		||||
Grid_main_LDADD = -lGrid
 | 
			
		||||
@@ -13,6 +13,9 @@ Grid_main_LDADD = -lGrid
 | 
			
		||||
Grid_rng_SOURCES = Grid_rng.cc
 | 
			
		||||
Grid_rng_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_rng_fixed_SOURCES = Grid_rng_fixed.cc
 | 
			
		||||
Grid_rng_fixed_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_remez_SOURCES = Grid_remez.cc
 | 
			
		||||
Grid_remez_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -186,7 +186,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
	  for(int r=0;r<3;r++){
 | 
			
		||||
	  for(int c=0;c<3;c++){
 | 
			
		||||
            diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
 | 
			
		||||
            double nn=real(conj(diff)*diff);
 | 
			
		||||
            double nn=real(conjugate(diff)*diff);
 | 
			
		||||
            if ( nn > 0 ){
 | 
			
		||||
	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
 | 
			
		||||
		     coor[0],coor[1],coor[2],coor[3],r,c,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user