Merge branch 'develop' of github.com:paboyle/Grid into feature/bgq

2026-07-17 15:43:27 +01:00 · 2016-09-16 15:07:54 +01:00
parent 4d11a6f5f2 5df5d52d41
commit 6dd75ad9e5
11 changed files with 459 additions and 661 deletions
@@ -4,7 +4,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 echo "-- deploying Eigen source..."
-wget ${EIGEN_URL}
+wget ${EIGEN_URL} --no-check-certificate
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
@@ -10,8 +10,19 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 AC_LANG(C++)
 CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
 ############ openmp  ###############
 AC_OPENMP
 ac_openmp=no
 if test "${OPENMP_CXXFLAGS}X" != "X"; then
 ac_openmp=yes
 AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
 AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
 fi
 ############ libtool ###############
 LT_INIT
 ############### Checks for header files
@@ -29,7 +40,7 @@ AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T
-############### Options
+############### GMP and MPFR #################
 AC_ARG_WITH([gmp],
    [AS_HELP_STRING([--with-gmp=prefix],
    [try this for a non-standard install prefix of the GMP library])],
@@ -40,6 +51,8 @@ AC_ARG_WITH([mpfr],
    [try this for a non-standard install prefix of the MPFR library])],
    [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
 ################## lapack ####################
 AC_ARG_ENABLE([lapack],
    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
    [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
@@ -55,14 +68,27 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac
-AC_CHECK_LIB([fftw3],[fft_init_threads],
+################## FFTW3 ####################
 AC_ARG_WITH([fftw],    
            [AS_HELP_STRING([--with-fftw=prefix],
            [try this for a non-standard install prefix of the FFTW3 library])],
            [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
            [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
 #
 # What about the MKL library replacement for fftw3 ?  How do we know if fftw_execute
 # can be found in MKL? 
 #
 AC_CHECK_LIB([fftw3],[fftw_execute],
 	[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
        [ac_fftw=no])
 case ${ac_fftw} in
    no)
        echo WARNING libfftw3 not found FFT routines will not work
        ;;
    yes)
-        AM_LDFLAGS="-lfftw3 $AM_LDFLAGS"
+        AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f"
 esac
@@ -307,19 +333,18 @@ Summary of configuration for $PACKAGE v$VERSION
 - compiler version              : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
 - SIMD                          : ${ac_SIMD}
- communications type           : ${ac_COMMS}
+- Threading                     : ${ac_openmp} 
- default precision             : ${ac_PRECISION}
+- Communications type           : ${ac_COMMS}
 - Default precision             : ${ac_PRECISION}
 - RNG choice                    : ${ac_RNG} 
 - GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 - LAPACK                        : ${ac_LAPACK}
 - FFTW                          : ${ac_fftw}
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
- CXXFLAGS: 
+- CXXFLAGS:  "${AM_CXXFLAGS} ${CXXFLAGS}"
-`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+- LDFLAGS:   "${AM_LDFLAGS} ${LDFLAGS}"
- LDFLAGS:
+- LIBS:      "${LIBS} "
 `echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
 - LIBS:
 `echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
 -------------------------------------------------------
 "
@@ -1,3 +1,4 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -28,11 +29,75 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
-#include <Grid/fftw/fftw3.h>
+#ifdef HAVE_FFTW	
-
+#include <fftw3.h>
 #endif
 namespace Grid {
-  
+  template<class scalar> struct FFTW { };
 #ifdef HAVE_FFTW	
  template<> struct FFTW<ComplexD> {
  public:
    typedef fftw_complex FFTW_scalar;
    typedef fftw_plan    FFTW_plan;
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
 					FFTW_scalar *in, const int *inembed,		
 					int istride, int idist,		
 					FFTW_scalar *out, const int *onembed,		
 					int ostride, int odist,		
 					int sign, unsigned flags) {
      return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
    }	  
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
      ::fftw_flops(p,add,mul,fmas);
    }
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
      ::fftw_execute_dft(p,in,out);
    }
    inline static void fftw_destroy_plan(const FFTW_plan p) {
      ::fftw_destroy_plan(p);
    }
  };
  template<> struct FFTW<ComplexF> {
  public:
    typedef fftwf_complex FFTW_scalar;
    typedef fftwf_plan    FFTW_plan;
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
 					FFTW_scalar *in, const int *inembed,		
 					int istride, int idist,		
 					FFTW_scalar *out, const int *onembed,		
 					int ostride, int odist,		
 					int sign, unsigned flags) {
      return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
    }	  
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
      ::fftwf_flops(p,add,mul,fmas);
    }
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
      ::fftwf_execute_dft(p,in,out);
    }
    inline static void fftw_destroy_plan(const FFTW_plan p) {
      ::fftwf_destroy_plan(p);
    }
  };
 #endif
 #ifndef FFTW_FORWARD
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #endif
  class FFT { 
  private:
@@ -40,6 +105,10 @@ namespace Grid {
    GridCartesian *sgrid;
    int Nd;
    double flops;
    double flops_call;
    uint64_t usec;
    std::vector<int> dimensions;
    std::vector<int> processors;
    std::vector<int> processor_coor;
@@ -49,6 +118,9 @@ namespace Grid {
    static const int forward=FFTW_FORWARD;
    static const int backward=FFTW_BACKWARD;
    double Flops(void) {return flops;}
    double MFlops(void) {return flops/usec;}
    FFT ( GridCartesian * grid ) : 
      vgrid(grid),
      Nd(grid->_ndimension),
@@ -56,6 +128,8 @@ namespace Grid {
      processors(grid->_processors),
      processor_coor(grid->_processor_coor)
    {
      flops=0;
      usec =0;
      std::vector<int> layout(Nd,1);
      sgrid = new GridCartesian(dimensions,layout,processors);
    };
@@ -75,55 +149,62 @@ namespace Grid {
      std::vector<int> layout(Nd,1);
      std::vector<int> pencil_gd(vgrid->_fdimensions);
      std::vector<int> pencil_ld(processors);
      pencil_gd[dim] = G*processors[dim];    
      pencil_ld[dim] = G*processors[dim];    
      // Pencil global vol LxLxGxLxL per node
      GridCartesian pencil_g(pencil_gd,layout,processors);
      GridCartesian pencil_l(pencil_ld,layout,processors);
      // Construct pencils
      typedef typename vobj::scalar_object sobj;
      typedef typename sobj::scalar_type   scalar;
      Lattice<vobj> ssource(vgrid); ssource =source;
      Lattice<sobj> pgsource(&pencil_g);
-      Lattice<sobj> pgresult(&pencil_g);
+      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
-      Lattice<sobj> plsource(&pencil_l);
+
-      Lattice<sobj> plresult(&pencil_l);
+#ifndef HAVE_FFTW	
      assert(0);
 #else 
      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
      {
-	assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
+	int Ncomp = sizeof(sobj)/sizeof(scalar);
-	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+	int Nlow  = 1;
-	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+	for(int d=0;d<dim;d++){
 	  Nlow*=vgrid->_ldimensions[d];
 	}
-	int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
+	int rank = 1;  /* 1d transforms */
 	int rank = 1;  /* not 2: we are computing 1d transforms */
 	int n[] = {G}; /* 1d transforms of length G */
 	int howmany = Ncomp;
 	int odist,idist,istride,ostride;
-	idist   = odist   = 1;
+	idist   = odist   = 1;          /* Distance between consecutive FT's */
-	istride = ostride = Ncomp; /* distance between two elements in the same column */
+	istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
 	int *inembed = n, *onembed = n;
 	fftw_complex *in = (fftw_complex *)&plsource._odata[0];
 	fftw_complex *out= (fftw_complex *)&plresult._odata[0];
 	int sign = FFTW_FORWARD;
 	if (inverse) sign = FFTW_BACKWARD;
-#ifdef HAVE_FFTW
+	FFTW_plan p;
-	fftw_plan p = fftw_plan_many_dft(rank,n,howmany,
+	{
-					 in,inembed,
+	  FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
-					 istride,idist,
+	  FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
-					 out,onembed,
+	  p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
-					 ostride, odist,
+					       in,inembed,
-					 sign,FFTW_ESTIMATE);
+					       istride,idist,
-#else 
+					       out,onembed,
-	fftw_plan p ;
+					       ostride, odist,
-	assert(0);
+					       sign,FFTW_ESTIMATE);
-#endif
+	}
 	double add,mul,fma;
 	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
 	flops_call = add+mul+2.0*fma;
 	GridStopWatch timer;
 	// Barrel shift and collect global pencil
 	for(int p=0;p<processors[dim];p++) { 
@@ -146,43 +227,45 @@ namespace Grid {
 	}
 	// Loop over orthog coords
-	for(int idx=0;idx<sgrid->lSites();idx++) { 
+	int NN=pencil_g.lSites();
 	GridStopWatch Timer;
 	Timer.Start();
 PARALLEL_FOR_LOOP
 	for(int idx=0;idx<NN;idx++) { 
 	  std::vector<int> pcoor(Nd,0);
 	  std::vector<int> lcoor(Nd);
-	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
 	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-	  
+	    FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
-	    // Project to local pencil array
+	    FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
-	    for(int l=0;l<G;l++){
+	    FFTW<scalar>::fftw_execute_dft(p,in,out);
 	      sobj s;
 	      pcoor[dim]=l;
 	      lcoor[dim]=l;
 	      peekLocalSite(s,pgsource,lcoor);
 	      pokeLocalSite(s,plsource,pcoor);
 	    }
 	    // FFT the pencil
 #ifdef HAVE_FFTW
 	    fftw_execute(p);
 #endif
 	    // Extract the result
 	    for(int l=0;l<L;l++){
 	      sobj s;
 	      int p = processor_coor[dim];
 	      lcoor[dim] = l;
 	      pcoor[dim] = l+L*p;
 	      peekLocalSite(s,plresult,pcoor);
 	      pokeLocalSite(s,result,lcoor);
 	    }
 	  }
 	}
-	  
+
-	fftw_destroy_plan(p);
+        Timer.Stop();
 	usec += Timer.useconds();
 	flops+= flops_call*NN;
        int pc = processor_coor[dim];
        for(int idx=0;idx<sgrid->lSites();idx++) { 
 	  std::vector<int> lcoor(Nd);
 	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
 	  std::vector<int> gcoor = lcoor;
 	  // extract the result
 	  sobj s;
 	  gcoor[dim] = lcoor[dim]+L*pc;
 	  peekLocalSite(s,pgresult,gcoor);
 	  pokeLocalSite(s,result,lcoor);
 	}
 	FFTW<scalar>::fftw_destroy_plan(p);
      }
 #endif
    }
  };
@@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
    std::vector<int> cores(0);
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -1,412 +0,0 @@
 /*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * The following statement of license applies *only* to this header file,
 * and *not* to the other files distributed with FFTW or derived therefrom:
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /***************************** NOTE TO USERS *********************************
 *
 *                 THIS IS A HEADER FILE, NOT A MANUAL
 *
 *    If you want to know how to use FFTW, please read the manual,
 *    online at http://www.fftw.org/doc/ and also included with FFTW.
 *    For a quick start, see the manual's tutorial section.
 *
 *   (Reading header files to learn how to use a library is a habit
 *    stemming from code lacking a proper manual.  Arguably, it's a
 *    *bad* habit in most cases, because header files can contain
 *    interfaces that are not part of the public, stable API.)
 *
 ****************************************************************************/
 #ifndef FFTW3_H
 #define FFTW3_H
 #include <stdio.h>
 #ifdef __cplusplus
 extern "C"
 {
 #endif /* __cplusplus */
 /* If <complex.h> is included, use the C99 complex type.  Otherwise
   define a type bit-compatible with C99 complex */
 #if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
 #  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
 #else
 #  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
 #endif
 #define FFTW_CONCAT(prefix, name) prefix ## name
 #define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
 #define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
 #define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
 #define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
 /* IMPORTANT: for Windows compilers, you should add a line
        #define FFTW_DLL
   here and in kernel/ifftw.h if you are compiling/using FFTW as a
   DLL, in order to do the proper importing/exporting, or
   alternatively compile with -DFFTW_DLL or the equivalent
   command-line flag.  This is not necessary under MinGW/Cygwin, where
   libtool does the imports/exports automatically. */
 #if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
   /* annoying Windows syntax for shared-library declarations */
 #  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
 #    define FFTW_EXTERN extern __declspec(dllexport) 
 #  else /* user is calling FFTW; import symbol */
 #    define FFTW_EXTERN extern __declspec(dllimport) 
 #  endif
 #else
 #  define FFTW_EXTERN extern
 #endif
 enum fftw_r2r_kind_do_not_use_me {
     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
 };
 struct fftw_iodim_do_not_use_me {
     int n;                     /* dimension size */
     int is;			/* input stride */
     int os;			/* output stride */
 };
 #include <stddef.h> /* for ptrdiff_t */
 struct fftw_iodim64_do_not_use_me {
     ptrdiff_t n;                     /* dimension size */
     ptrdiff_t is;			/* input stride */
     ptrdiff_t os;			/* output stride */
 };
 typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
 typedef int (*fftw_read_char_func_do_not_use_me)(void *);
 /*
  huge second-order macro that defines prototypes for all API
  functions.  We expand this macro for each supported precision
  X: name-mangling macro
  R: real data type
  C: complex data type
 */
 #define FFTW_DEFINE_API(X, R, C)					   \
 									   \
 FFTW_DEFINE_COMPLEX(R, C);						   \
 									   \
 typedef struct X(plan_s) *X(plan);					   \
 									   \
 typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
 typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
 									   \
 typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
 									   \
 typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
 typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
 									   \
 FFTW_EXTERN void X(execute)(const X(plan) p);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
 		    C *in, C *out, int sign, unsigned flags);		   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
 		       unsigned flags);					   \
 FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
 		       C *in, C *out, int sign, unsigned flags);	   \
 FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
 		       C *in, C *out, int sign, unsigned flags);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
                         int howmany,					   \
                         C *in, const int *inembed,			   \
                         int istride, int idist,			   \
                         C *out, const int *onembed,			   \
                         int ostride, int odist,			   \
                         int sign, unsigned flags);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
 			 int howmany_rank,				   \
 			 const X(iodim) *howmany_dims,			   \
 			 C *in, C *out,					   \
 			 int sign, unsigned flags);			   \
 FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
 			 int howmany_rank,				   \
 			 const X(iodim) *howmany_dims,			   \
 			 R *ri, R *ii, R *ro, R *io,			   \
 			 unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
                         const X(iodim64) *dims,			   \
 			 int howmany_rank,				   \
 			 const X(iodim64) *howmany_dims,		   \
 			 C *in, C *out,					   \
 			 int sign, unsigned flags);			   \
 FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
                         const X(iodim64) *dims,			   \
 			 int howmany_rank,				   \
 			 const X(iodim64) *howmany_dims,		   \
 			 R *ri, R *ii, R *ro, R *io,			   \
 			 unsigned flags);				   \
 									   \
 FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
 FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
                                      R *ro, R *io);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
                             int howmany,				   \
                             R *in, const int *inembed,			   \
                             int istride, int idist,			   \
                             C *out, const int *onembed,		   \
                             int ostride, int odist,			   \
                             unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
                        R *in, C *out, unsigned flags);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
 FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
 			   R *in, C *out, unsigned flags);		   \
 FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
 			   int n2,					   \
 			   R *in, C *out, unsigned flags);		   \
 									   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
 			     int howmany,				   \
 			     C *in, const int *inembed,			   \
 			     int istride, int idist,			   \
 			     R *out, const int *onembed,		   \
 			     int ostride, int odist,			   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
                        C *in, R *out, unsigned flags);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
 FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
 			   C *in, R *out, unsigned flags);		   \
 FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
 			   int n2,					   \
 			   C *in, R *out, unsigned flags);		   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     R *in, C *out,				   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     C *in, R *out,				   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
                             int rank, const X(iodim) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     R *in, R *ro, R *io,			   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
                             int rank, const X(iodim) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     R *ri, R *ii, R *out,			   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
                             const X(iodim64) *dims,			   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     R *in, C *out,				   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
                             const X(iodim64) *dims,			   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     C *in, R *out,				   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
                             int rank, const X(iodim64) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     R *in, R *ro, R *io,			   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
                             int rank, const X(iodim64) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     R *ri, R *ii, R *out,			   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
 FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
 									   \
 FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
                                          R *in, R *ro, R *io);		   \
 FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
                                          R *ri, R *ii, R *out);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
                         int howmany,					   \
                         R *in, const int *inembed,			   \
                         int istride, int idist,			   \
                         R *out, const int *onembed,			   \
                         int ostride, int odist,			   \
                         const X(r2r_kind) *kind, unsigned flags);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
                    const X(r2r_kind) *kind, unsigned flags);		   \
 									   \
 FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
                       X(r2r_kind) kind, unsigned flags);		   \
 FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
                       unsigned flags);					   \
 FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
                       R *in, R *out, X(r2r_kind) kind0,		   \
                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
                       unsigned flags);					   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
                         int howmany_rank,				   \
                         const X(iodim) *howmany_dims,			   \
                         R *in, R *out,					   \
                         const X(r2r_kind) *kind, unsigned flags);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
                         int howmany_rank,				   \
                         const X(iodim64) *howmany_dims,		   \
                         R *in, R *out,					   \
                         const X(r2r_kind) *kind, unsigned flags);	   \
 									   \
 FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
 									   \
 FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
 FFTW_EXTERN void X(forget_wisdom)(void);				   \
 FFTW_EXTERN void X(cleanup)(void);					   \
 									   \
 FFTW_EXTERN void X(set_timelimit)(double t);				   \
 									   \
 FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
 FFTW_EXTERN int X(init_threads)(void);					   \
 FFTW_EXTERN void X(cleanup_threads)(void);				   \
 									   \
 FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
 FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
 FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
 FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
                                  void *data);				   \
 FFTW_EXTERN int X(import_system_wisdom)(void);				   \
 FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
 FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
 FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
 FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
 									   \
 FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
 FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
 FFTW_EXTERN char *X(sprint_plan)(const X(plan) p);			   \
 									   \
 FFTW_EXTERN void *X(malloc)(size_t n);					   \
 FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
 FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
 FFTW_EXTERN void X(free)(void *p);					   \
 									   \
 FFTW_EXTERN void X(flops)(const X(plan) p,				   \
                          double *add, double *mul, double *fmas);	   \
 FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
 FFTW_EXTERN double X(cost)(const X(plan) p);				   \
 									   \
 FFTW_EXTERN int X(alignment_of)(R *p);                                     \
 FFTW_EXTERN const char X(version)[];                                       \
 FFTW_EXTERN const char X(cc)[];						   \
 FFTW_EXTERN const char X(codelet_optim)[];
 /* end of FFTW_DEFINE_API macro */
 FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
 FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
 FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
 /* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
 #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
 && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
 && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
 #  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
 /* note: __float128 is a typedef, which is not supported with the _Complex
         keyword in gcc, so instead we use this ugly __attribute__ version.
         However, we can't simply pass the __attribute__ version to
         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
 #    undef FFTW_DEFINE_COMPLEX
 #    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
 #  endif
 FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
 #endif
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #define FFTW_NO_TIMELIMIT (-1.0)
 /* documented flags */
 #define FFTW_MEASURE (0U)
 #define FFTW_DESTROY_INPUT (1U << 0)
 #define FFTW_UNALIGNED (1U << 1)
 #define FFTW_CONSERVE_MEMORY (1U << 2)
 #define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
 #define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
 #define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
 #define FFTW_ESTIMATE (1U << 6)
 #define FFTW_WISDOM_ONLY (1U << 21)
 /* undocumented beyond-guru flags */
 #define FFTW_ESTIMATE_PATIENT (1U << 7)
 #define FFTW_BELIEVE_PCOST (1U << 8)
 #define FFTW_NO_DFT_R2HC (1U << 9)
 #define FFTW_NO_NONTHREADED (1U << 10)
 #define FFTW_NO_BUFFERING (1U << 11)
 #define FFTW_NO_INDIRECT_OP (1U << 12)
 #define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
 #define FFTW_NO_RANK_SPLITS (1U << 14)
 #define FFTW_NO_VRANK_SPLITS (1U << 15)
 #define FFTW_NO_VRECURSE (1U << 16)
 #define FFTW_NO_SIMD (1U << 17)
 #define FFTW_NO_SLOW (1U << 18)
 #define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
 #define FFTW_ALLOW_PRUNING (1U << 20)
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif /* __cplusplus */
 #endif /* FFTW3_H */
@@ -194,22 +194,22 @@ class BinaryIO {
      std::vector<int> site({x,y,z,t});
-      if ( grid->IsBoss() ) {
+      if (grid->IsBoss()) {
-	fin.read((char *)&file_object,sizeof(file_object));
+        fin.read((char *)&file_object, sizeof(file_object));
-	bytes += sizeof(file_object);
+        bytes += sizeof(file_object);
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
-	munge(file_object,munged,csum);
+        munge(file_object, munged, csum);
      }
      // The boss who read the file has their value poked
      pokeSite(munged,Umu,site);
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -254,20 +254,20 @@ class BinaryIO {
      if ( grid->IsBoss() ) {
-	
+  
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
-	// NB could gather an xstrip as an optimisation.
+  // NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));
+  fout.write((char *)&file_object,sizeof(file_object));
-	bytes+=sizeof(file_object);
+  bytes+=sizeof(file_object);
      }
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -305,15 +305,15 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if( rank == grid->ThisRank() ){
-	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-	parallel.GetState(saved,l_idx);
+  parallel.GetState(saved,l_idx);
      }
      grid->Broadcast(rank,(void *)&saved[0],bytes);
      if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);
+  fout.write((char *)&saved[0],bytes);
      }
    }
@@ -355,14 +355,14 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);
+  fin.read((char *)&saved[0],bytes);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
      }
      grid->Broadcast(0,(void *)&saved[0],bytes);
      if( rank == grid->ThisRank() ){
-	parallel.SetState(saved,l_idx);
+  parallel.SetState(saved,l_idx);
      }
    }
@@ -415,15 +415,15 @@ class BinaryIO {
      if ( d == 0 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
+  range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
+  start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
+  range[d] = grid->_gdimensions[d];
-	start[d] = 0;
+  start[d] = 0;
-	ioproc[d]= 0;
+  ioproc[d]= 0;
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
    }
@@ -434,9 +434,9 @@ class BinaryIO {
      std::cout<< std::dec ;
      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
+  std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
+  if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@@ -463,7 +463,7 @@ class BinaryIO {
      // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@@ -472,8 +472,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  gsite[d] = tsite[d]+start[d];               // global site
      }
      /////////////////////////
@@ -487,29 +487,29 @@ class BinaryIO {
      // iorank reads from the seek
      ////////////////////////////////
      if (myrank == iorank) {
-	
+  
-	fin.seekg(offset+g_idx*sizeof(fileObj));
+  fin.seekg(offset+g_idx*sizeof(fileObj));
-	fin.read((char *)&fileObj,sizeof(fileObj));
+  fin.read((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  bytes+=sizeof(fileObj);
-	
+  
-	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-	
+  
-	munge(fileObj,siteObj,csum);
+  munge(fileObj,siteObj,csum);
-      }	
+      } 
      // Possibly do transport through pt2pt 
      if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
+  if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-	}
+  }
      }
      // Poke at destination
      if ( myrank == rank ) {
-	  pokeLocalSite(siteObj,Umu,lsite);
+    pokeLocalSite(siteObj,Umu,lsite);
      }
      grid->Barrier(); // necessary?
    }
@@ -520,7 +520,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -558,15 +558,15 @@ class BinaryIO {
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
+  range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
+  start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
+  range[d] = grid->_gdimensions[d];
-	start[d] = 0;
+  start[d] = 0;
-	ioproc[d]= 0;
+  ioproc[d]= 0;
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
@@ -577,9 +577,9 @@ class BinaryIO {
      grid->GlobalSum(tmp);
      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
+  std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
+  if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@@ -610,7 +610,7 @@ class BinaryIO {
    // should aggregate a whole chunk and then write.
    // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@@ -619,8 +619,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  gsite[d] = tsite[d]+start[d];               // global site
      }
@@ -640,26 +640,26 @@ class BinaryIO {
      // Pair of nodes may need to do pt2pt send
      if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
+    // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-	}
+  }
      }
      grid->Barrier(); // necessary?
      if (myrank == iorank) {
-	
+  
-	munge(siteObj,fileObj,csum);
+  munge(siteObj,fileObj,csum);
-	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-	
+  
-	fout.seekp(offset+g_idx*sizeof(fileObj));
+  fout.seekp(offset+g_idx*sizeof(fileObj));
-	fout.write((char *)&fileObj,sizeof(fileObj));
+  fout.write((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  bytes+=sizeof(fileObj);
      }
    }
@@ -668,7 +668,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -55,11 +55,14 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
    // note: static const int or constexpr will work for type deductions
    //       with the intel compiler (up to version 17)
    //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
+    #define ColourIndex  2
-    static const int SpinIndex   = 1;
+    #define SpinIndex    1
-    static const int LorentzIndex= 0;
+    #define LorentzIndex 0
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
@@ -96,7 +96,25 @@ namespace Grid {
     WilsonKernels(const ImplParams &p= ImplParams());
    };
-
+    
    ///////////////////////////////////////////////////////////
    // Default to no assembler implementation
    ///////////////////////////////////////////////////////////
    template<class Impl>
    void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
    }
    template<class Impl>
    void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
    }
  }
 }
 #endif
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -26,68 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+*************************************************************************************/
-    /*  END LEGAL */
+/*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
-namespace QCD {
+  namespace QCD {
-
+    
  ///////////////////////////////////////////////////////////
  // Default to no assembler implementation
  ///////////////////////////////////////////////////////////
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
 #if defined(AVX512) 
-
+    
-
+    
-  ///////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////
-  // If we are AVX512 specialise the single precision routine
+    // If we are AVX512 specialise the single precision routine
-  ///////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////
-
+    
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-
+    
-static Vector<vComplexF> signs;
+    static Vector<vComplexF> signs;
-
+    
-int setupSigns(void ){
+    int setupSigns(void ){
-  Vector<vComplexF> bother(2);
+      Vector<vComplexF> bother(2);
-  signs = bother;
+      signs = bother;
-  vrsign(signs[0]);
+      vrsign(signs[0]);
-  visign(signs[1]);
+      visign(signs[1]);
-  return 1;
+      return 1;
-}
+    }
-static int signInit = setupSigns();
+    static int signInit = setupSigns();
-
+  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-
+  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-
+  
 #undef KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+      
 #define KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -98,43 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebesgue
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-
+				    
 #undef KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #define KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #endif
-
+  }
-
+}
 template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
@@ -35,6 +35,9 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
  std::vector<int> mpi_layout  = GridDefaultMpi();
@@ -75,10 +78,10 @@ int main (int argc, char ** argv)
  FFT theFFT(&Fine);
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
  //  C=zero;
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@@ -90,10 +93,10 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
  SpinMatrixD Sp; 
  Sp = zero; Sp = Sp+cVol;
@@ -0,0 +1,111 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_cshift.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplexF::Nsimd(),1,1,1});
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int vol = 1;
  for(int d=0;d<latt_size.size();d++){
    vol = vol * latt_size[d];
  }
  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
  LatticeComplexF     one(&Fine);
  LatticeComplexF      zz(&Fine);
  LatticeComplexF       C(&Fine);
  LatticeComplexF  Ctilde(&Fine);
  LatticeComplexF    coor(&Fine);
  LatticeSpinMatrixF    S(&Fine);
  LatticeSpinMatrixF    Stilde(&Fine);
  std::vector<int> p({1,2,3,2});
  one = ComplexF(1.0,0.0);
  zz  = ComplexF(0.0,0.0);
  ComplexF ci(0.0,1.0);
  C=zero;
  for(int mu=0;mu<4;mu++){
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
    LatticeCoordinate(coor,mu);
    C = C - (TwoPiL * p[mu]) * coor;
  }
  C = exp(C*ci);
  S=zero;
  S = S+C;
  FFT theFFT(&Fine);
  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
  //  C=zero;
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
  TComplexF cVol;
  cVol()()() = vol;
  C=zero;
  pokeSite(cVol,C,p);
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;
  S=zero;
  pokeSite(Sp,S,p);
  S= S-Stilde;
  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
  Grid_finalize();
 }