Merge branch 'feature/FFT' into develop

2026-02-06 07:03:28 +00:00 · 2016-08-22 16:25:04 +01:00
parent fc25d2295c 356e7940fd
commit 8a02824e08
8 changed files with 812 additions and 10 deletions
--- a/configure.ac
+++ b/configure.ac
@@ -43,6 +43,7 @@ AC_ARG_WITH([mpfr],
 AC_ARG_ENABLE([lapack],
    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
    [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+
 case ${ac_LAPACK} in
    no)
        ;;
@@ -54,6 +55,17 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac

+AC_CHECK_LIB([fftw3],[fft_init_threads],
+	[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
+        [ac_fftw=no])
+case ${ac_fftw} in
+    no)
+        ;;
+    yes)
+        AM_LDFLAGS="-lfftw3 $AM_LDFLAGS"
+esac
+
+
 ################ Get compiler informations
 AC_LANG([C++])
 AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
@@ -77,7 +89,7 @@ AC_CHECK_LIB([gmp],[__gmpf_init],
                 [have_mpfr=true]
                 [LIBS="$LIBS -lmpfr"],
                 [AC_MSG_ERROR([MPFR library not found])])]
-             [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
+   	     [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
             [have_gmp=true]
             [LIBS="$LIBS -lgmp"],
             [AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -0,0 +1,193 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Cshift.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#include <Grid/fftw/fftw3.h>
+
+namespace Grid {
+
+  
+  class FFT { 
+  private:
+
+    GridCartesian *vgrid;
+    GridCartesian *sgrid;
+
+    int Nd;
+    std::vector<int> dimensions;
+    std::vector<int> processors;
+    std::vector<int> processor_coor;
+
+  public:
+
+    static const int forward=FFTW_FORWARD;
+    static const int backward=FFTW_BACKWARD;
+
+    FFT ( GridCartesian * grid ) : 
+      vgrid(grid),
+      Nd(grid->_ndimension),
+      dimensions(grid->_fdimensions),
+      processors(grid->_processors),
+      processor_coor(grid->_processor_coor)
+    {
+      std::vector<int> layout(Nd,1);
+      sgrid = new GridCartesian(dimensions,layout,processors);
+    };
+
+    ~FFT ( void)  { 
+      delete sgrid; 
+    }
+    
+    template<class vobj>
+    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
+
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+
+      int L = vgrid->_ldimensions[dim];
+      int G = vgrid->_fdimensions[dim];
+
+      std::vector<int> layout(Nd,1);
+      std::vector<int> pencil_gd(vgrid->_fdimensions);
+      std::vector<int> pencil_ld(processors);
+
+      pencil_gd[dim] = G*processors[dim];    
+      pencil_ld[dim] = G*processors[dim];    
+
+      // Pencil global vol LxLxGxLxL per node
+      GridCartesian pencil_g(pencil_gd,layout,processors);
+      GridCartesian pencil_l(pencil_ld,layout,processors);
+
+      // Construct pencils
+      typedef typename vobj::scalar_object sobj;
+      Lattice<vobj> ssource(vgrid); ssource =source;
+      Lattice<sobj> pgsource(&pencil_g);
+      Lattice<sobj> pgresult(&pencil_g);
+      Lattice<sobj> plsource(&pencil_l);
+      Lattice<sobj> plresult(&pencil_l);
+
+      {
+	assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
+	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+
+	int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
+
+	int rank = 1;  /* not 2: we are computing 1d transforms */
+	int n[] = {G}; /* 1d transforms of length G */
+	int howmany = Ncomp;
+	int odist,idist,istride,ostride;
+	idist   = odist   = 1;
+	istride = ostride = Ncomp; /* distance between two elements in the same column */
+	int *inembed = n, *onembed = n;
+
+	fftw_complex *in = (fftw_complex *)&plsource._odata[0];
+	fftw_complex *out= (fftw_complex *)&plresult._odata[0];
+	
+	int sign = FFTW_FORWARD;
+	if (inverse) sign = FFTW_BACKWARD;
+
+#ifdef HAVE_FFTW
+	fftw_plan p = fftw_plan_many_dft(rank,n,howmany,
+					 in,inembed,
+					 istride,idist,
+					 out,onembed,
+					 ostride, odist,
+					 sign,FFTW_ESTIMATE);
+#else 
+	fftw_plan p ;
+	assert(0);
+#endif
+
+	// Barrel shift and collect global pencil
+	for(int p=0;p<processors[dim];p++) { 
+
+	  for(int idx=0;idx<sgrid->lSites();idx++) { 
+
+	    std::vector<int> lcoor(Nd);
+    	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+	    sobj s;
+
+	    peekLocalSite(s,ssource,lcoor);
+
+	    lcoor[dim]+=p*L;
+	   
+	    pokeLocalSite(s,pgsource,lcoor);
+	  }
+
+	  ssource = Cshift(ssource,dim,L);
+	}
+	
+	// Loop over orthog coords
+	for(int idx=0;idx<sgrid->lSites();idx++) { 
+
+	  std::vector<int> pcoor(Nd,0);
+	  std::vector<int> lcoor(Nd);
+	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+	  
+	    // Project to local pencil array
+	    for(int l=0;l<G;l++){
+	      sobj s;
+	      pcoor[dim]=l;
+	      lcoor[dim]=l;
+	      peekLocalSite(s,pgsource,lcoor);
+	      pokeLocalSite(s,plsource,pcoor);
+	    }
+
+	    // FFT the pencil
+#ifdef HAVE_FFTW
+	    fftw_execute(p);
+#endif
+
+	    // Extract the result
+	    for(int l=0;l<L;l++){
+	      sobj s;
+	      int p = processor_coor[dim];
+	      lcoor[dim] = l;
+	      pcoor[dim] = l+L*p;
+	      peekLocalSite(s,plresult,pcoor);
+	      pokeLocalSite(s,result,lcoor);
+	    }
+
+	  }
+	}
+	  
+	fftw_destroy_plan(p);
+      }
+    }
+
+  };
+
+
+}
+
+#endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Simd.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
 #include <Grid/Communicator.h> 
 #include <Grid/Cartesian.h>    
 #include <Grid/Tensors.h>      
@@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
-#include <Grid/Init.h>
+
+#include <Grid/FFT.h>

 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
--- a/lib/fftw/fftw3.h
+++ b/lib/fftw/fftw3.h
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * The following statement of license applies *only* to this header file,
+ * and *not* to the other files distributed with FFTW or derived therefrom:
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***************************** NOTE TO USERS *********************************
+ *
+ *                 THIS IS A HEADER FILE, NOT A MANUAL
+ *
+ *    If you want to know how to use FFTW, please read the manual,
+ *    online at http://www.fftw.org/doc/ and also included with FFTW.
+ *    For a quick start, see the manual's tutorial section.
+ *
+ *   (Reading header files to learn how to use a library is a habit
+ *    stemming from code lacking a proper manual.  Arguably, it's a
+ *    *bad* habit in most cases, because header files can contain
+ *    interfaces that are not part of the public, stable API.)
+ *
+ ****************************************************************************/
+
+#ifndef FFTW3_H
+#define FFTW3_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* If <complex.h> is included, use the C99 complex type.  Otherwise
+   define a type bit-compatible with C99 complex */
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
+#else
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
+#endif
+
+#define FFTW_CONCAT(prefix, name) prefix ## name
+#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
+#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
+#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
+#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
+
+/* IMPORTANT: for Windows compilers, you should add a line
+        #define FFTW_DLL
+   here and in kernel/ifftw.h if you are compiling/using FFTW as a
+   DLL, in order to do the proper importing/exporting, or
+   alternatively compile with -DFFTW_DLL or the equivalent
+   command-line flag.  This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically. */
+#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
+   /* annoying Windows syntax for shared-library declarations */
+#  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
+#    define FFTW_EXTERN extern __declspec(dllexport) 
+#  else /* user is calling FFTW; import symbol */
+#    define FFTW_EXTERN extern __declspec(dllimport) 
+#  endif
+#else
+#  define FFTW_EXTERN extern
+#endif
+
+enum fftw_r2r_kind_do_not_use_me {
+     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
+     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
+     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
+};
+
+struct fftw_iodim_do_not_use_me {
+     int n;                     /* dimension size */
+     int is;			/* input stride */
+     int os;			/* output stride */
+};
+
+#include <stddef.h> /* for ptrdiff_t */
+struct fftw_iodim64_do_not_use_me {
+     ptrdiff_t n;                     /* dimension size */
+     ptrdiff_t is;			/* input stride */
+     ptrdiff_t os;			/* output stride */
+};
+
+typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
+typedef int (*fftw_read_char_func_do_not_use_me)(void *);
+
+/*
+  huge second-order macro that defines prototypes for all API
+  functions.  We expand this macro for each supported precision
+ 
+  X: name-mangling macro
+  R: real data type
+  C: complex data type
+*/
+
+#define FFTW_DEFINE_API(X, R, C)					   \
+									   \
+FFTW_DEFINE_COMPLEX(R, C);						   \
+									   \
+typedef struct X(plan_s) *X(plan);					   \
+									   \
+typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
+typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
+									   \
+typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
+									   \
+typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
+typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
+									   \
+FFTW_EXTERN void X(execute)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
+		    C *in, C *out, int sign, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
+		       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         C *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         C *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         int sign, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
+FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
+                                      R *ro, R *io);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
+                             int howmany,				   \
+                             R *in, const int *inembed,			   \
+                             int istride, int idist,			   \
+                             C *out, const int *onembed,		   \
+                             int ostride, int odist,			   \
+                             unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
+                        R *in, C *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
+			   R *in, C *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   R *in, C *out, unsigned flags);		   \
+									   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
+			     int howmany,				   \
+			     C *in, const int *inembed,			   \
+			     int istride, int idist,			   \
+			     R *out, const int *onembed,		   \
+			     int ostride, int odist,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
+                        C *in, R *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
+			   C *in, R *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   C *in, R *out, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
+FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
+                                          R *in, R *ro, R *io);		   \
+FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
+                                          R *ri, R *ii, R *out);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         R *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         R *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
+                    const X(r2r_kind) *kind, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
+                       X(r2r_kind) kind, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
+                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
+                       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
+                       R *in, R *out, X(r2r_kind) kind0,		   \
+                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
+                       unsigned flags);					   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
+                         int howmany_rank,				   \
+                         const X(iodim) *howmany_dims,			   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
+                         int howmany_rank,				   \
+                         const X(iodim64) *howmany_dims,		   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
+FFTW_EXTERN void X(forget_wisdom)(void);				   \
+FFTW_EXTERN void X(cleanup)(void);					   \
+									   \
+FFTW_EXTERN void X(set_timelimit)(double t);				   \
+									   \
+FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
+FFTW_EXTERN int X(init_threads)(void);					   \
+FFTW_EXTERN void X(cleanup_threads)(void);				   \
+									   \
+FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
+FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
+FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
+FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
+                                  void *data);				   \
+FFTW_EXTERN int X(import_system_wisdom)(void);				   \
+FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
+FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
+FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
+FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
+									   \
+FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
+FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
+FFTW_EXTERN char *X(sprint_plan)(const X(plan) p);			   \
+									   \
+FFTW_EXTERN void *X(malloc)(size_t n);					   \
+FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
+FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
+FFTW_EXTERN void X(free)(void *p);					   \
+									   \
+FFTW_EXTERN void X(flops)(const X(plan) p,				   \
+                          double *add, double *mul, double *fmas);	   \
+FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
+FFTW_EXTERN double X(cost)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN int X(alignment_of)(R *p);                                     \
+FFTW_EXTERN const char X(version)[];                                       \
+FFTW_EXTERN const char X(cc)[];						   \
+FFTW_EXTERN const char X(codelet_optim)[];
+
+
+/* end of FFTW_DEFINE_API macro */
+
+FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
+
+/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
+   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
+#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
+ && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
+ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
+#  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+/* note: __float128 is a typedef, which is not supported with the _Complex
+         keyword in gcc, so instead we use this ugly __attribute__ version.
+         However, we can't simply pass the __attribute__ version to
+         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
+         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
+#    undef FFTW_DEFINE_COMPLEX
+#    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
+#  endif
+FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
+#endif
+
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+
+#define FFTW_NO_TIMELIMIT (-1.0)
+
+/* documented flags */
+#define FFTW_MEASURE (0U)
+#define FFTW_DESTROY_INPUT (1U << 0)
+#define FFTW_UNALIGNED (1U << 1)
+#define FFTW_CONSERVE_MEMORY (1U << 2)
+#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
+#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
+#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
+#define FFTW_ESTIMATE (1U << 6)
+#define FFTW_WISDOM_ONLY (1U << 21)
+
+/* undocumented beyond-guru flags */
+#define FFTW_ESTIMATE_PATIENT (1U << 7)
+#define FFTW_BELIEVE_PCOST (1U << 8)
+#define FFTW_NO_DFT_R2HC (1U << 9)
+#define FFTW_NO_NONTHREADED (1U << 10)
+#define FFTW_NO_BUFFERING (1U << 11)
+#define FFTW_NO_INDIRECT_OP (1U << 12)
+#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
+#define FFTW_NO_RANK_SPLITS (1U << 14)
+#define FFTW_NO_VRANK_SPLITS (1U << 15)
+#define FFTW_NO_VRECURSE (1U << 16)
+#define FFTW_NO_SIMD (1U << 17)
+#define FFTW_NO_SLOW (1U << 18)
+#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
+#define FFTW_ALLOW_PRUNING (1U << 20)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* FFTW3_H */
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
  }

-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<ig->lSites();idx++){
    std::vector<int> lcoor(ni);
    ig->LocalIndexToLocalCoor(idx,lcoor);
@@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in

 }

+
+template<class vobj>
+void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) { 
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,lowDim,lcoor);
+      pokeLocalSite(s,higherDim,hcoor);
+    }
+  }
+}
+
+
+template<class vobj>
+void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) { 
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,higherDim,hcoor);
+      pokeLocalSite(s,lowDim,lcoor);
+    }
+  }
+}
+
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -388,6 +388,12 @@ class Grid_simd {

 };  // end of Grid_simd class definition

+
+inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
+inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
+inline void permute(RealD &y,RealD b, int perm) {  y=b; }
+inline void permute(RealF &y,RealF b, int perm) {  y=b; }
+
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,15 +67,13 @@ template <class scalar>
 struct AsinRealFunctor {
  scalar operator()(const scalar &a) const { return asin(real(a)); }
 };
-
 template <class scalar>
 struct LogRealFunctor {
  scalar operator()(const scalar &a) const { return log(real(a)); }
 };
-
 template <class scalar>
-struct ExpRealFunctor {
-  scalar operator()(const scalar &a) const { return exp(real(a)); }
+struct ExpFunctor {
+  scalar operator()(const scalar &a) const { return exp(a); }
 };
 template <class scalar>
 struct NotFunctor {
@@ -85,7 +83,6 @@ template <class scalar>
 struct AbsRealFunctor {
  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 };
-
 template <class scalar>
 struct PowRealFunctor {
  double y;
@@ -135,7 +132,6 @@ template <class Scalar>
 inline Scalar rsqrt(const Scalar &r) {
  return (RSqrtRealFunctor<Scalar>(), r);
 }
-
 template <class S, class V>
 inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
  return SimdApply(CosRealFunctor<S>(), r);
@@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 }
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
-  return SimdApply(ExpRealFunctor<S>(), r);
+  return SimdApply(ExpFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -0,0 +1,108 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_cshift.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
+  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexD     one(&Fine);
+  LatticeComplexD      zz(&Fine);
+  LatticeComplexD       C(&Fine);
+  LatticeComplexD  Ctilde(&Fine);
+  LatticeComplexD    coor(&Fine);
+
+  LatticeSpinMatrixD    S(&Fine);
+  LatticeSpinMatrixD    Stilde(&Fine);
+  
+  std::vector<int> p({1,2,3,2});
+
+  one = ComplexD(1.0,0.0);
+  zz  = ComplexD(0.0,0.0);
+
+  ComplexD ci(0.0,1.0);
+
+  C=zero;
+  for(int mu=0;mu<4;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    LatticeCoordinate(coor,mu);
+    C = C - (TwoPiL * p[mu]) * coor;
+  }
+
+  C = exp(C*ci);
+
+  S=zero;
+  S = S+C;
+
+  FFT theFFT(&Fine);
+
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);
+
+  //  C=zero;
+  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
+  TComplexD cVol;
+  cVol()()() = vol;
+
+  C=zero;
+  pokeSite(cVol,C,p);
+  C=C-Ctilde;
+  std::cout << "diff scalar "<<norm2(C) << std::endl;
+
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);
+
+  SpinMatrixD Sp; 
+  Sp = zero; Sp = Sp+cVol;
+
+  S=zero;
+  pokeSite(Sp,S,p);
+
+  S= S-Stilde;
+  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
+
+  Grid_finalize();
+}