diff --git a/lib/FFT.h b/lib/FFT.h
new file mode 100644
index 00000000..262e525b
--- /dev/null
+++ b/lib/FFT.h
@@ -0,0 +1,198 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/FFT.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#include <Grid/fftw/fftw3.h>
+
+namespace Grid {
+
+  
+  // FFT: Fourier transforms a distributed Lattice along one dimension at a time using FFTW.
+  class FFT {
+  private:
+    GridCartesian *vgrid;  // grid handed to the constructor; never deleted by this class
+    GridCartesian *sgrid;  // same dimensions/processors with an all-ones layout; owned by this class
+    int Nd;
+    std::vector<int> dimensions;
+    std::vector<int> processors;
+    std::vector<int> processor_coor;
+  public:
+    // Direction selectors for FFT_dim. FFTW defines FFTW_FORWARD as -1 and FFTW_BACKWARD as +1.
+    static const int forward=FFTW_FORWARD;
+    static const int backward=FFTW_BACKWARD;
+    // Caches the geometry of grid and allocates the all-ones-layout companion grid sgrid.
+    FFT ( GridCartesian * grid ) :
+      vgrid(grid),
+      Nd(grid->_ndimension),
+      dimensions(grid->_fdimensions),
+      processors(grid->_processors),
+      processor_coor(grid->_processor_coor)
+    {
+      std::vector<int> layout(Nd,1);
+      sgrid = new GridCartesian(dimensions,layout,processors);
+    }
+    ~FFT ( void)  { delete sgrid; }
+
+    // sgrid is a raw owning pointer, so a copied FFT would delete it twice: forbid copies.
+    FFT(const FFT &) = delete;
+    FFT &operator=(const FFT &) = delete;
+
+    // Fourier transforms source along dimension dim and writes the outcome to result.
+    //   result, source : lattices living on the grid this FFT was constructed with.
+    //   dim            : dimension to transform, 0 <= dim < Nd.
+    //   inverse        : FFT::backward (or any positive value such as 1) requests the backward
+    //                    transform; FFT::forward (or 0) requests the forward transform.
+    // The output is not rescaled here (FFTW transforms are unnormalised).
+    template<class vobj>
+    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+      assert( (dim>=0) && (dim<Nd) );
+
+      int L  = vgrid->_ldimensions[dim];
+      int G  = vgrid->_fdimensions[dim];
+      int pc = processor_coor[dim];
+
+      std::vector<int> layout(Nd,1);
+      std::vector<int> pencil_gd(vgrid->_fdimensions);
+      std::vector<int> pencil_ld(processors);
+
+      pencil_gd[dim] = G*processors[dim];
+      pencil_ld[dim] = G*processors[dim];
+
+      // Pencil global vol LxLxGxLxL per node
+      GridCartesian pencil_g(pencil_gd,layout,processors);
+      GridCartesian pencil_l(pencil_ld,layout,processors);
+
+      // Construct pencils
+      typedef typename vobj::scalar_object sobj;
+      Lattice<vobj> ssource(vgrid); ssource =source;
+      Lattice<sobj> pgsource(&pencil_g);
+      Lattice<sobj> pgresult(&pencil_g);
+      Lattice<sobj> plsource(&pencil_l);
+      Lattice<sobj> plresult(&pencil_l);
+
+      {
+        // The pencil buffers are handed to FFTW as fftw_complex arrays, so the sizes must agree.
+        assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
+        assert(sizeof(fftw_complex)==sizeof(ComplexD));
+
+        int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
+
+        std::cout << "Ncomp = "<<Ncomp<<std::endl;
+
+        int rank = 1;   /* not 2: we are computing 1d transforms */
+        int n[] = {G}; /* 1d transforms of length G */
+        int howmany = Ncomp;
+        int odist,idist,istride,ostride;
+        idist   = odist   = 1;
+        istride = ostride = Ncomp; /* distance between two elements in the same column */
+        int *inembed = n, *onembed = n;
+
+        fftw_complex *in = (fftw_complex *)&plsource._odata[0];
+        fftw_complex *out= (fftw_complex *)&plresult._odata[0];
+
+        // FFT::forward is FFTW_FORWARD (-1), which is non-zero: testing the truth value of
+        // inverse would turn every FFT::forward request into a backward transform.
+        int sign = (inverse > 0) ? FFTW_BACKWARD : FFTW_FORWARD;
+
+        fftw_plan plan = fftw_plan_many_dft(rank,n,howmany,
+                                            in,inembed,
+                                            istride,idist,
+                                            out,onembed,
+                                            ostride, odist,
+                                            sign,FFTW_ESTIMATE);
+
+        // Barrel shift and collect global pencil
+        for(int shift=0;shift<processors[dim];shift++) {
+          // Cshift(x,dim,L) delivers x(site+L), so after shift steps this node holds the block
+          // that belongs at processor coordinate (pc+shift) mod processors[dim] of the pencil.
+          int block = (pc+shift) % processors[dim];
+
+          for(int idx=0;idx<sgrid->lSites();idx++) {
+            std::vector<int> lcoor(Nd);
+            sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+            sobj s;
+            peekLocalSite(s,ssource,lcoor);
+
+            lcoor[dim]+=block*L;
+            pokeLocalSite(s,pgsource,lcoor);
+          }
+
+          // A shift after the last block would only produce data that is never read.
+          if ( shift+1 < processors[dim] ) ssource = Cshift(ssource,dim,L);
+        }
+
+        std::cout << " pgsource pencil " << pgsource<<std::endl ;
+
+        // Loop over orthog coords
+        for(int idx=0;idx<sgrid->lSites();idx++) {
+          std::vector<int> pcoor(Nd,0);
+          std::vector<int> lcoor(Nd);
+          sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+          if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+            // Project to local pencil array
+            for(int l=0;l<G;l++){
+              sobj s;
+              pcoor[dim]=l;
+              lcoor[dim]=l;
+              peekLocalSite(s,pgsource,lcoor);
+              pokeLocalSite(s,plsource,pcoor);
+            }
+
+            if ( idx==0) {
+              std::cout << " plsource pencil " << plsource<<std::endl ;
+            }
+
+            // FFT the pencil
+            fftw_execute(plan);
+
+            // Extract the result
+            for(int l=0;l<L;l++){
+              sobj s;
+              lcoor[dim] = l;
+              pcoor[dim] = l+L*pc;
+              peekLocalSite(s,plresult,pcoor);
+              pokeLocalSite(s,result,lcoor);
+            }
+          }
+        }
+
+        fftw_destroy_plan(plan);
+      }
+    }
+
+  };
+
+
+}
+
+#endif
diff --git a/lib/Grid.h b/lib/Grid.h
index 9de8470d..486ee4d3 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Simd.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
 #include <Grid/Communicator.h> 
 #include <Grid/Cartesian.h>    
 #include <Grid/Tensors.h>      
@@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
-#include <Grid/Init.h>
+
+#include <Grid/FFT.h>
 
 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 2fa72014..cc4617de 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
     assert(ig->_ldimensions[d] == og->_ldimensions[d]);
   }
 
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
   for(int idx=0;idx<ig->lSites();idx++){
     std::vector<int> lcoor(ni);
     ig->LocalIndexToLocalCoor(idx,lcoor);
@@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 
 }
 
+
+// Copies, site by site, the local plane lcoor[orthog]==slice_lo of lowDim into the
+// plane hcoor[orthog]==slice_hi of higherDim; every other coordinate is carried over unchanged.
+template<class vobj>
+void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  using scalar_site = typename vobj::scalar_object;
+  scalar_site site;
+
+  GridBase *src_grid = lowDim._grid;
+  GridBase *dst_grid = higherDim._grid;
+  const int src_nd = src_grid->_ndimension;
+  const int dst_nd = dst_grid->_ndimension;
+
+  // Same rank, processor grid and local extents: this should guarantee the copies are local.
+  assert(src_nd == dst_nd);
+  assert(orthog<dst_nd);
+  assert(orthog>=0);
+  for(int mu=0;mu<dst_nd;++mu){
+    assert(src_grid->_processors[mu]  == dst_grid->_processors[mu]);
+    assert(src_grid->_ldimensions[mu] == dst_grid->_ldimensions[mu]);
+  }
+
+  // Serial loop (PARALLEL_FOR_LOOP is commented out); note that `site` is shared by all iterations.
+  for(int site_idx=0;site_idx<src_grid->lSites();++site_idx){
+    std::vector<int> src_coor(src_nd);
+    std::vector<int> dst_coor(dst_nd);
+    src_grid->LocalIndexToLocalCoor(site_idx,src_coor);
+    if( src_coor[orthog] != slice_lo ) continue;
+    dst_coor=src_coor;
+    dst_coor[orthog] = slice_hi;
+    peekLocalSite(site,lowDim,src_coor);
+    pokeLocalSite(site,higherDim,dst_coor);
+  }
+}
+
+
+// Copies, site by site, the local plane hcoor[orthog]==slice_hi of higherDim into the
+// plane lcoor[orthog]==slice_lo of lowDim; every other coordinate is carried over unchanged.
+template<class vobj>
+void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  using scalar_site = typename vobj::scalar_object;
+  scalar_site site;
+
+  GridBase *dst_grid = lowDim._grid;
+  GridBase *src_grid = higherDim._grid;
+  const int dst_nd = dst_grid->_ndimension;
+  const int src_nd = src_grid->_ndimension;
+
+  // Same rank, processor grid and local extents: this should guarantee the copies are local.
+  assert(dst_nd == src_nd);
+  assert(orthog<src_nd);
+  assert(orthog>=0);
+  for(int mu=0;mu<src_nd;++mu){
+    assert(dst_grid->_processors[mu]  == src_grid->_processors[mu]);
+    assert(dst_grid->_ldimensions[mu] == src_grid->_ldimensions[mu]);
+  }
+
+  // Serial loop (PARALLEL_FOR_LOOP is commented out); note that `site` is shared by all iterations.
+  for(int site_idx=0;site_idx<dst_grid->lSites();++site_idx){
+    std::vector<int> dst_coor(dst_nd);
+    std::vector<int> src_coor(src_nd);
+    dst_grid->LocalIndexToLocalCoor(site_idx,dst_coor);
+    if( dst_coor[orthog] != slice_lo ) continue;
+    src_coor=dst_coor;
+    src_coor[orthog] = slice_hi;
+    peekLocalSite(site,higherDim,src_coor);
+    pokeLocalSite(site,lowDim,dst_coor);
+  }
+}
+
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 2f2b70c8..30576fb6 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -388,6 +388,12 @@ class Grid_simd {
 
 };  // end of Grid_simd class definition
 
+
+inline void permute(ComplexD &y, ComplexD b, int perm) { (void)perm; y = b; }  // scalar overloads: y receives b, perm is ignored
+inline void permute(ComplexF &y, ComplexF b, int perm) { (void)perm; y = b; }
+inline void permute(RealD    &y, RealD    b, int perm) { (void)perm; y = b; }
+inline void permute(RealF    &y, RealF    b, int perm) { (void)perm; y = b; }
+
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index a67f4e7d..2afac190 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,15 +67,13 @@ template <class scalar>
 struct AsinRealFunctor {
   scalar operator()(const scalar &a) const { return asin(real(a)); }
 };
-
 template <class scalar>
 struct LogRealFunctor {
   scalar operator()(const scalar &a) const { return log(real(a)); }
 };
-
 template <class scalar>
-struct ExpRealFunctor {
-  scalar operator()(const scalar &a) const { return exp(real(a)); }
+struct ExpFunctor {
+  scalar operator()(const scalar &a) const { return exp(a); }
 };
 template <class scalar>
 struct NotFunctor {
@@ -85,7 +83,6 @@ template <class scalar>
 struct AbsRealFunctor {
   scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 };
-
 template <class scalar>
 struct PowRealFunctor {
   double y;
@@ -135,7 +132,6 @@ template <class Scalar>
 inline Scalar rsqrt(const Scalar &r) {
   return (RSqrtRealFunctor<Scalar>(), r);
 }
-
 template <class S, class V>
 inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
   return SimdApply(CosRealFunctor<S>(), r);
@@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 }
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
-  return SimdApply(ExpRealFunctor<S>(), r);
+  return SimdApply(ExpFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h
index dd05a4a7..92a968df 100644
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -36,6 +36,7 @@ template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
 {\
     iScalar<obj> ret;\
     ret._internal = func( (z._internal));\
+    std::cout << "Unary "<<#func<<" " << z._internal <<" -> "<< ret._internal <<" "<< typeid(obj).name() <<std::endl; \
     return ret;\
 }\
 template<class obj,int N> inline auto func(const iVector<obj,N> &z) -> iVector<obj,N>\
diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
new file mode 100644
index 00000000..deed6f0a
--- /dev/null
+++ b/tests/core/Test_fft.cc
@@ -0,0 +1,87 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fft.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Test driver: builds exp(-i * sum_mu (2 pi / latt_size[mu]) * momentum[mu] * coor_mu), where coor_mu is
+// filled by LatticeCoordinate, then applies FFT_dim to dimensions 0..3 in turn, printing each stage.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  // Geometry: the whole SIMD width is assigned to the first entry of the layout.
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian    grid(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexD  one(&grid);
+  LatticeComplexD  zz(&grid);
+  LatticeComplexD  field(&grid);
+  LatticeComplexD  transformed(&grid);
+  LatticeComplexD  coor(&grid);
+
+  std::vector<RealD> momentum({1.0,2.0,3.0,2.0});
+
+  // one and zz are filled here but not read again in this test.
+  one = ComplexD(1.0,0.0);
+  zz  = ComplexD(0.0,0.0);
+
+  ComplexD imag_unit(0.0,1.0);
+
+  // Accumulate the (real) phase, one dimension at a time.
+  field=zero;
+  for(int mu=0;mu<4;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    LatticeCoordinate(coor,mu);
+    field = field - TwoPiL * momentum[mu] * coor;
+  }
+
+  std::cout << GridLogMessage<< " C " << field<<std::endl;
+
+  // Multiply the phase by i, then exponentiate.
+  field = field*imag_unit;
+  std::cout << GridLogMessage<< " C " << field<<std::endl;
+
+  field = exp(field);
+  std::cout << GridLogMessage<< " C " << field<<std::endl;
+
+  // Transform dimension by dimension; each pass consumes the output of the previous pass.
+  FFT theFFT(&grid);
+  for(int dim=0;dim<4;dim++){
+    if ( dim>0 ) field=transformed;
+    theFFT.FFT_dim(transformed,field,dim,FFT::forward);
+    std::cout << GridLogMessage<< "FT[C] " << transformed<<std::endl;
+  }
+
+  Grid_finalize();
+}