diff --git a/configure.ac b/configure.ac
index a6658a96..90764cb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in
         AC_DEFINE([AVX1],[1],[AVX intrinsics])
         SIMD_FLAGS='-mavx -xavx';;
       AVXFMA)
-        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
-        SIMD_FLAGS='-mavx -mfma';;
+        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
+        SIMD_FLAGS='-mavx -fma';;
       AVX2)
         AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
         SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
diff --git a/lib/FFT.h b/lib/FFT.h
index b5b31d82..240f338b 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -244,7 +244,10 @@ namespace Grid {
             pokeLocalSite(s,pgbuf,cbuf);
           }
         }
-        result = Cshift(result,dim,L);
+        if (p != processors[dim] - 1)
+        {
+          result = Cshift(result,dim,L);
+        }
       }
       
       // Loop over orthog coords
@@ -287,10 +290,10 @@ namespace Grid {
           cgbuf = clbuf;
           cgbuf[dim] = clbuf[dim]+L*pc;
           peekLocalSite(s,pgbuf,cgbuf);
-          s = s * div;
           pokeLocalSite(s,result,clbuf);
         }
       }
+      result = result*div;
       
       // destroying plan
       FFTW<scalar>::fftw_destroy_plan(p);
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index f50eae2b..36360102 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -167,7 +167,7 @@ namespace Optimization {
     }
     //Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
           __m128i a0,a1;
           __m128i b0,b1;
           a0 = _mm256_extractf128_si256(a,0);
@@ -195,7 +195,7 @@ namespace Optimization {
     }
     //Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
           __m128i a0,a1;
           __m128i b0,b1;
           a0 = _mm256_extractf128_si256(a,0);
@@ -216,7 +216,7 @@ namespace Optimization {
   struct MultComplex{
     // Complex float
     inline __m256 operator()(__m256 a, __m256 b){
-#if defined (AVX1) 
+#if defined (AVX1)
       __m256 ymm0,ymm1,ymm2;
       ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
@@ -233,7 +233,7 @@ namespace Optimization {
       a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
       return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2)  || defined (AVXFMA)
       __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
       __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
       a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -264,7 +264,7 @@ namespace Optimization {
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
       */
-#if defined (AVX1) 
+#if defined (AVX1)
       __m256d ymm0,ymm1,ymm2;
       ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
       ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
@@ -279,7 +279,7 @@ namespace Optimization {
       a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
       return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
       __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
       a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -320,7 +320,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
       a= _mm256_macc_ps(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       a= _mm256_fmadd_ps( b, c, a);
 #endif
     }
@@ -332,7 +332,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
       a= _mm256_macc_pd(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       a= _mm256_fmadd_pd( b, c, a);
 #endif
     }
@@ -347,7 +347,7 @@ namespace Optimization {
     }
     // Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) 
+#if defined (AVX1) || defined (AVXFMA)
       __m128i a0,a1;
       __m128i b0,b1;
       a0 = _mm256_extractf128_si256(a,0);
diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 07933f52..bc86291d 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -244,7 +244,22 @@ namespace Optimization {
       return a*b;
     }
   };
-  
+
+  struct Div{ // Division functor: one operator() overload per operand type
+    // Real double: delegates to the vec_swdiv intrinsic (presumably a per-element a/b -- TODO confirm against the XL compiler docs)
+    inline vector4double operator()(vector4double a, vector4double b){
+      return vec_swdiv(a, b);
+    }
+
+    // Real float: overload generated by the FLOAT_WRAP_2 macro (macro body not visible here; presumably wraps the double version -- TODO confirm)
+    FLOAT_WRAP_2(operator(), inline)
+
+    // Integer: plain scalar int division (truncates toward zero; b == 0 is not checked)
+    inline int operator()(int a, int b){
+      return a/b;
+    }
+  };
+
   struct Conj{
     // Complex double
     inline vector4double operator()(vector4double v){
@@ -413,6 +428,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
+typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 184baad9..080dd5c0 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -44,7 +44,7 @@ directory
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
+#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 189f0559..92f9bcd8 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -50,6 +50,12 @@ public:
   template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
   std::string name(void) const { return std::string("Times"); }
 };
+class funcDivide { // Test functor for operator/, passed to Tester<...>() in main
+public:
+  funcDivide() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;} // stores the quotient i1/i2 into rr
+  std::string name(void) const { return std::string("Divide"); } // label identifying this functor
+};
 class funcConj {
 public:
   funcConj() {};
@@ -341,6 +347,7 @@ int main (int argc, char ** argv)
   Tester<RealF,vRealF>(funcPlus());
   Tester<RealF,vRealF>(funcMinus());
   Tester<RealF,vRealF>(funcTimes());
+  Tester<RealF,vRealF>(funcDivide());
   Tester<RealF,vRealF>(funcAdj());
   Tester<RealF,vRealF>(funcConj());
   Tester<RealF,vRealF>(funcInnerProduct());
@@ -371,6 +378,7 @@ int main (int argc, char ** argv)
   Tester<RealD,vRealD>(funcPlus());
   Tester<RealD,vRealD>(funcMinus());
   Tester<RealD,vRealD>(funcTimes());
+  Tester<RealD,vRealD>(funcDivide());
   Tester<RealD,vRealD>(funcAdj());
   Tester<RealD,vRealD>(funcConj());
   Tester<RealD,vRealD>(funcInnerProduct());
diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc
index 4eb4398d..22838f7b 100644
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
   for(int mu=0;mu<4;mu++){
     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
     LatticeCoordinate(coor,mu);
-    C = C - (TwoPiL * p[mu]) * coor;
+    C = C + (TwoPiL * p[mu]) * coor;
   }
 
   C = exp(C*ci);
@@ -78,10 +78,11 @@ int main (int argc, char ** argv)
 
   FFT theFFT(&Fine);
 
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
+  Ctilde = C;
+  theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 
   //  C=zero;
   //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@@ -93,10 +94,11 @@ int main (int argc, char ** argv)
   C=C-Ctilde;
   std::cout << "diff scalar "<<norm2(C) << std::endl;
 
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
+  Stilde = S;
+  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 
   SpinMatrixF Sp; 
   Sp = zero; Sp = Sp+cVol;