FFT: cache plans per vobj type across calls

Plans are created lazily on the first FFT_dim call and reused for all subsequent calls on the same FFT object. PlanCreate<vobj>() can be called explicitly to pre-warm the cache. PlanDestroy() must be called before switching to a different vobj type; the destructor cleans up any live plans automatically. Update Test_fft.cc and Test_fftf.cc to call PlanDestroy() between the LatticeComplex and LatticeSpinMatrix sections that reuse the same FFT object. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-20 10:33:27 +01:00 · 2026-05-19 15:12:10 -04:00
parent b6abdc3845
commit 1e29c59bcc
3 changed files with 250 additions and 258 deletions
@@ -1,6 +1,6 @@
 /*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/Cshift.h
@@ -28,6 +28,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 #include <any>
 #include <functional>
 #include <typeindex>
 #ifdef GRID_CUDA
 #include <cufft.h>
 #endif
@@ -65,17 +69,16 @@ public:
  typedef hipfftDoubleComplex FFTW_scalar;
  typedef hipfftHandle        FFTW_plan;
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
-				      FFTW_scalar *in, int *inembed,		
+				      FFTW_scalar *in, int *inembed,
-				      int istride, int idist,		
+				      int istride, int idist,
-				      FFTW_scalar *out, int *onembed,		
+				      FFTW_scalar *out, int *onembed,
-				      int ostride, int odist,		
+				      int ostride, int odist,
 				      int sign, unsigned flags) {
    FFTW_plan p;
    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
    return p;
-  }	  
+  }
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    hipfftResult rv;
    if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
@@ -83,29 +86,25 @@ public:
    accelerator_barrier();
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
  }
-  inline static void fftw_destroy_plan(const FFTW_plan p) {
+  inline static void fftw_destroy_plan(const FFTW_plan p) { hipfftDestroy(p); }
    hipfftDestroy(p);
  }
 };
 template<> struct FFTW<ComplexF> {
 public:
  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;
-  typedef hipfftComplex      FFTW_scalar;
+  typedef hipfftComplex FFTW_scalar;
-  typedef hipfftHandle        FFTW_plan;
+  typedef hipfftHandle  FFTW_plan;
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
-				      FFTW_scalar *in, int *inembed,		
+				      FFTW_scalar *in, int *inembed,
-				      int istride, int idist,		
+				      int istride, int idist,
-				      FFTW_scalar *out, int *onembed,		
+				      FFTW_scalar *out, int *onembed,
-				      int ostride, int odist,		
+				      int ostride, int odist,
 				      int sign, unsigned flags) {
    FFTW_plan p;
    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
    return p;
-  }	  
+  }
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    hipfftResult rv;
    if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
@@ -113,9 +112,7 @@ public:
    accelerator_barrier();
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
  }
-  inline static void fftw_destroy_plan(const FFTW_plan p) {
+  inline static void fftw_destroy_plan(const FFTW_plan p) { hipfftDestroy(p); }
    hipfftDestroy(p);
  }
 };
 #endif
@@ -126,53 +123,45 @@ public:
  static const int backward=FFTW_BACKWARD;
  typedef cufftDoubleComplex FFTW_scalar;
  typedef cufftHandle        FFTW_plan;
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
-				      FFTW_scalar *in, int *inembed,		
+				      FFTW_scalar *in, int *inembed,
-				      int istride, int idist,		
+				      int istride, int idist,
-				      FFTW_scalar *out, int *onembed,		
+				      FFTW_scalar *out, int *onembed,
-				      int ostride, int odist,		
+				      int ostride, int odist,
 				      int sign, unsigned flags) {
    FFTW_plan p;
    cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
    return p;
-  }	  
+  }
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
    else                   cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
    accelerator_barrier();
  }
-  inline static void fftw_destroy_plan(const FFTW_plan p) {
+  inline static void fftw_destroy_plan(const FFTW_plan p) { cufftDestroy(p); }
    cufftDestroy(p);
  }
 };
 template<> struct FFTW<ComplexF> {
 public:
  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;
  typedef cufftComplex FFTW_scalar;
-  typedef cufftHandle        FFTW_plan;
+  typedef cufftHandle  FFTW_plan;
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
-				      FFTW_scalar *in, int *inembed,		
+				      FFTW_scalar *in, int *inembed,
-				      int istride, int idist,		
+				      int istride, int idist,
-				      FFTW_scalar *out, int *onembed,		
+				      FFTW_scalar *out, int *onembed,
-				      int ostride, int odist,		
+				      int ostride, int odist,
 				      int sign, unsigned flags) {
    FFTW_plan p;
    cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
    return p;
-  }	  
+  }
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
    else                   cufftExecC2C(p,in,out,CUFFT_INVERSE);
    accelerator_barrier();
  }
-  inline static void fftw_destroy_plan(const FFTW_plan p) {
+  inline static void fftw_destroy_plan(const FFTW_plan p) { cufftDestroy(p); }
    cufftDestroy(p);
  }
 };
 #endif
@@ -183,313 +172,314 @@ public:
  typedef fftw_complex FFTW_scalar;
  typedef fftw_plan    FFTW_plan;
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
-				      FFTW_scalar *in, int *inembed,		
+				      FFTW_scalar *in, int *inembed,
-				      int istride, int idist,		
+				      int istride, int idist,
-				      FFTW_scalar *out, int *onembed,		
+				      FFTW_scalar *out, int *onembed,
-				      int ostride, int odist,		
+				      int ostride, int odist,
 				      int sign, unsigned flags) {
    return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
-  }	  
+  }
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    ::fftw_execute_dft(p,in,out);
  }
-  inline static void fftw_destroy_plan(const FFTW_plan p) {
+  inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftw_destroy_plan(p); }
    ::fftw_destroy_plan(p);
  }
 };
 template<> struct FFTW<ComplexF> {
 public:
  typedef fftwf_complex FFTW_scalar;
  typedef fftwf_plan    FFTW_plan;
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
-				      FFTW_scalar *in, int *inembed,		
+				      FFTW_scalar *in, int *inembed,
-				      int istride, int idist,		
+				      int istride, int idist,
-				      FFTW_scalar *out, int *onembed,		
+				      FFTW_scalar *out, int *onembed,
-				      int ostride, int odist,		
+				      int ostride, int odist,
 				      int sign, unsigned flags) {
    return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
-  }	  
+  }
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    ::fftwf_execute_dft(p,in,out);
  }
-  inline static void fftw_destroy_plan(const FFTW_plan p) {
+  inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftwf_destroy_plan(p); }
    ::fftwf_destroy_plan(p);
  }
 };
 #endif
 #endif
 class FFT {
 private:
-    
+
-  double flops;
+  double   flops;
-  double flops_call;
+  double   flops_call;
  uint64_t usec;
-    
+  GridCartesian *_grid;
 public:
  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;
  double Flops(void) {return flops;}
  double MFlops(void) {return flops/usec;}
  double USec(void)   {return (double)usec;}    
-  FFT ( GridCartesian * grid ) 
+  // Type-erased plan entry.  The handle is recovered via
-  {
+  // std::any_cast<FFTW<scalar>::FFTW_plan> inside FFT_dim, which knows the
-    flops=0;
+  // scalar type at compile time.
-    usec =0;
+  struct PlanEntry {
    std::any              handle;
    std::function<void()> destroy;
  };
  ~FFT ( void)  {
    //    delete sgrid;
  }
  template<class vobj>
  void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
-    //    vgrid=result.Grid();
+  std::vector<PlanEntry> forward_plans;   // size Nd when populated, 0 otherwise
-    //    conformable(result.Grid(),vgrid);
+  std::vector<PlanEntry> backward_plans;
-    //    conformable(source.Grid(),vgrid);
+  std::type_index        _plan_type { typeid(void) }; // vobj type plans were built for
 public:
  static const int forward  = FFTW_FORWARD;
  static const int backward = FFTW_BACKWARD;
  double Flops(void)  { return flops; }
  double MFlops(void) { return flops / usec; }
  double USec(void)   { return (double)usec; }
  FFT(GridCartesian *grid) : _grid(grid), flops(0), usec(0) {}
  ~FFT() {
    if (forward_plans.size() > 0) PlanDestroy();
  }
  // Explicitly pre-create and cache plans for all Nd dimensions.
  // Optional: FFT_dim will call this lazily on first use if not called.
  // Asserts that no plans already exist; call PlanDestroy first to re-create.
  template<class vobj>
  void PlanCreate() {
    GRID_ASSERT(forward_plans.size() == 0);
    typedef typename vobj::scalar_type   scalar;
    typedef typename vobj::scalar_object sobj;
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
    const int Ndim = _grid->Nd();
    forward_plans.resize(Ndim);
    backward_plans.resize(Ndim);
    for (int d = 0; d < Ndim; d++) {
      int G     = _grid->_fdimensions[d];
      int Ncomp = sizeof(sobj) / sizeof(scalar);
      int64_t Nperp = 1;
      for (int dd = 0; dd < Ndim; dd++)
        if (dd != d) Nperp *= _grid->_ldimensions[dd];
      int howmany = Ncomp * (int)Nperp;
      int n[] = {G};
      // GPU backends (cuFFT/hipFFT) ignore the buffer pointer at plan creation.
      // CPU FFTW with FFTW_ESTIMATE inspects only alignment and never touches data.
      deviceVector<scalar> dummy(2);
      FFTW_scalar *buf = (FFTW_scalar *)&dummy[0];
      {
        FFTW_plan p = FFTW<scalar>::fftw_plan_many_dft(
            1, n, howmany, buf, n, 1, G, buf, n, 1, G, FFTW_FORWARD, FFTW_ESTIMATE);
        forward_plans[d] = { p, [p](){ FFTW<scalar>::fftw_destroy_plan(p); } };
      }
      {
        FFTW_plan p = FFTW<scalar>::fftw_plan_many_dft(
            1, n, howmany, buf, n, 1, G, buf, n, 1, G, FFTW_BACKWARD, FFTW_ESTIMATE);
        backward_plans[d] = { p, [p](){ FFTW<scalar>::fftw_destroy_plan(p); } };
      }
    }
    _plan_type = std::type_index(typeid(vobj));
  }
  void PlanDestroy() {
    for (auto &e : forward_plans)  e.destroy();
    for (auto &e : backward_plans) e.destroy();
    forward_plans.resize(0);
    backward_plans.resize(0);
    _plan_type = std::type_index(typeid(void));
  }
  template<class vobj>
  void FFT_dim_mask(Lattice<vobj> &result, const Lattice<vobj> &source, Coordinate mask, int sign) {
    const int Ndim = source.Grid()->Nd();
    Lattice<vobj> tmp = source;
-    for(int d=0;d<Ndim;d++){
+    for (int d = 0; d < Ndim; d++) {
-      if( mask[d] ) {
+      if (mask[d]) {
-	FFT_dim(result,tmp,d,sign);
+        FFT_dim(result, tmp, d, sign);
-	tmp=result;
+        tmp = result;
      }
    }
  }
  template<class vobj>
-  void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
+  void FFT_all_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int sign) {
    const int Ndim = source.Grid()->Nd();
-    Coordinate mask(Ndim,1);
+    Coordinate mask(Ndim, 1);
-    FFT_dim_mask(result,source,mask,sign);
+    FFT_dim_mask(result, source, mask, sign);
  }
  template<class vobj>
-  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
+  void FFT_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int dim, int sign) {
    const int Ndim = source.Grid()->Nd();
    GridBase *grid = source.Grid();
-    conformable(result.Grid(),source.Grid());
+    conformable(result.Grid(), source.Grid());
    int L = grid->_ldimensions[dim];
    int G = grid->_fdimensions[dim];
-      
+
    Coordinate layout(Ndim,1);
    // Construct pencils
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type   scalar;
    typedef typename vobj::scalar_type   scalar_type;
    typedef typename vobj::vector_type   vector_type;
    //std::cout << "CPU view" << std::endl;
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
    int Ncomp = sizeof(sobj)/sizeof(scalar);
    int64_t Nlow  = 1;
    int64_t Nhigh = 1;
-    for(int d=0;d<dim;d++){
+    typedef typename FFTW<scalar_type>::FFTW_scalar FFTW_scalar;
-      Nlow*=grid->_ldimensions[d];
+    typedef typename FFTW<scalar_type>::FFTW_plan   FFTW_plan;
-    }
+
-    for(int d=dim+1;d<Ndim;d++){
+    int     Ncomp  = sizeof(sobj) / sizeof(scalar_type);
-      Nhigh*=grid->_ldimensions[d];
+    int64_t Nlow   = 1;
-    }
+    int64_t Nhigh  = 1;
-    int64_t Nperp=Nlow*Nhigh;
+    for (int d = 0; d < dim;   d++) Nlow  *= grid->_ldimensions[d];
-    
+    for (int d = dim+1; d < Ndim; d++) Nhigh *= grid->_ldimensions[d];
-    deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
+    int64_t Nperp = Nlow * Nhigh;
-    pgbuf.resize(Nperp*Ncomp*G);
+
-    scalar *pgbuf_v = &pgbuf[0];
+    deviceVector<scalar_type> pgbuf(Nperp * Ncomp * G); // [perp][component][dim]
-      
+    scalar_type *pgbuf_v = &pgbuf[0];
-    int rank = 1;  /* 1d transforms */
+
-    int n[] = {G}; /* 1d transforms of length G */
+    int rank    = 1;
    int n[]     = {G};
    int howmany = Ncomp * Nperp;
-    int odist,idist,istride,ostride;
+    int idist = G, odist = G, istride = 1, ostride = 1;
    idist   = odist   = G;            /* Distance between consecutive FT's */
    istride = ostride = 1;            /* Distance between two elements in the same FT */
    int *inembed = n, *onembed = n;
-      
+
-    scalar div;
+    scalar_type div;
-    if ( sign == backward ) div = 1.0/G;
+    if      (sign == backward) div = 1.0 / G;
-    else if ( sign == forward ) div = 1.0;
+    else if (sign == forward)  div = 1.0;
    else GRID_ASSERT(0);
-    double t_pencil=0;
+    // Populate cache on first call; subsequent calls check type consistency.
-    double t_fft   =0;
+    if (forward_plans.size() == 0) PlanCreate<vobj>();
-    double t_total =-usecond();
+    GRID_ASSERT(forward_plans.size() == (size_t)Ndim);
-    //    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    GRID_ASSERT(std::type_index(typeid(vobj)) == _plan_type);
-    /*
+
-     *
+    auto &plans = (sign == forward) ? forward_plans : backward_plans;
-     */
+    FFTW_plan p = std::any_cast<FFTW_plan>(plans[dim].handle);
-    FFTW_plan p;
+
-    {
+    double t_pencil = 0;
-      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
+    double t_fft    = 0;
-      FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
+    double t_copy   = 0;
-      p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
+    double t_shift  = 0;
-					   in,inembed,
+    double t_total  = -usecond();
-					   istride,idist,
+
-					   out,onembed,
+    // Barrel-shift gather: accumulate global pencil into pgbuf
 					   ostride, odist,
 					   sign,FFTW_ESTIMATE);
    }
    // Barrel shift and collect global pencil
    //    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
    Coordinate lcoor(Ndim), gcoor(Ndim);
    double t_copy=0;
    double t_shift=0;
    t_pencil = -usecond();
    result = source;
    int pc = grid->_processor_coor[dim];
    const Coordinate ldims = grid->_ldimensions;
    const Coordinate rdims = grid->_rdimensions;
    const Coordinate sdims = grid->_simd_layout;
    Coordinate processors  = grid->_processors;
    Coordinate processors = grid->_processors;
    Coordinate pgdims(Ndim);
    pgdims[0] = G;
-    for(int d=0, dd=1;d<Ndim;d++){
+    for (int d = 0, dd = 1; d < Ndim; d++)
-      if ( d!=dim ) pgdims[dd++] = ldims[d];
+      if (d != dim) pgdims[dd++] = ldims[d];
-    }
+    int64_t pgvol = 1;
-    int64_t pgvol=1;
+    for (int d = 0; d < Ndim; d++) pgvol *= pgdims[d];
-    for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
+
    const int Nsimd = vobj::Nsimd();
-    for(int p=0;p<processors[dim];p++) {
+    t_pencil = -usecond();
-      t_copy-=usecond();
+    for (int p_idx = 0; p_idx < processors[dim]; p_idx++) {
-      autoView(r_v,result,AcceleratorRead);
+      t_copy -= usecond();
      autoView(r_v, result, AcceleratorRead);
      accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
 #ifdef GRID_SIMT
-      {
+        {
-	int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+          int lane = acceleratorSIMTlane(Nsimd);
 #else
-      for(int lane=0;lane<Nsimd;lane++) {
+        for (int lane = 0; lane < Nsimd; lane++) {
 #endif
-	Coordinate icoor;
+          Coordinate icoor, ocoor, pgcoor;
-	Coordinate ocoor;
+          Lexicographic::CoorFromIndex(icoor, lane, sdims);
-	Coordinate pgcoor;
+          Lexicographic::CoorFromIndex(ocoor, idx,  rdims);
-	Lexicographic::CoorFromIndex(icoor,lane,sdims);
+          pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p_idx)%processors[dim])*L;
-	Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+          for (int d = 0, dd = 1; d < Ndim; d++) {
            if (d != dim) { pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d]; dd++; }
          }
          int64_t pgidx;
          Lexicographic::IndexFromCoor(pgcoor, pgidx, pgdims);
-	pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
+          vector_type   *from = (vector_type *)&r_v[idx];
-	for(int d=0,dd=1;d<Ndim;d++){
+          scalar_type    stmp;
-	  if ( d!=dim ) {
+          for (int w = 0; w < Ncomp; w++) {
-	    pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+            stmp = getlane(from[w], lane);
-	    dd++;
+            pgbuf_v[pgidx + w*pgvol] = stmp;
-	  }
+          }
 	}
 	// Map coordinates in lattice layout to FFTW index
 	int64_t pgidx;
 	Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
 	vector_type *from = (vector_type *)&r_v[idx];
 	scalar_type stmp;
 	for(int w=0;w<Ncomp;w++){
 	  int64_t pg_idx = pgidx + w*pgvol;
 	  stmp = getlane(from[w], lane);
 	  pgbuf_v[pg_idx] = stmp;
 	}
 #ifdef GRID_SIMT
-      }
+        }
 #else
-      }
+        }
 #endif
      });
      t_copy += usecond();
-      t_copy+=usecond();
+      if (p_idx != processors[dim] - 1) {
-      if (p != processors[dim] - 1) {
+        Lattice<vobj> temp(grid);
-	Lattice<vobj> temp(grid);
+        t_shift -= usecond();
-	t_shift-=usecond();
+        temp = Cshift(result, dim, L); result = temp;
-	temp = Cshift(result,dim,L); result = temp;
+        t_shift += usecond();
 	t_shift+=usecond();
      }
    }
    t_pencil += usecond();
-      
+
-    FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
+    FFTW_scalar *in  = (FFTW_scalar *)pgbuf_v;
-    FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
+    FFTW_scalar *out = (FFTW_scalar *)pgbuf_v;
    t_fft = -usecond();
-    FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
+    FFTW<scalar_type>::fftw_execute_dft(p, in, out, sign);
    t_fft += usecond();
-    
+
-    // performance counting
+    flops_call = 5.0 * howmany * G * log2(G);
-    flops_call = 5.0*howmany*G*log2(G);
+    usec  = t_fft;
-    usec = t_fft;
+    flops = flops_call;
    flops= flops_call;
    result = Zero();
    double t_insert = -usecond();
    {
-      autoView(r_v,result,AcceleratorWrite);
+      autoView(r_v, result, AcceleratorWrite);
-      accelerator_for(idx,grid->oSites(),Nsimd,{
+      accelerator_for(idx, grid->oSites(), Nsimd, {
 #ifdef GRID_SIMT
-      {
+        {
-	int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+          int lane = acceleratorSIMTlane(Nsimd);
 #else
-      for(int lane=0;lane<Nsimd;lane++) {
+        for (int lane = 0; lane < Nsimd; lane++) {
 #endif
-	Coordinate icoor(Ndim);
+          Coordinate icoor(Ndim), ocoor(Ndim), pgcoor(Ndim);
-	Coordinate ocoor(Ndim);
+          Lexicographic::CoorFromIndex(icoor, lane, sdims);
-	Coordinate pgcoor(Ndim);
+          Lexicographic::CoorFromIndex(ocoor, idx,  rdims);
-	Lexicographic::CoorFromIndex(icoor,lane,sdims);
+          pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
-	Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+          for (int d = 0, dd = 1; d < Ndim; d++) {
            if (d != dim) { pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d]; dd++; }
          }
          int64_t pgidx;
          Lexicographic::IndexFromCoor(pgcoor, pgidx, pgdims);
-	pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
+          vector_type *to   = (vector_type *)&r_v[idx];
-	for(int d=0,dd=1;d<Ndim;d++){
+          scalar_type  stmp;
-	  if ( d!=dim ) {
+          for (int w = 0; w < Ncomp; w++) {
-	    pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+            stmp = pgbuf_v[pgidx + w*pgvol];
-	    dd++;
+            putlane(to[w], stmp, lane);
-	  }
+          }
 	}
 	// Map coordinates in lattice layout to FFTW index
 	int64_t pgidx;
 	Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
 	vector_type *to = (vector_type *)&r_v[idx];
 	scalar_type stmp;
 	for(int w=0;w<Ncomp;w++){
 	  int64_t pg_idx = pgidx + w*pgvol;
 	  stmp = pgbuf_v[pg_idx];
 	  putlane(to[w], stmp, lane);
 	}
 #ifdef GRID_SIMT
-      }
+        }
 #else
-      }
+        }
 #endif
      });
    }
-    result = result*div;
+    result   = result * div;
    t_insert += usecond();
    t_total  += usecond();
-    t_insert +=usecond();
+    std::cout << GridLogPerformance << " FFT took    " << t_total/1.0e6  << " s" << std::endl;
-    
+    std::cout << GridLogPerformance << " FFT pencil  " << t_pencil/1.0e6 << " s" << std::endl;
-    // destroying plan
+    std::cout << GridLogPerformance << "  of which copy  " << t_copy/1.0e6  << " s" << std::endl;
-    FFTW<scalar>::fftw_destroy_plan(p);
+    std::cout << GridLogPerformance << "  of which shift " << t_shift/1.0e6 << " s" << std::endl;
-
+    std::cout << GridLogPerformance << " FFT kernels " << t_fft/1.0e6    << " s" << std::endl;
-    t_total +=usecond();
+    std::cout << GridLogPerformance << " FFT insert  " << t_insert/1.0e6 << " s" << std::endl;
    std::cout <<GridLogPerformance<< " FFT took   "<<t_total/1.0e6 <<" s" << std::endl;
    std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
    std::cout <<GridLogPerformance<< "  of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
    std::cout <<GridLogPerformance<< "  of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
    std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
    std::cout <<GridLogPerformance<< " FFT insert  "<<t_insert/1.0e6 <<" s" << std::endl;
  }
 };
@@ -113,6 +113,7 @@ int main (int argc, char ** argv)
  Cref= Cref - C;
  std::cout << " invertible check " << norm2(Cref)<<std::endl;
  theFFT.PlanDestroy();
  Stilde=S;
  std::cout<<" Benchmarking FFT of LatticeSpinMatrix  "<<std::endl;
  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
@@ -95,6 +95,7 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
  theFFT.PlanDestroy();
  Stilde = S;
  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;