FFT offload to GPU and MUCH faster comms.

40x speedup on Frontier.
2026-04-03 10:36:10 +01:00 · 2025-08-21 16:44:55 -04:00
parent 76c0ada1e1
commit fe0db53842
8 changed files with 443 additions and 176 deletions
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -28,6 +28,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_

+#ifdef GRID_CUDA
+#include <cufft.h>
+#endif
+
+#ifdef GRID_HIP
+#include <hipfft/hipfft.h>
+#endif
+
 #ifdef HAVE_FFTW
 #if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
@@ -38,85 +46,184 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class scalar> struct FFTW { };
+// Fallback values for the FFTW direction/planner constants, used only when
+// no header has already defined FFTW_FORWARD. They are defined ahead of the
+// FFTW<> specialisations in this file because those initialise their
+// forward/backward members from FFTW_FORWARD / FFTW_BACKWARD.
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#define FFTW_ESTIMATE (0)
+#endif

-#ifdef HAVE_FFTW	
+// Primary template: deliberately empty. The usable interface (FFTW_scalar,
+// FFTW_plan, fftw_plan_many_dft, fftw_execute_dft, fftw_destroy_plan) is
+// supplied by the explicit specialisations for ComplexD and ComplexF.
+template<class scalar> struct FFTW {
+};
+
+#ifdef GRID_HIP
 template<> struct FFTW<ComplexD> {
 public:
+  // hipFFT back end, double precision (Z2Z), presented through the same
+  // FFTW-shaped static interface that class FFT calls.
+  typedef hipfftDoubleComplex FFTW_scalar;
+  typedef hipfftHandle        FFTW_plan;
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+
+  // Create a batched plan for `howmany` transforms of the given rank/sizes.
+  // Only rank, n, the strides/distances and howmany reach hipfftPlanMany:
+  // n is passed in place of inembed/onembed, and in, out, sign and flags
+  // are accepted for signature compatibility but not used.
+  // Asserts if hipFFT does not report HIPFFT_SUCCESS.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,
+				      int istride, int idist,
+				      FFTW_scalar *out, int *onembed,
+				      int ostride, int odist,
+				      int sign, unsigned flags) {
+    FFTW_plan plan;
+    const hipfftResult status = hipfftPlanMany(&plan,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
+    GRID_ASSERT(status==HIPFFT_SUCCESS);
+    return plan;
+  }
+
+  // Run plan p from in to out. sign==forward selects HIPFFT_FORWARD; any
+  // other value selects HIPFFT_BACKWARD. accelerator_barrier() is called
+  // before the returned status is asserted.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
+    const hipfftResult status = hipfftExecZ2Z(p,in,out,direction);
+    accelerator_barrier();
+    GRID_ASSERT(status==HIPFFT_SUCCESS);
+  }
+
+  // Release the plan handle; the hipfftDestroy result is not inspected.
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+template<> struct FFTW<ComplexF> {
+public:
+  // hipFFT back end, single precision (C2C), presented through the same
+  // FFTW-shaped static interface that class FFT calls.
+  typedef hipfftComplex      FFTW_scalar;
+  typedef hipfftHandle       FFTW_plan;
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+
+  // Create a batched plan for `howmany` transforms of the given rank/sizes.
+  // Only rank, n, the strides/distances and howmany reach hipfftPlanMany:
+  // n is passed in place of inembed/onembed, and in, out, sign and flags
+  // are accepted for signature compatibility but not used.
+  // Asserts if hipFFT does not report HIPFFT_SUCCESS.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,
+				      int istride, int idist,
+				      FFTW_scalar *out, int *onembed,
+				      int ostride, int odist,
+				      int sign, unsigned flags) {
+    FFTW_plan plan;
+    const hipfftResult status = hipfftPlanMany(&plan,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
+    GRID_ASSERT(status==HIPFFT_SUCCESS);
+    return plan;
+  }
+
+  // Run plan p from in to out. sign==forward selects HIPFFT_FORWARD; any
+  // other value selects HIPFFT_BACKWARD. accelerator_barrier() is called
+  // before the returned status is asserted.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
+    const hipfftResult status = hipfftExecC2C(p,in,out,direction);
+    accelerator_barrier();
+    GRID_ASSERT(status==HIPFFT_SUCCESS);
+  }
+
+  // Release the plan handle; the hipfftDestroy result is not inspected.
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+#endif
+
+#ifdef GRID_CUDA
+// cuFFT back end, double precision (Z2Z), presented through the FFTW-shaped
+// static interface that class FFT calls.
+template<> struct FFTW<ComplexD> {
+public:
+  static const int forward=FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftDoubleComplex FFTW_scalar;
+  typedef cufftHandle        FFTW_plan;
+
+  // Create a batched plan for `howmany` transforms of the given rank/sizes.
+  // Only rank, n, the strides/distances and howmany reach cufftPlanMany:
+  // n is passed in place of inembed/onembed, and in, out, sign and flags
+  // are accepted for signature compatibility but not used.
+  // Asserts if cuFFT does not report CUFFT_SUCCESS, so a failed plan is
+  // caught here instead of an invalid handle being returned.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
+				      int istride, int idist,		
+				      FFTW_scalar *out, int *onembed,		
+				      int ostride, int odist,		
+				      int sign, unsigned flags) {
+    FFTW_plan p;
+    cufftResult rv = cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
+    GRID_ASSERT(rv==CUFFT_SUCCESS);
+    return p;
+  }	  
+    
+  // Run plan p from in to out. sign==forward selects CUFFT_FORWARD; any
+  // other value selects CUFFT_BACKWARD. accelerator_barrier() is called
+  // before the returned status is asserted.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    cufftResult rv;
+    if ( sign == forward ) rv = cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
+    else                   rv = cufftExecZ2Z(p,in,out,CUFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==CUFFT_SUCCESS);
+  }
+  // Release the plan handle; the cufftDestroy result is not inspected.
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p);
+  }
+};
+// cuFFT back end, single precision (C2C), presented through the FFTW-shaped
+// static interface that class FFT calls.
+template<> struct FFTW<ComplexF> {
+public:
+  static const int forward=FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftComplex FFTW_scalar;
+  typedef cufftHandle        FFTW_plan;
+
+  // Create a batched plan for `howmany` transforms of the given rank/sizes.
+  // Only rank, n, the strides/distances and howmany reach cufftPlanMany:
+  // n is passed in place of inembed/onembed, and in, out, sign and flags
+  // are accepted for signature compatibility but not used.
+  // Asserts if cuFFT does not report CUFFT_SUCCESS, so a failed plan is
+  // caught here instead of an invalid handle being returned.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
+				      int istride, int idist,		
+				      FFTW_scalar *out, int *onembed,		
+				      int ostride, int odist,		
+				      int sign, unsigned flags) {
+    FFTW_plan p;
+    cufftResult rv = cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
+    GRID_ASSERT(rv==CUFFT_SUCCESS);
+    return p;
+  }	  
+    
+  // Run plan p from in to out. sign==forward selects CUFFT_FORWARD; any
+  // other value selects CUFFT_BACKWARD. accelerator_barrier() is called
+  // before the returned status is asserted.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    cufftResult rv;
+    if ( sign == forward ) rv = cufftExecC2C(p,in,out,CUFFT_FORWARD);
+    else                   rv = cufftExecC2C(p,in,out,CUFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==CUFFT_SUCCESS);
+  }
+  // Release the plan handle; the cufftDestroy result is not inspected.
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p);
+  }
+};
+#endif
+
+// CPU back end: thin forwarding wrappers over the FFTW3 library.
+// Compiled only when neither GPU back end is selected. The GRID_HIP and
+// GRID_CUDA sections of this file specialise the same FFTW<ComplexD> and
+// FFTW<ComplexF> templates, so enabling this section alongside one of them
+// would define each specialisation twice.
+#if defined(HAVE_FFTW) && !defined(GRID_HIP) && !defined(GRID_CUDA)
+template<> struct FFTW<ComplexD> {
+public:
  typedef fftw_complex FFTW_scalar;
  typedef fftw_plan    FFTW_plan;
-
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-				      FFTW_scalar *in, const int *inembed,		
+  // Forwards every argument unchanged to ::fftw_plan_many_dft.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
 				      int istride, int idist,		
-				      FFTW_scalar *out, const int *onembed,		
+				      FFTW_scalar *out, int *onembed,		
 				      int ostride, int odist,		
 				      int sign, unsigned flags) {
    return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
  }	  
    
-  static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-    ::fftw_flops(p,add,mul,fmas);
-  }
-
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  // sign is not used here: the direction was already given to
+  // ::fftw_plan_many_dft when the plan was made. The parameter keeps the
+  // signature identical to the GPU back ends.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    ::fftw_execute_dft(p,in,out);
  }
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    ::fftw_destroy_plan(p);
  }
 };
-
 template<> struct FFTW<ComplexF> {
 public:
-
  typedef fftwf_complex FFTW_scalar;
  typedef fftwf_plan    FFTW_plan;
-
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-				      FFTW_scalar *in, const int *inembed,		
+  // Forwards every argument unchanged to ::fftwf_plan_many_dft.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
 				      int istride, int idist,		
-				      FFTW_scalar *out, const int *onembed,		
+				      FFTW_scalar *out, int *onembed,		
 				      int ostride, int odist,		
 				      int sign, unsigned flags) {
    return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
  }	  
    
-  static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-    ::fftwf_flops(p,add,mul,fmas);
-  }
-
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  // sign is not used here: the direction was already given to
+  // ::fftwf_plan_many_dft when the plan was made. The parameter keeps the
+  // signature identical to the GPU back ends.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    ::fftwf_execute_dft(p,in,out);
  }
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    ::fftwf_destroy_plan(p);
  }
 };
-
-#endif
-
-#ifndef FFTW_FORWARD
-#define FFTW_FORWARD (-1)
-#define FFTW_BACKWARD (+1)
 #endif

 class FFT {
 private:
    
-  GridCartesian *vgrid;
-  GridCartesian *sgrid;
-    
-  int Nd;
  double flops;
  double flops_call;
  uint64_t usec;
    
-  Coordinate dimensions;
-  Coordinate processors;
-  Coordinate processor_coor;
-    
 public:
    
  static const int forward=FFTW_FORWARD;
@@ -126,31 +233,25 @@ public:
  double MFlops(void) {return flops/usec;}
  double USec(void)   {return (double)usec;}    

-  FFT ( GridCartesian * grid ) :
-    vgrid(grid),
-    Nd(grid->_ndimension),
-    dimensions(grid->_fdimensions),
-    processors(grid->_processors),
-    processor_coor(grid->_processor_coor)
+  FFT ( GridCartesian * grid ) 
  {
+    // grid is accepted so existing callers keep compiling, but it is not
+    // stored or read here; only the performance counters are reset.
    flops=0;
    usec =0;
-    Coordinate layout(Nd,1);
-    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
+    flops_call=0; // previously left uninitialised until the first transform
  };
    
  ~FFT ( void)  {
-    delete sgrid;
+    // Nothing to release: the sgrid member that used to be deleted here has been removed.
  }
    
  template<class vobj>
  void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){

-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
-    Lattice<vobj> tmp(vgrid);
-    tmp = source;
-    for(int d=0;d<Nd;d++){
+    //    vgrid=result.Grid();
+    //    conformable(result.Grid(),vgrid);
+    //    conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    Lattice<vobj> tmp = source;
+    for(int d=0;d<Ndim;d++){
      if( mask[d] ) {
 	FFT_dim(result,tmp,d,sign);
 	tmp=result;
@@ -160,62 +261,70 @@ public:

  template<class vobj>
  void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
-    Coordinate mask(Nd,1);
+    // Transform along every dimension of source's grid: build an all-ones
+    // mask with one entry per dimension and hand over to FFT_dim_mask.
+    Coordinate mask(source.Grid()->Nd(),1);
    FFT_dim_mask(result,source,mask,sign);
  }


  template<class vobj>
  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
+    // One-dimensional FFT of `source` along direction `dim`, written to `result`.
+    //   result : output lattice; must be conformable with source.
+    //   source : input lattice; its grid supplies all geometry.
+    //   dim    : direction to transform.
+    //   sign   : `forward` or `backward`; anything else hits GRID_ASSERT(0).
+    //            The backward transform is scaled by 1/G.
+    // Steps: gather the full-length (G) pencils of this rank into a flat
+    // buffer using processors[dim] Cshift steps, run one batched transform
+    // in place, copy this rank's slice back into result, apply the scale.
+    // Timings for each stage are reported on GridLogPerformance.
-#ifndef HAVE_FFTW
-    std::cerr << "FFTW is not compiled but is called"<<std::endl;
-    GRID_ASSERT(0);
-#else
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    GridBase *grid = source.Grid();
+    conformable(result.Grid(),source.Grid());

-    int L = vgrid->_ldimensions[dim];
-    int G = vgrid->_fdimensions[dim];
-      
-    Coordinate layout(Nd,1);
-    Coordinate pencil_gd(vgrid->_fdimensions);
-      
-    pencil_gd[dim] = G*processors[dim];
-      
-    // Pencil global vol LxLxGxLxL per node
-    GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
+    int L = grid->_ldimensions[dim];
+    int G = grid->_fdimensions[dim];
      
+    Coordinate layout(Ndim,1);
+    
    // Construct pencils
    typedef typename vobj::scalar_object sobj;
-    typedef typename sobj::scalar_type   scalar;
+    typedef typename vobj::scalar_type   scalar;
+    typedef typename vobj::scalar_type   scalar_type;
+    typedef typename vobj::vector_type   vector_type;
      
-    Lattice<sobj> pgbuf(&pencil_g);
-    autoView(pgbuf_v , pgbuf, CpuWrite);
    //std::cout << "CPU view" << std::endl;
    
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
      
    int Ncomp = sizeof(sobj)/sizeof(scalar);
-    int Nlow  = 1;
+    int64_t Nlow  = 1;
+    int64_t Nhigh = 1;
+
    for(int d=0;d<dim;d++){
-      Nlow*=vgrid->_ldimensions[d];
+      Nlow*=grid->_ldimensions[d];
    }
+    for(int d=dim+1;d<Ndim;d++){
+      Nhigh*=grid->_ldimensions[d];
+    }
+    int64_t Nperp=Nlow*Nhigh;
+    
+    // Layout is [component][perp][dim], dim fastest: the kernels below use
+    // index = g + G*perp + (G*Nperp)*component.
+    deviceVector<scalar> pgbuf;
+    pgbuf.resize(Nperp*Ncomp*G);
+    scalar *pgbuf_v = &pgbuf[0];
      
    int rank = 1;  /* 1d transforms */
    int n[] = {G}; /* 1d transforms of length G */
-    int howmany = Ncomp;
+    int howmany = Ncomp * Nperp;
    int odist,idist,istride,ostride;
-    idist   = odist   = 1;          /* Distance between consecutive FT's */
-    istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+    idist   = odist   = G;            /* Distance between consecutive FT's */
+    istride = ostride = 1;            /* Distance between two elements in the same FT */
    int *inembed = n, *onembed = n;
      
    scalar div;
    if ( sign == backward ) div = 1.0/G;
    else if ( sign == forward ) div = 1.0;
    else GRID_ASSERT(0);
-      
-    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+
+    double t_pencil=0;
+    double t_fft   =0;
+    double t_total =-usecond();
+    //    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    /*
+     * One batched plan: howmany contiguous pencils of length G, G apart.
+     */
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,72 +338,154 @@ public:
    }
      
    // Barrel shift and collect global pencil
-    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
-    Coordinate lcoor(Nd), gcoor(Nd);
+    //    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    Coordinate lcoor(Ndim), gcoor(Ndim);
+    double t_copy=0;
+    double t_shift=0;
+    t_pencil = -usecond();
    result = source;
-    int pc = processor_coor[dim];
+    int pc = grid->_processor_coor[dim];
+
+    const Coordinate ldims = grid->_ldimensions;
+    const Coordinate rdims = grid->_rdimensions;
+    const Coordinate sdims = grid->_simd_layout;
+
+    Coordinate processors = grid->_processors;
+    Coordinate pgdims(Ndim);
+    pgdims[0] = G;
+    for(int d=0, dd=1;d<Ndim;d++){
+      if ( d!=dim ) pgdims[dd++] = ldims[d];
+    }
+    int64_t pgvol=1;
+    for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
+    
+    const int Nsimd = vobj::Nsimd();
    for(int p=0;p<processors[dim];p++) {
+      t_copy-=usecond();
+      autoView(r_v,result,AcceleratorRead);
+      accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
+#ifdef GRID_SIMT
      {
-	autoView(r_v,result,CpuRead);
-	autoView(p_v,pgbuf,CpuWrite);
-	thread_for(idx, sgrid->lSites(),{
-          Coordinate cbuf(Nd);
-          sobj s;
-	  sgrid->LocalIndexToLocalCoor(idx,cbuf);
-	  peekLocalSite(s,r_v,cbuf);
-	  cbuf[dim]+=((pc+p) % processors[dim])*L;
-	  pokeLocalSite(s,p_v,cbuf);
-        });
+	int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	// Sized explicitly, as in the write-back kernel below: pgcoor is
+	// filled by direct indexing and never passed through CoorFromIndex.
+	Coordinate icoor(Ndim);
+	Coordinate ocoor(Ndim);
+	Coordinate pgcoor(Ndim);
+
+	Lexicographic::CoorFromIndex(icoor,lane,sdims);
+	Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+
+	pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
+	for(int d=0,dd=1;d<Ndim;d++){
+	  if ( d!=dim ) {
+	    pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+	    dd++;
+	  }
+	}
+
+	// Map coordinates in lattice layout to FFTW index
+	int64_t pgidx;
+	Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+
+	vector_type *from = (vector_type *)&r_v[idx];
+	scalar_type stmp;
+	for(int w=0;w<Ncomp;w++){
+	  int64_t pg_idx = pgidx + w*pgvol;
+	  stmp = getlane(from[w], lane);
+	  pgbuf_v[pg_idx] = stmp;
+	}
+#ifdef GRID_SIMT
      }
+#else
+      }
+#endif
+      });
+
+      t_copy+=usecond();
      if (p != processors[dim] - 1) {
-	result = Cshift(result,dim,L);
+	Lattice<vobj> temp(grid);
+	t_shift-=usecond();
+	temp = Cshift(result,dim,L); result = temp;
+	t_shift+=usecond();
      }
    }
+    t_pencil += usecond();
      
-    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
-    // Loop over orthog coords
-    int NN=pencil_g.lSites();
-    GridStopWatch timer;
-    timer.Start();
-    thread_for( idx,NN,{
-        Coordinate cbuf(Nd);
-	pencil_g.LocalIndexToLocalCoor(idx, cbuf);
-	if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-	  FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
-	  FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
-	  FFTW<scalar>::fftw_execute_dft(p,in,out);
-	}
-    });
-    timer.Stop();
-      
+    FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
+    FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
+    t_fft = -usecond();
+    FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
+    t_fft += usecond();
+    
    // performance counting
-    double add,mul,fma;
-    FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-    flops_call = add+mul+2.0*fma;
-    usec += timer.useconds();
-    flops+= flops_call*NN;
-      
-    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
-    // writing out result
+    // flops_call is an estimate, 5*N*log2(N) per length-G transform.
+    // usec and flops accumulate over calls (the constructor zeroes them),
+    // so MFlops()/USec() cover a multi-dimension transform, not just its last step.
+    flops_call = 5.0*howmany*G*log2(G);
+    usec += t_fft;
+    flops+= flops_call;
+
+    result = Zero();
+    
+    double t_insert = -usecond();
    {
-      autoView(pgbuf_v,pgbuf,CpuRead);
-      autoView(result_v,result,CpuWrite);
-      thread_for(idx,sgrid->lSites(),{
-	Coordinate clbuf(Nd), cgbuf(Nd);
-	sobj s;
-	sgrid->LocalIndexToLocalCoor(idx,clbuf);
-	cgbuf = clbuf;
-	cgbuf[dim] = clbuf[dim]+L*pc;
-	peekLocalSite(s,pgbuf_v,cgbuf);
-	pokeLocalSite(s,result_v,clbuf);
+      autoView(r_v,result,AcceleratorWrite);
+      accelerator_for(idx,grid->oSites(),Nsimd,{
+#ifdef GRID_SIMT
+      {
+	int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	Coordinate icoor(Ndim);
+	Coordinate ocoor(Ndim);
+	Coordinate pgcoor(Ndim);
+
+	Lexicographic::CoorFromIndex(icoor,lane,sdims);
+	Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+
+	pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
+	for(int d=0,dd=1;d<Ndim;d++){
+	  if ( d!=dim ) {
+	    pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+	    dd++;
+	  }
+	}
+	// Map coordinates in lattice layout to FFTW index
+	int64_t pgidx;
+	Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+
+	vector_type *to = (vector_type *)&r_v[idx];
+	scalar_type stmp;
+	for(int w=0;w<Ncomp;w++){
+	  int64_t pg_idx = pgidx + w*pgvol;
+	  stmp = pgbuf_v[pg_idx];
+	  putlane(to[w], stmp, lane);
+	}
+	
+#ifdef GRID_SIMT
+      }
+#else
+      }
+#endif
      });
    }
+
    result = result*div;
-      
-    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+
+    t_insert +=usecond();
+    
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
-#endif
+
+    t_total +=usecond();
+
+    std::cout <<GridLogPerformance<< " FFT took   "<<t_total/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< "  of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< "  of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT insert  "<<t_insert/1.0e6 <<" s" << std::endl;
+    
  }
 };