mirror of
https://github.com/paboyle/Grid.git
synced 2026-05-25 11:34:16 +01:00
FFT: cache plans per vobj type across calls
Plans are created lazily on the first FFT_dim call and reused for all subsequent calls on the same FFT object. PlanCreate<vobj>() can be called explicitly to pre-warm the cache. PlanDestroy() must be called before switching to a different vobj type; the destructor cleans up any live plans automatically. Update Test_fft.cc and Test_fftf.cc to call PlanDestroy() between the LatticeComplex and LatticeSpinMatrix sections that reuse the same FFT object. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+248
-258
@@ -1,6 +1,6 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
Source file: ./lib/Cshift.h
|
Source file: ./lib/Cshift.h
|
||||||
|
|
||||||
@@ -28,6 +28,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef _GRID_FFT_H_
|
#ifndef _GRID_FFT_H_
|
||||||
#define _GRID_FFT_H_
|
#define _GRID_FFT_H_
|
||||||
|
|
||||||
|
#include <any>
|
||||||
|
#include <functional>
|
||||||
|
#include <typeindex>
|
||||||
|
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
#include <cufft.h>
|
#include <cufft.h>
|
||||||
#endif
|
#endif
|
||||||
@@ -65,17 +69,16 @@ public:
|
|||||||
typedef hipfftDoubleComplex FFTW_scalar;
|
typedef hipfftDoubleComplex FFTW_scalar;
|
||||||
typedef hipfftHandle FFTW_plan;
|
typedef hipfftHandle FFTW_plan;
|
||||||
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
||||||
FFTW_scalar *in, int *inembed,
|
FFTW_scalar *in, int *inembed,
|
||||||
int istride, int idist,
|
int istride, int idist,
|
||||||
FFTW_scalar *out, int *onembed,
|
FFTW_scalar *out, int *onembed,
|
||||||
int ostride, int odist,
|
int ostride, int odist,
|
||||||
int sign, unsigned flags) {
|
int sign, unsigned flags) {
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
|
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
|
||||||
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
||||||
hipfftResult rv;
|
hipfftResult rv;
|
||||||
if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
|
if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
|
||||||
@@ -83,29 +86,25 @@ public:
|
|||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
||||||
}
|
}
|
||||||
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
inline static void fftw_destroy_plan(const FFTW_plan p) { hipfftDestroy(p); }
|
||||||
hipfftDestroy(p);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
template<> struct FFTW<ComplexF> {
|
template<> struct FFTW<ComplexF> {
|
||||||
public:
|
public:
|
||||||
static const int forward=FFTW_FORWARD;
|
static const int forward=FFTW_FORWARD;
|
||||||
static const int backward=FFTW_BACKWARD;
|
static const int backward=FFTW_BACKWARD;
|
||||||
typedef hipfftComplex FFTW_scalar;
|
typedef hipfftComplex FFTW_scalar;
|
||||||
typedef hipfftHandle FFTW_plan;
|
typedef hipfftHandle FFTW_plan;
|
||||||
|
|
||||||
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
||||||
FFTW_scalar *in, int *inembed,
|
FFTW_scalar *in, int *inembed,
|
||||||
int istride, int idist,
|
int istride, int idist,
|
||||||
FFTW_scalar *out, int *onembed,
|
FFTW_scalar *out, int *onembed,
|
||||||
int ostride, int odist,
|
int ostride, int odist,
|
||||||
int sign, unsigned flags) {
|
int sign, unsigned flags) {
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
|
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
|
||||||
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
||||||
hipfftResult rv;
|
hipfftResult rv;
|
||||||
if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
|
if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
|
||||||
@@ -113,9 +112,7 @@ public:
|
|||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
GRID_ASSERT(rv==HIPFFT_SUCCESS);
|
||||||
}
|
}
|
||||||
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
inline static void fftw_destroy_plan(const FFTW_plan p) { hipfftDestroy(p); }
|
||||||
hipfftDestroy(p);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -126,53 +123,45 @@ public:
|
|||||||
static const int backward=FFTW_BACKWARD;
|
static const int backward=FFTW_BACKWARD;
|
||||||
typedef cufftDoubleComplex FFTW_scalar;
|
typedef cufftDoubleComplex FFTW_scalar;
|
||||||
typedef cufftHandle FFTW_plan;
|
typedef cufftHandle FFTW_plan;
|
||||||
|
|
||||||
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
||||||
FFTW_scalar *in, int *inembed,
|
FFTW_scalar *in, int *inembed,
|
||||||
int istride, int idist,
|
int istride, int idist,
|
||||||
FFTW_scalar *out, int *onembed,
|
FFTW_scalar *out, int *onembed,
|
||||||
int ostride, int odist,
|
int ostride, int odist,
|
||||||
int sign, unsigned flags) {
|
int sign, unsigned flags) {
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
|
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
||||||
if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
|
if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
|
||||||
else cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
|
else cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
}
|
}
|
||||||
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
inline static void fftw_destroy_plan(const FFTW_plan p) { cufftDestroy(p); }
|
||||||
cufftDestroy(p);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
template<> struct FFTW<ComplexF> {
|
template<> struct FFTW<ComplexF> {
|
||||||
public:
|
public:
|
||||||
static const int forward=FFTW_FORWARD;
|
static const int forward=FFTW_FORWARD;
|
||||||
static const int backward=FFTW_BACKWARD;
|
static const int backward=FFTW_BACKWARD;
|
||||||
typedef cufftComplex FFTW_scalar;
|
typedef cufftComplex FFTW_scalar;
|
||||||
typedef cufftHandle FFTW_plan;
|
typedef cufftHandle FFTW_plan;
|
||||||
|
|
||||||
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
||||||
FFTW_scalar *in, int *inembed,
|
FFTW_scalar *in, int *inembed,
|
||||||
int istride, int idist,
|
int istride, int idist,
|
||||||
FFTW_scalar *out, int *onembed,
|
FFTW_scalar *out, int *onembed,
|
||||||
int ostride, int odist,
|
int ostride, int odist,
|
||||||
int sign, unsigned flags) {
|
int sign, unsigned flags) {
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
|
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
||||||
if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
|
if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
|
||||||
else cufftExecC2C(p,in,out,CUFFT_INVERSE);
|
else cufftExecC2C(p,in,out,CUFFT_INVERSE);
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
}
|
}
|
||||||
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
inline static void fftw_destroy_plan(const FFTW_plan p) { cufftDestroy(p); }
|
||||||
cufftDestroy(p);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -183,313 +172,314 @@ public:
|
|||||||
typedef fftw_complex FFTW_scalar;
|
typedef fftw_complex FFTW_scalar;
|
||||||
typedef fftw_plan FFTW_plan;
|
typedef fftw_plan FFTW_plan;
|
||||||
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
||||||
FFTW_scalar *in, int *inembed,
|
FFTW_scalar *in, int *inembed,
|
||||||
int istride, int idist,
|
int istride, int idist,
|
||||||
FFTW_scalar *out, int *onembed,
|
FFTW_scalar *out, int *onembed,
|
||||||
int ostride, int odist,
|
int ostride, int odist,
|
||||||
int sign, unsigned flags) {
|
int sign, unsigned flags) {
|
||||||
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
|
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
||||||
::fftw_execute_dft(p,in,out);
|
::fftw_execute_dft(p,in,out);
|
||||||
}
|
}
|
||||||
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftw_destroy_plan(p); }
|
||||||
::fftw_destroy_plan(p);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
template<> struct FFTW<ComplexF> {
|
template<> struct FFTW<ComplexF> {
|
||||||
public:
|
public:
|
||||||
typedef fftwf_complex FFTW_scalar;
|
typedef fftwf_complex FFTW_scalar;
|
||||||
typedef fftwf_plan FFTW_plan;
|
typedef fftwf_plan FFTW_plan;
|
||||||
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
|
||||||
FFTW_scalar *in, int *inembed,
|
FFTW_scalar *in, int *inembed,
|
||||||
int istride, int idist,
|
int istride, int idist,
|
||||||
FFTW_scalar *out, int *onembed,
|
FFTW_scalar *out, int *onembed,
|
||||||
int ostride, int odist,
|
int ostride, int odist,
|
||||||
int sign, unsigned flags) {
|
int sign, unsigned flags) {
|
||||||
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
|
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
|
||||||
::fftwf_execute_dft(p,in,out);
|
::fftwf_execute_dft(p,in,out);
|
||||||
}
|
}
|
||||||
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftwf_destroy_plan(p); }
|
||||||
::fftwf_destroy_plan(p);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
class FFT {
|
class FFT {
|
||||||
private:
|
private:
|
||||||
|
|
||||||
double flops;
|
double flops;
|
||||||
double flops_call;
|
double flops_call;
|
||||||
uint64_t usec;
|
uint64_t usec;
|
||||||
|
GridCartesian *_grid;
|
||||||
public:
|
|
||||||
|
|
||||||
static const int forward=FFTW_FORWARD;
|
|
||||||
static const int backward=FFTW_BACKWARD;
|
|
||||||
|
|
||||||
double Flops(void) {return flops;}
|
|
||||||
double MFlops(void) {return flops/usec;}
|
|
||||||
double USec(void) {return (double)usec;}
|
|
||||||
|
|
||||||
FFT ( GridCartesian * grid )
|
// Type-erased plan entry. The handle is recovered via
|
||||||
{
|
// std::any_cast<FFTW<scalar>::FFTW_plan> inside FFT_dim, which knows the
|
||||||
flops=0;
|
// scalar type at compile time.
|
||||||
usec =0;
|
struct PlanEntry {
|
||||||
|
std::any handle;
|
||||||
|
std::function<void()> destroy;
|
||||||
};
|
};
|
||||||
|
|
||||||
~FFT ( void) {
|
|
||||||
// delete sgrid;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
|
|
||||||
|
|
||||||
// vgrid=result.Grid();
|
std::vector<PlanEntry> forward_plans; // size Nd when populated, 0 otherwise
|
||||||
// conformable(result.Grid(),vgrid);
|
std::vector<PlanEntry> backward_plans;
|
||||||
// conformable(source.Grid(),vgrid);
|
std::type_index _plan_type { typeid(void) }; // vobj type plans were built for
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
static const int forward = FFTW_FORWARD;
|
||||||
|
static const int backward = FFTW_BACKWARD;
|
||||||
|
|
||||||
|
double Flops(void) { return flops; }
|
||||||
|
double MFlops(void) { return flops / usec; }
|
||||||
|
double USec(void) { return (double)usec; }
|
||||||
|
|
||||||
|
FFT(GridCartesian *grid) : _grid(grid), flops(0), usec(0) {}
|
||||||
|
|
||||||
|
~FFT() {
|
||||||
|
if (forward_plans.size() > 0) PlanDestroy();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Explicitly pre-create and cache plans for all Nd dimensions.
|
||||||
|
// Optional: FFT_dim will call this lazily on first use if not called.
|
||||||
|
// Asserts that no plans already exist; call PlanDestroy first to re-create.
|
||||||
|
template<class vobj>
|
||||||
|
void PlanCreate() {
|
||||||
|
GRID_ASSERT(forward_plans.size() == 0);
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_type scalar;
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
|
|
||||||
|
const int Ndim = _grid->Nd();
|
||||||
|
forward_plans.resize(Ndim);
|
||||||
|
backward_plans.resize(Ndim);
|
||||||
|
|
||||||
|
for (int d = 0; d < Ndim; d++) {
|
||||||
|
int G = _grid->_fdimensions[d];
|
||||||
|
int Ncomp = sizeof(sobj) / sizeof(scalar);
|
||||||
|
int64_t Nperp = 1;
|
||||||
|
for (int dd = 0; dd < Ndim; dd++)
|
||||||
|
if (dd != d) Nperp *= _grid->_ldimensions[dd];
|
||||||
|
int howmany = Ncomp * (int)Nperp;
|
||||||
|
int n[] = {G};
|
||||||
|
|
||||||
|
// GPU backends (cuFFT/hipFFT) ignore the buffer pointer at plan creation.
|
||||||
|
// CPU FFTW with FFTW_ESTIMATE inspects only alignment and never touches data.
|
||||||
|
deviceVector<scalar> dummy(2);
|
||||||
|
FFTW_scalar *buf = (FFTW_scalar *)&dummy[0];
|
||||||
|
|
||||||
|
{
|
||||||
|
FFTW_plan p = FFTW<scalar>::fftw_plan_many_dft(
|
||||||
|
1, n, howmany, buf, n, 1, G, buf, n, 1, G, FFTW_FORWARD, FFTW_ESTIMATE);
|
||||||
|
forward_plans[d] = { p, [p](){ FFTW<scalar>::fftw_destroy_plan(p); } };
|
||||||
|
}
|
||||||
|
{
|
||||||
|
FFTW_plan p = FFTW<scalar>::fftw_plan_many_dft(
|
||||||
|
1, n, howmany, buf, n, 1, G, buf, n, 1, G, FFTW_BACKWARD, FFTW_ESTIMATE);
|
||||||
|
backward_plans[d] = { p, [p](){ FFTW<scalar>::fftw_destroy_plan(p); } };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_plan_type = std::type_index(typeid(vobj));
|
||||||
|
}
|
||||||
|
|
||||||
|
void PlanDestroy() {
|
||||||
|
for (auto &e : forward_plans) e.destroy();
|
||||||
|
for (auto &e : backward_plans) e.destroy();
|
||||||
|
forward_plans.resize(0);
|
||||||
|
backward_plans.resize(0);
|
||||||
|
_plan_type = std::type_index(typeid(void));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj>
|
||||||
|
void FFT_dim_mask(Lattice<vobj> &result, const Lattice<vobj> &source, Coordinate mask, int sign) {
|
||||||
const int Ndim = source.Grid()->Nd();
|
const int Ndim = source.Grid()->Nd();
|
||||||
Lattice<vobj> tmp = source;
|
Lattice<vobj> tmp = source;
|
||||||
for(int d=0;d<Ndim;d++){
|
for (int d = 0; d < Ndim; d++) {
|
||||||
if( mask[d] ) {
|
if (mask[d]) {
|
||||||
FFT_dim(result,tmp,d,sign);
|
FFT_dim(result, tmp, d, sign);
|
||||||
tmp=result;
|
tmp = result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
|
void FFT_all_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int sign) {
|
||||||
const int Ndim = source.Grid()->Nd();
|
const int Ndim = source.Grid()->Nd();
|
||||||
Coordinate mask(Ndim,1);
|
Coordinate mask(Ndim, 1);
|
||||||
FFT_dim_mask(result,source,mask,sign);
|
FFT_dim_mask(result, source, mask, sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
void FFT_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int dim, int sign) {
|
||||||
const int Ndim = source.Grid()->Nd();
|
const int Ndim = source.Grid()->Nd();
|
||||||
GridBase *grid = source.Grid();
|
GridBase *grid = source.Grid();
|
||||||
conformable(result.Grid(),source.Grid());
|
conformable(result.Grid(), source.Grid());
|
||||||
|
|
||||||
int L = grid->_ldimensions[dim];
|
int L = grid->_ldimensions[dim];
|
||||||
int G = grid->_fdimensions[dim];
|
int G = grid->_fdimensions[dim];
|
||||||
|
|
||||||
Coordinate layout(Ndim,1);
|
|
||||||
|
|
||||||
// Construct pencils
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::scalar_type scalar;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
//std::cout << "CPU view" << std::endl;
|
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
|
||||||
|
|
||||||
int Ncomp = sizeof(sobj)/sizeof(scalar);
|
|
||||||
int64_t Nlow = 1;
|
|
||||||
int64_t Nhigh = 1;
|
|
||||||
|
|
||||||
for(int d=0;d<dim;d++){
|
typedef typename FFTW<scalar_type>::FFTW_scalar FFTW_scalar;
|
||||||
Nlow*=grid->_ldimensions[d];
|
typedef typename FFTW<scalar_type>::FFTW_plan FFTW_plan;
|
||||||
}
|
|
||||||
for(int d=dim+1;d<Ndim;d++){
|
int Ncomp = sizeof(sobj) / sizeof(scalar_type);
|
||||||
Nhigh*=grid->_ldimensions[d];
|
int64_t Nlow = 1;
|
||||||
}
|
int64_t Nhigh = 1;
|
||||||
int64_t Nperp=Nlow*Nhigh;
|
for (int d = 0; d < dim; d++) Nlow *= grid->_ldimensions[d];
|
||||||
|
for (int d = dim+1; d < Ndim; d++) Nhigh *= grid->_ldimensions[d];
|
||||||
deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
|
int64_t Nperp = Nlow * Nhigh;
|
||||||
pgbuf.resize(Nperp*Ncomp*G);
|
|
||||||
scalar *pgbuf_v = &pgbuf[0];
|
deviceVector<scalar_type> pgbuf(Nperp * Ncomp * G); // [perp][component][dim]
|
||||||
|
scalar_type *pgbuf_v = &pgbuf[0];
|
||||||
int rank = 1; /* 1d transforms */
|
|
||||||
int n[] = {G}; /* 1d transforms of length G */
|
int rank = 1;
|
||||||
|
int n[] = {G};
|
||||||
int howmany = Ncomp * Nperp;
|
int howmany = Ncomp * Nperp;
|
||||||
int odist,idist,istride,ostride;
|
int idist = G, odist = G, istride = 1, ostride = 1;
|
||||||
idist = odist = G; /* Distance between consecutive FT's */
|
|
||||||
istride = ostride = 1; /* Distance between two elements in the same FT */
|
|
||||||
int *inembed = n, *onembed = n;
|
int *inembed = n, *onembed = n;
|
||||||
|
|
||||||
scalar div;
|
scalar_type div;
|
||||||
if ( sign == backward ) div = 1.0/G;
|
if (sign == backward) div = 1.0 / G;
|
||||||
else if ( sign == forward ) div = 1.0;
|
else if (sign == forward) div = 1.0;
|
||||||
else GRID_ASSERT(0);
|
else GRID_ASSERT(0);
|
||||||
|
|
||||||
double t_pencil=0;
|
// Populate cache on first call; subsequent calls check type consistency.
|
||||||
double t_fft =0;
|
if (forward_plans.size() == 0) PlanCreate<vobj>();
|
||||||
double t_total =-usecond();
|
GRID_ASSERT(forward_plans.size() == (size_t)Ndim);
|
||||||
// std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
GRID_ASSERT(std::type_index(typeid(vobj)) == _plan_type);
|
||||||
/*
|
|
||||||
*
|
auto &plans = (sign == forward) ? forward_plans : backward_plans;
|
||||||
*/
|
FFTW_plan p = std::any_cast<FFTW_plan>(plans[dim].handle);
|
||||||
FFTW_plan p;
|
|
||||||
{
|
double t_pencil = 0;
|
||||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
double t_fft = 0;
|
||||||
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
|
double t_copy = 0;
|
||||||
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
|
double t_shift = 0;
|
||||||
in,inembed,
|
double t_total = -usecond();
|
||||||
istride,idist,
|
|
||||||
out,onembed,
|
// Barrel-shift gather: accumulate global pencil into pgbuf
|
||||||
ostride, odist,
|
|
||||||
sign,FFTW_ESTIMATE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
|
||||||
// std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
|
||||||
Coordinate lcoor(Ndim), gcoor(Ndim);
|
|
||||||
double t_copy=0;
|
|
||||||
double t_shift=0;
|
|
||||||
t_pencil = -usecond();
|
|
||||||
result = source;
|
result = source;
|
||||||
int pc = grid->_processor_coor[dim];
|
int pc = grid->_processor_coor[dim];
|
||||||
|
|
||||||
const Coordinate ldims = grid->_ldimensions;
|
const Coordinate ldims = grid->_ldimensions;
|
||||||
const Coordinate rdims = grid->_rdimensions;
|
const Coordinate rdims = grid->_rdimensions;
|
||||||
const Coordinate sdims = grid->_simd_layout;
|
const Coordinate sdims = grid->_simd_layout;
|
||||||
|
Coordinate processors = grid->_processors;
|
||||||
|
|
||||||
Coordinate processors = grid->_processors;
|
|
||||||
Coordinate pgdims(Ndim);
|
Coordinate pgdims(Ndim);
|
||||||
pgdims[0] = G;
|
pgdims[0] = G;
|
||||||
for(int d=0, dd=1;d<Ndim;d++){
|
for (int d = 0, dd = 1; d < Ndim; d++)
|
||||||
if ( d!=dim ) pgdims[dd++] = ldims[d];
|
if (d != dim) pgdims[dd++] = ldims[d];
|
||||||
}
|
int64_t pgvol = 1;
|
||||||
int64_t pgvol=1;
|
for (int d = 0; d < Ndim; d++) pgvol *= pgdims[d];
|
||||||
for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
|
|
||||||
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
const int Nsimd = vobj::Nsimd();
|
||||||
for(int p=0;p<processors[dim];p++) {
|
t_pencil = -usecond();
|
||||||
t_copy-=usecond();
|
for (int p_idx = 0; p_idx < processors[dim]; p_idx++) {
|
||||||
autoView(r_v,result,AcceleratorRead);
|
t_copy -= usecond();
|
||||||
|
autoView(r_v, result, AcceleratorRead);
|
||||||
accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
|
accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
{
|
{
|
||||||
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
|
int lane = acceleratorSIMTlane(Nsimd);
|
||||||
#else
|
#else
|
||||||
for(int lane=0;lane<Nsimd;lane++) {
|
for (int lane = 0; lane < Nsimd; lane++) {
|
||||||
#endif
|
#endif
|
||||||
Coordinate icoor;
|
Coordinate icoor, ocoor, pgcoor;
|
||||||
Coordinate ocoor;
|
Lexicographic::CoorFromIndex(icoor, lane, sdims);
|
||||||
Coordinate pgcoor;
|
Lexicographic::CoorFromIndex(ocoor, idx, rdims);
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(icoor,lane,sdims);
|
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p_idx)%processors[dim])*L;
|
||||||
Lexicographic::CoorFromIndex(ocoor,idx,rdims);
|
for (int d = 0, dd = 1; d < Ndim; d++) {
|
||||||
|
if (d != dim) { pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d]; dd++; }
|
||||||
|
}
|
||||||
|
int64_t pgidx;
|
||||||
|
Lexicographic::IndexFromCoor(pgcoor, pgidx, pgdims);
|
||||||
|
|
||||||
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
|
vector_type *from = (vector_type *)&r_v[idx];
|
||||||
for(int d=0,dd=1;d<Ndim;d++){
|
scalar_type stmp;
|
||||||
if ( d!=dim ) {
|
for (int w = 0; w < Ncomp; w++) {
|
||||||
pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
|
stmp = getlane(from[w], lane);
|
||||||
dd++;
|
pgbuf_v[pgidx + w*pgvol] = stmp;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Map coordinates in lattice layout to FFTW index
|
|
||||||
int64_t pgidx;
|
|
||||||
Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
|
|
||||||
|
|
||||||
vector_type *from = (vector_type *)&r_v[idx];
|
|
||||||
scalar_type stmp;
|
|
||||||
for(int w=0;w<Ncomp;w++){
|
|
||||||
int64_t pg_idx = pgidx + w*pgvol;
|
|
||||||
stmp = getlane(from[w], lane);
|
|
||||||
pgbuf_v[pg_idx] = stmp;
|
|
||||||
}
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
});
|
});
|
||||||
|
t_copy += usecond();
|
||||||
|
|
||||||
t_copy+=usecond();
|
if (p_idx != processors[dim] - 1) {
|
||||||
if (p != processors[dim] - 1) {
|
Lattice<vobj> temp(grid);
|
||||||
Lattice<vobj> temp(grid);
|
t_shift -= usecond();
|
||||||
t_shift-=usecond();
|
temp = Cshift(result, dim, L); result = temp;
|
||||||
temp = Cshift(result,dim,L); result = temp;
|
t_shift += usecond();
|
||||||
t_shift+=usecond();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
t_pencil += usecond();
|
t_pencil += usecond();
|
||||||
|
|
||||||
FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
|
FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
|
||||||
FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
|
FFTW_scalar *out = (FFTW_scalar *)pgbuf_v;
|
||||||
t_fft = -usecond();
|
t_fft = -usecond();
|
||||||
FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
|
FFTW<scalar_type>::fftw_execute_dft(p, in, out, sign);
|
||||||
t_fft += usecond();
|
t_fft += usecond();
|
||||||
|
|
||||||
// performance counting
|
flops_call = 5.0 * howmany * G * log2(G);
|
||||||
flops_call = 5.0*howmany*G*log2(G);
|
usec = t_fft;
|
||||||
usec = t_fft;
|
flops = flops_call;
|
||||||
flops= flops_call;
|
|
||||||
|
|
||||||
result = Zero();
|
result = Zero();
|
||||||
|
|
||||||
double t_insert = -usecond();
|
double t_insert = -usecond();
|
||||||
{
|
{
|
||||||
autoView(r_v,result,AcceleratorWrite);
|
autoView(r_v, result, AcceleratorWrite);
|
||||||
accelerator_for(idx,grid->oSites(),Nsimd,{
|
accelerator_for(idx, grid->oSites(), Nsimd, {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
{
|
{
|
||||||
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
|
int lane = acceleratorSIMTlane(Nsimd);
|
||||||
#else
|
#else
|
||||||
for(int lane=0;lane<Nsimd;lane++) {
|
for (int lane = 0; lane < Nsimd; lane++) {
|
||||||
#endif
|
#endif
|
||||||
Coordinate icoor(Ndim);
|
Coordinate icoor(Ndim), ocoor(Ndim), pgcoor(Ndim);
|
||||||
Coordinate ocoor(Ndim);
|
Lexicographic::CoorFromIndex(icoor, lane, sdims);
|
||||||
Coordinate pgcoor(Ndim);
|
Lexicographic::CoorFromIndex(ocoor, idx, rdims);
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(icoor,lane,sdims);
|
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
|
||||||
Lexicographic::CoorFromIndex(ocoor,idx,rdims);
|
for (int d = 0, dd = 1; d < Ndim; d++) {
|
||||||
|
if (d != dim) { pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d]; dd++; }
|
||||||
|
}
|
||||||
|
int64_t pgidx;
|
||||||
|
Lexicographic::IndexFromCoor(pgcoor, pgidx, pgdims);
|
||||||
|
|
||||||
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
|
vector_type *to = (vector_type *)&r_v[idx];
|
||||||
for(int d=0,dd=1;d<Ndim;d++){
|
scalar_type stmp;
|
||||||
if ( d!=dim ) {
|
for (int w = 0; w < Ncomp; w++) {
|
||||||
pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
|
stmp = pgbuf_v[pgidx + w*pgvol];
|
||||||
dd++;
|
putlane(to[w], stmp, lane);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
// Map coordinates in lattice layout to FFTW index
|
|
||||||
int64_t pgidx;
|
|
||||||
Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
|
|
||||||
|
|
||||||
vector_type *to = (vector_type *)&r_v[idx];
|
|
||||||
scalar_type stmp;
|
|
||||||
for(int w=0;w<Ncomp;w++){
|
|
||||||
int64_t pg_idx = pgidx + w*pgvol;
|
|
||||||
stmp = pgbuf_v[pg_idx];
|
|
||||||
putlane(to[w], stmp, lane);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
result = result*div;
|
result = result * div;
|
||||||
|
t_insert += usecond();
|
||||||
|
t_total += usecond();
|
||||||
|
|
||||||
t_insert +=usecond();
|
std::cout << GridLogPerformance << " FFT took " << t_total/1.0e6 << " s" << std::endl;
|
||||||
|
std::cout << GridLogPerformance << " FFT pencil " << t_pencil/1.0e6 << " s" << std::endl;
|
||||||
// destroying plan
|
std::cout << GridLogPerformance << " of which copy " << t_copy/1.0e6 << " s" << std::endl;
|
||||||
FFTW<scalar>::fftw_destroy_plan(p);
|
std::cout << GridLogPerformance << " of which shift " << t_shift/1.0e6 << " s" << std::endl;
|
||||||
|
std::cout << GridLogPerformance << " FFT kernels " << t_fft/1.0e6 << " s" << std::endl;
|
||||||
t_total +=usecond();
|
std::cout << GridLogPerformance << " FFT insert " << t_insert/1.0e6 << " s" << std::endl;
|
||||||
|
|
||||||
std::cout <<GridLogPerformance<< " FFT took "<<t_total/1.0e6 <<" s" << std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< " of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< " of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< " FFT insert "<<t_insert/1.0e6 <<" s" << std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -113,6 +113,7 @@ int main (int argc, char ** argv)
|
|||||||
Cref= Cref - C;
|
Cref= Cref - C;
|
||||||
std::cout << " invertible check " << norm2(Cref)<<std::endl;
|
std::cout << " invertible check " << norm2(Cref)<<std::endl;
|
||||||
|
|
||||||
|
theFFT.PlanDestroy();
|
||||||
Stilde=S;
|
Stilde=S;
|
||||||
std::cout<<" Benchmarking FFT of LatticeSpinMatrix "<<std::endl;
|
std::cout<<" Benchmarking FFT of LatticeSpinMatrix "<<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
|
theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
|
||||||
|
|||||||
@@ -95,6 +95,7 @@ int main (int argc, char ** argv)
|
|||||||
C=C-Ctilde;
|
C=C-Ctilde;
|
||||||
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
||||||
|
|
||||||
|
theFFT.PlanDestroy();
|
||||||
Stilde = S;
|
Stilde = S;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
||||||
|
|||||||
Reference in New Issue
Block a user