FFT offload to GPU and MUCH faster comms.
40x speed up on Frontier
@@ -28,6 +28,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 
+#ifdef GRID_CUDA
+#include <cufft.h>
+#endif
+
+#ifdef GRID_HIP
+#include <hipfft/hipfft.h>
+#endif
+
 #ifdef HAVE_FFTW
 #if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
@@ -38,85 +46,184 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
+template<class scalar> struct FFTW { };
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#define FFTW_ESTIMATE (0)
+#endif
+
-#ifdef HAVE_FFTW
-template<class scalar> struct FFTW {
-};
-
+#ifdef GRID_HIP
+template<> struct FFTW<ComplexD> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef hipfftDoubleComplex FFTW_scalar;
+  typedef hipfftHandle        FFTW_plan;
+
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+    return p;
+  }
+
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    hipfftResult rv;
+    if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
+    else                   rv =hipfftExecZ2Z(p,in,out,HIPFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+template<> struct FFTW<ComplexF> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef hipfftComplex FFTW_scalar;
+  typedef hipfftHandle  FFTW_plan;
+
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+    return p;
+  }
+
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    hipfftResult rv;
+    if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
+    else                   rv =hipfftExecC2C(p,in,out,HIPFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+#endif
+
+#ifdef GRID_CUDA
+template<> struct FFTW<ComplexD> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftDoubleComplex FFTW_scalar;
+  typedef cufftHandle        FFTW_plan;
+
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
+    return p;
+  }
+
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
+    else                   cufftExecZ2Z(p,in,out,CUFFT_BACKWARD);
+    accelerator_barrier();
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p);
+  }
+};
+template<> struct FFTW<ComplexF> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftComplex FFTW_scalar;
+  typedef cufftHandle  FFTW_plan;
+
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
+    return p;
+  }
+
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
+    else                   cufftExecC2C(p,in,out,CUFFT_BACKWARD);
+    accelerator_barrier();
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p);
+  }
+};
+#endif
+
+#ifdef HAVE_FFTW
 template<> struct FFTW<ComplexD> {
 public:
   typedef fftw_complex FFTW_scalar;
   typedef fftw_plan    FFTW_plan;
 
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-                                      FFTW_scalar *in, const int *inembed,
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
                                       int istride, int idist,
-                                      FFTW_scalar *out, const int *onembed,
+                                      FFTW_scalar *out, int *onembed,
                                       int ostride, int odist,
                                       int sign, unsigned flags) {
     return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
   }
 
   static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
     ::fftw_flops(p,add,mul,fmas);
   }
 
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
     ::fftw_execute_dft(p,in,out);
   }
   inline static void fftw_destroy_plan(const FFTW_plan p) {
     ::fftw_destroy_plan(p);
   }
 };
 
 template<> struct FFTW<ComplexF> {
 public:
 
   typedef fftwf_complex FFTW_scalar;
   typedef fftwf_plan    FFTW_plan;
 
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-                                      FFTW_scalar *in, const int *inembed,
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
                                       int istride, int idist,
-                                      FFTW_scalar *out, const int *onembed,
+                                      FFTW_scalar *out, int *onembed,
                                       int ostride, int odist,
                                       int sign, unsigned flags) {
     return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
   }
 
   static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
     ::fftwf_flops(p,add,mul,fmas);
   }
 
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
     ::fftwf_execute_dft(p,in,out);
   }
   inline static void fftw_destroy_plan(const FFTW_plan p) {
     ::fftwf_destroy_plan(p);
   }
 };
 
 #endif
 
 #ifndef FFTW_FORWARD
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #endif
 
 class FFT {
 private:
 
   GridCartesian *vgrid;
   GridCartesian *sgrid;
 
   int Nd;
   double flops;
   double flops_call;
   uint64_t usec;
 
   Coordinate dimensions;
   Coordinate processors;
   Coordinate processor_coor;
 
 public:
 
   static const int forward=FFTW_FORWARD;
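The three backends above expose one compile-time interface, so the FFT driver never names cuFFT, hipFFT or fftw3 directly. A minimal sketch of how such a trait is consumed (the buffer, length `G` and batch count are invented for illustration; the real call site is in `FFT_dim` below). Note one asymmetry the 4-argument `fftw_execute_dft` papers over: CPU FFTW bakes `sign` into the plan and ignores the trailing argument, while the GPU backends pick the direction at execute time.

```cpp
// Sketch: drive fftw3 / cuFFT / hipFFT through the common FFTW<scalar> trait.
// Whatever specialisation was compiled in supplies FFTW_scalar and FFTW_plan.
template<class scalar>
void batched_1d_fft(scalar *data, int G, int batches, int sign)
{
  typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
  typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;

  int n[1] = {G};                                  // 1d transforms of length G
  FFTW_plan p = FFTW<scalar>::fftw_plan_many_dft(1,n,batches,
                                                 (FFTW_scalar *)data,n,1,G,  // in, inembed, istride, idist
                                                 (FFTW_scalar *)data,n,1,G,  // out,onembed, ostride, odist
                                                 sign,FFTW_ESTIMATE);
  FFTW<scalar>::fftw_execute_dft(p,(FFTW_scalar *)data,(FFTW_scalar *)data,sign);
  FFTW<scalar>::fftw_destroy_plan(p);
}
```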
@@ -126,31 +233,25 @@ public:
 
   double MFlops(void) {return flops/usec;}
   double USec(void)   {return (double)usec;}
 
-  FFT ( GridCartesian * grid ) :
-    vgrid(grid),
-    Nd(grid->_ndimension),
-    dimensions(grid->_fdimensions),
-    processors(grid->_processors),
-    processor_coor(grid->_processor_coor)
+  FFT ( GridCartesian * grid )
   {
     flops=0;
     usec =0;
-    Coordinate layout(Nd,1);
-    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
   };
 
   ~FFT ( void)  {
-    delete sgrid;
+    // delete sgrid;
   }
 
   template<class vobj>
   void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
 
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
-    Lattice<vobj> tmp(vgrid);
-    tmp = source;
-    for(int d=0;d<Nd;d++){
+    // vgrid=result.Grid();
+    // conformable(result.Grid(),vgrid);
+    // conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    Lattice<vobj> tmp = source;
+    for(int d=0;d<Ndim;d++){
       if( mask[d] ) {
         FFT_dim(result,tmp,d,sign);
         tmp=result;
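For reference, a short hypothetical use of the mask interface: transform only the spatial directions of a 4d field and leave time untouched. `GRID`, `latt_size`, `simd_layout`, `mpi_layout` and the field names are placeholders, not values from this commit.

```cpp
// Sketch: forward FFT over x,y,z but not t, via the mask interface above.
GridCartesian GRID(latt_size,simd_layout,mpi_layout);  // placeholder grid
LatticeComplexD field(&GRID), field_p(&GRID);

Coordinate mask(Nd,1);   // 1 = transform this dimension
mask[Tdir] = 0;          // skip the time direction

FFT theFFT(&GRID);
theFFT.FFT_dim_mask(field_p,field,mask,FFT::forward);
```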
@@ -160,62 +261,70 @@ public:
 
   template<class vobj>
   void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
-    Coordinate mask(Nd,1);
+    const int Ndim = source.Grid()->Nd();
+    Coordinate mask(Ndim,1);
     FFT_dim_mask(result,source,mask,sign);
   }
 
 
   template<class vobj>
   void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
+    std::cerr << "FFTW is not compiled but is called"<<std::endl;
     GRID_ASSERT(0);
 #else
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    GridBase *grid = source.Grid();
+    conformable(result.Grid(),source.Grid());
 
-    int L = vgrid->_ldimensions[dim];
-    int G = vgrid->_fdimensions[dim];
-
-    Coordinate layout(Nd,1);
-    Coordinate pencil_gd(vgrid->_fdimensions);
-
-    pencil_gd[dim] = G*processors[dim];
-
-    // Pencil global vol LxLxGxLxL per node
-    GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
+    int L = grid->_ldimensions[dim];
+    int G = grid->_fdimensions[dim];
+
+    Coordinate layout(Ndim,1);
 
     // Construct pencils
     typedef typename vobj::scalar_object sobj;
-    typedef typename sobj::scalar_type scalar;
+    typedef typename vobj::scalar_type scalar;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
 
-    Lattice<sobj> pgbuf(&pencil_g);
-    autoView(pgbuf_v , pgbuf, CpuWrite);
-    //std::cout << "CPU view" << std::endl;
 
     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
     typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
 
     int Ncomp = sizeof(sobj)/sizeof(scalar);
-    int Nlow  = 1;
+    int64_t Nlow  = 1;
+    int64_t Nhigh = 1;
     for(int d=0;d<dim;d++){
-      Nlow*=vgrid->_ldimensions[d];
+      Nlow*=grid->_ldimensions[d];
     }
+    for(int d=dim+1;d<Ndim;d++){
+      Nhigh*=grid->_ldimensions[d];
+    }
+    int64_t Nperp=Nlow*Nhigh;
+
+    deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
+    pgbuf.resize(Nperp*Ncomp*G);
+    scalar *pgbuf_v = &pgbuf[0];
 
     int rank = 1;  /* 1d transforms */
     int n[] = {G}; /* 1d transforms of length G */
-    int howmany = Ncomp;
+    int howmany = Ncomp * Nperp;
     int odist,idist,istride,ostride;
-    idist   = odist   = 1;          /* Distance between consecutive FT's */
-    istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+    idist   = odist   = G; /* Distance between consecutive FT's */
+    istride = ostride = 1; /* Distance between two elements in the same FT */
     int *inembed = n, *onembed = n;
 
     scalar div;
     if ( sign == backward ) div = 1.0/G;
     else if ( sign == forward ) div = 1.0;
     else GRID_ASSERT(0);
 
-    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    double t_pencil=0;
+    double t_fft   =0;
+    double t_total =-usecond();
+    // std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    /*
+     *
+     */
     FFTW_plan p;
     {
       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
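The change from `idist=1, istride=Ncomp*Nlow` to `idist=G, istride=1` is what makes a single batched call possible: each 1d FFT now reads `G` contiguous complex numbers, and `howmany = Ncomp*Nperp` folds every component of every perpendicular site into one plan instead of one plan invocation per pencil. A standalone sketch of the advanced-interface addressing rule (standard FFTW/cuFFT semantics) under the new settings:

```cpp
#include <cstdint>

// Sketch: FFTW/cuFFT "advanced interface" addressing. Element j of batch k
// sits at data[ k*idist + j*istride ]. With idist=G, istride=1 (as above),
// each transform is one contiguous run of G complex numbers.
int main()
{
  const int     G = 16;                       // transform length (illustrative)
  const int64_t howmany = 4;                  // = Ncomp*Nperp in the real code
  const int     idist = G, istride = 1;
  for (int64_t k = 0; k < howmany; k++) {
    for (int j = 0; j < G; j++) {
      int64_t element = k*idist + j*istride;  // == k*G + j
      (void)element;                          // batch k spans [k*G, k*G+G-1]
    }
  }
  return 0;
}
```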
@@ -229,72 +338,154 @@ public:
     }
 
     // Barrel shift and collect global pencil
-    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
-    Coordinate lcoor(Nd), gcoor(Nd);
+    // std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    Coordinate lcoor(Ndim), gcoor(Ndim);
+    double t_copy=0;
+    double t_shift=0;
+    t_pencil = -usecond();
     result = source;
-    int pc = processor_coor[dim];
+    int pc = grid->_processor_coor[dim];
+
+    const Coordinate ldims = grid->_ldimensions;
+    const Coordinate rdims = grid->_rdimensions;
+    const Coordinate sdims = grid->_simd_layout;
+
+    Coordinate processors = grid->_processors;
+    Coordinate pgdims(Ndim);
+    pgdims[0] = G;
+    for(int d=0, dd=1;d<Ndim;d++){
+      if ( d!=dim ) pgdims[dd++] = ldims[d];
+    }
+    int64_t pgvol=1;
+    for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
+
+    const int Nsimd = vobj::Nsimd();
     for(int p=0;p<processors[dim];p++) {
+      t_copy-=usecond();
+      autoView(r_v,result,AcceleratorRead);
+      accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
+#ifdef GRID_SIMT
       {
-        autoView(r_v,result,CpuRead);
-        autoView(p_v,pgbuf,CpuWrite);
-        thread_for(idx, sgrid->lSites(),{
-            Coordinate cbuf(Nd);
-            sobj s;
-            sgrid->LocalIndexToLocalCoor(idx,cbuf);
-            peekLocalSite(s,r_v,cbuf);
-            cbuf[dim]+=((pc+p) % processors[dim])*L;
-            pokeLocalSite(s,p_v,cbuf);
-        });
+        int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int lane=0;lane<Nsimd;lane++) {
+#endif
+        Coordinate icoor;
+        Coordinate ocoor;
+        Coordinate pgcoor;
+
+        Lexicographic::CoorFromIndex(icoor,lane,sdims);
+        Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+
+        pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
+        for(int d=0,dd=1;d<Ndim;d++){
+          if ( d!=dim ) {
+            pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+            dd++;
+          }
+        }
+
+        // Map coordinates in lattice layout to FFTW index
+        int64_t pgidx;
+        Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+
+        vector_type *from = (vector_type *)&r_v[idx];
+        scalar_type stmp;
+        for(int w=0;w<Ncomp;w++){
+          int64_t pg_idx = pgidx + w*pgvol;
+          stmp = getlane(from[w], lane);
+          pgbuf_v[pg_idx] = stmp;
+        }
+#ifdef GRID_SIMT
       }
+#else
+      }
+#endif
+      });
+      t_copy+=usecond();
       if (p != processors[dim] - 1) {
-        result = Cshift(result,dim,L);
+        Lattice<vobj> temp(grid);
+        t_shift-=usecond();
+        temp = Cshift(result,dim,L); result = temp;
+        t_shift+=usecond();
       }
     }
+    t_pencil += usecond();
 
-    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
-    // Loop over orthog coords
-    int NN=pencil_g.lSites();
-    GridStopWatch timer;
-    timer.Start();
-    thread_for( idx,NN,{
-        Coordinate cbuf(Nd);
-        pencil_g.LocalIndexToLocalCoor(idx, cbuf);
-        if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-          FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
-          FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
-          FFTW<scalar>::fftw_execute_dft(p,in,out);
-        }
-    });
-    timer.Stop();
+    FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
+    FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
+    t_fft = -usecond();
+    FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
+    t_fft += usecond();
 
     // performance counting
-    double add,mul,fma;
-    FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-    flops_call = add+mul+2.0*fma;
-    usec += timer.useconds();
-    flops+= flops_call*NN;
-
-    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
-    // writing out result
+    flops_call = 5.0*howmany*G*log2(G);
+    usec = t_fft;
+    flops= flops_call;
 
     result = Zero();
 
+    double t_insert = -usecond();
     {
-      autoView(pgbuf_v,pgbuf,CpuRead);
-      autoView(result_v,result,CpuWrite);
-      thread_for(idx,sgrid->lSites(),{
-          Coordinate clbuf(Nd), cgbuf(Nd);
-          sobj s;
-          sgrid->LocalIndexToLocalCoor(idx,clbuf);
-          cgbuf = clbuf;
-          cgbuf[dim] = clbuf[dim]+L*pc;
-          peekLocalSite(s,pgbuf_v,cgbuf);
-          pokeLocalSite(s,result_v,clbuf);
+      autoView(r_v,result,AcceleratorWrite);
+      accelerator_for(idx,grid->oSites(),Nsimd,{
+#ifdef GRID_SIMT
+        {
+          int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+        for(int lane=0;lane<Nsimd;lane++) {
+#endif
+          Coordinate icoor(Ndim);
+          Coordinate ocoor(Ndim);
+          Coordinate pgcoor(Ndim);
+
+          Lexicographic::CoorFromIndex(icoor,lane,sdims);
+          Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+
+          pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
+          for(int d=0,dd=1;d<Ndim;d++){
+            if ( d!=dim ) {
+              pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+              dd++;
+            }
+          }
+          // Map coordinates in lattice layout to FFTW index
+          int64_t pgidx;
+          Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+
+          vector_type *to = (vector_type *)&r_v[idx];
+          scalar_type stmp;
+          for(int w=0;w<Ncomp;w++){
+            int64_t pg_idx = pgidx + w*pgvol;
+            stmp = pgbuf_v[pg_idx];
+            putlane(to[w], stmp, lane);
+          }
+
+#ifdef GRID_SIMT
+        }
+#else
+        }
+#endif
       });
     }
 
     result = result*div;
 
-    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+    t_insert +=usecond();
 
     // destroying plan
     FFTW<scalar>::fftw_destroy_plan(p);
 #endif
+
+    t_total +=usecond();
+
+    std::cout <<GridLogPerformance<< " FFT took "<<t_total/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT insert "<<t_insert/1.0e6 <<" s" << std::endl;
   }
 };
 
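Both copy kernels rely on the same coordinate-to-index map: `pgcoor[0]` is the transformed direction and the remaining local dimensions are packed behind it, so `Lexicographic::IndexFromCoor` lands each site at stride 1 along the FFT direction — exactly the `istride=1`/`idist=G` layout the batched plan expects. A self-contained sketch of that map, assuming the first-index-fastest convention:

```cpp
#include <cstdint>
#include <vector>

// Sketch of the pencil-buffer index map used above: coor[0] runs fastest,
// matching istride=1 / idist=G in the batched plan.
int64_t index_from_coor(const std::vector<int> &coor, const std::vector<int> &dims)
{
  int64_t idx = 0, stride = 1;
  for (size_t d = 0; d < dims.size(); d++) {
    idx    += (int64_t)coor[d]*stride;  // coor[0] = position along the FFT direction
    stride *= dims[d];
  }
  return idx;  // component w then adds w*pgvol, as in pg_idx = pgidx + w*pgvol
}
```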
@@ -108,7 +108,7 @@ public:
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
 static int  RankWorld(void) ;
-static void BroadcastWorld(int root,void* data, int bytes);
+static void BroadcastWorld(int root,void* data, uint64_t bytes);
 static void BarrierWorld(void);
 
 ////////////////////////////////////////////////////////////
@@ -175,27 +175,27 @@ public:
                            int dest,
                            void *recv,
                            int from,
-                           int bytes,int dir);
+                           uint64_t bytes,int dir);
 
   void SendToRecvFrom(void *xmit,
                       int xmit_to_rank,
                       void *recv,
                       int recv_from_rank,
-                      int bytes);
+                      uint64_t bytes);
 
   int IsOffNode(int rank);
   double StencilSendToRecvFrom(void *xmit,
                                int xmit_to_rank,int do_xmit,
                                void *recv,
                                int recv_from_rank,int do_recv,
-                               int bytes,int dir);
+                               uint64_t bytes,int dir);
 
   double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
                                       void *xmit,
                                       int xmit_to_rank,int do_xmit,
                                       void *recv,
                                       int recv_from_rank,int do_recv,
-                                      int xbytes,int rbytes,int dir);
+                                      uint64_t xbytes,uint64_t rbytes,int dir);
 
   // Could do a PollHtoD and have a CommsMerge dependence
   void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
@@ -206,7 +206,7 @@ public:
                                int xmit_to_rank,int do_xmit,
                                void *recv,void *recv_comp,
                                int recv_from_rank,int do_recv,
-                               int xbytes,int rbytes,int dir);
+                               uint64_t xbytes,uint64_t rbytes,int dir);
 
   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -220,7 +220,7 @@ public:
   ////////////////////////////////////////////////////////////
   // Broadcast a buffer and composite larger
   ////////////////////////////////////////////////////////////
-  void Broadcast(int root,void* data, int bytes);
+  void Broadcast(int root,void* data, uint64_t bytes);
 
   ////////////////////////////////////////////////////////////
   // All2All down one dimension
@@ -342,23 +342,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
                                            int dest,
                                            void *recv,
                                            int from,
-                                           int bytes,int dir)
+                                           uint64_t bytes,int dir)
 {
   MPI_Request xrq;
   MPI_Request rrq;
 
   GRID_ASSERT(dest != _processor);
   GRID_ASSERT(from != _processor);
 
+  GRID_ASSERT(bytes/(sizeof(int32_t))<= 2*1024*1024*1024);
   int tag;
 
   tag= dir+from*32;
-  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
+  int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
   GRID_ASSERT(ierr==0);
   list.push_back(rrq);
 
   tag= dir+_processor*32;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
+  ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
   GRID_ASSERT(ierr==0);
   list.push_back(xrq);
 }
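All of the MPI calls in this file switch from `MPI_CHAR` to `MPI_INT32_T` counts for the same reason: MPI takes the element count as a C `int`, so a byte count saturates at 2 GiB. Counting 4-byte words instead stretches the reachable message to 8 GiB, at the price of requiring the buffer size to be a multiple of `sizeof(int32_t)` — which the new assertion above guards. A sketch of the arithmetic:

```cpp
#include <cstdint>

// Sketch: why counts travel as 4-byte words rather than bytes.
int main()
{
  uint64_t bytes  = 6ULL*1024*1024*1024;           // a 6 GiB message
  // int nchar    = (int)bytes;                    // would overflow: 6*2^30 > 2^31-1
  int      nwords = (int)(bytes/sizeof(int32_t));  // 1.5e9 elements: fits in int
  (void)nwords;  // MPI_Isend(buf, nwords, MPI_INT32_T, dest, tag, comm, &req);
  return 0;
}
```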
@@ -379,7 +379,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                            int dest,
                            void *recv,
                            int from,
-                           int bytes)
+                           uint64_t bytes)
 {
   std::vector<MpiCommsRequest_t> reqs(0);
@@ -392,8 +392,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-                    recv,bytes,MPI_CHAR,from, from,
+  ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank,
+                    recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from,
                     communicator,MPI_STATUS_IGNORE);
   GRID_ASSERT(ierr==0);
@@ -403,7 +403,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int dest, int dox,
                                                      void *recv,
                                                      int from, int dor,
-                                                     int bytes,int dir)
+                                                     uint64_t bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
   double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
@@ -426,7 +426,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                             int dest,int dox,
                                                             void *recv,
                                                             int from,int dor,
-                                                            int xbytes,int rbytes,int dir)
+                                                            uint64_t xbytes,uint64_t rbytes,int dir)
 {
   return 0.0; // Do nothing -- no preparation required
 }
@@ -435,7 +435,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                           int dest,int dox,
                                                           void *recv,void *recv_comp,
                                                           int from,int dor,
-                                                          int xbytes,int rbytes,int dir)
+                                                          uint64_t xbytes,uint64_t rbytes,int dir)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -458,7 +458,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
       //      std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
-      ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
       GRID_ASSERT(ierr==0);
       list.push_back(rrq);
       off_node_bytes+=rbytes;
@@ -476,7 +476,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
       GRID_ASSERT(ierr==0);
       list.push_back(xrq);
       off_node_bytes+=xbytes;
@@ -543,7 +543,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                             int dest,int dox,
                                                             void *recv,
                                                             int from,int dor,
-                                                            int xbytes,int rbytes,int dir)
+                                                            uint64_t xbytes,uint64_t rbytes,int dir)
 {
 /*
  * Bring sequence from Stencil.h down to lower level.
@@ -584,7 +584,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
       host_recv = this->HostBufferMalloc(rbytes);
-      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
       GRID_ASSERT(ierr==0);
       CommsRequest_t srq;
       srq.PacketType = InterNodeRecv;
@@ -686,7 +686,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
     if ( acceleratorEventIsComplete(list[idx].ev) ) {
 
       void *host_xmit = list[idx].host_buf;
-      uint32_t xbytes = list[idx].bytes;
+      uint64_t xbytes = list[idx].bytes;
       int dest    = list[idx].dest;
       int tag     = list[idx].tag;
       int commdir = list[idx].commdir;
@@ -697,7 +697,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
       //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
 
       MPI_Request xrq;
-      int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
       GRID_ASSERT(ierr==0);
 
       list[idx].req  = xrq; // Update the MPI request in the list
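This poll loop is the heart of the faster comms path: device-to-host copies are enqueued with an event, and the matching `MPI_Isend` is only issued once `acceleratorEventIsComplete` reports the staging copy done. From the fields consulted above (`ev`, `host_buf`, `bytes`, `dest`, `tag`, `commdir`, `req`), the request record looks roughly like this sketch — the event type stands in for Grid's accelerator event:

```cpp
#include <mpi.h>
#include <cstdint>

// Sketch of the per-packet state the DtoH pipeline polls (field names taken
// from the code above; acceleratorEvent_t is assumed, not quoted).
struct CommsRequestSketch {
  acceleratorEvent_t ev;  // completion event of the device->host copy
  void      *host_buf;    // staging buffer handed to MPI_Isend
  uint64_t   bytes;       // now 64-bit (was uint32_t)
  int        dest;
  int        tag;
  int        commdir;     // which halo communicator to use
  MPI_Request req;        // filled in once the Isend is issued
};
```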
@@ -718,7 +718,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                           int dest,int dox,
                                                           void *recv,void *recv_comp,
                                                           int from,int dor,
-                                                          int xbytes,int rbytes,int dir)
+                                                          uint64_t xbytes,uint64_t rbytes,int dir)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -884,11 +884,11 @@ void CartesianCommunicator::Barrier(void)
   int ierr = MPI_Barrier(communicator);
   GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes)
 {
   FlightRecorder::StepLog("Broadcast");
   int ierr=MPI_Bcast(data,
-                     bytes,
+                     (int)bytes,
                      MPI_BYTE,
                      root,
                      communicator);
@@ -904,11 +904,11 @@ void CartesianCommunicator::BarrierWorld(void){
   int ierr = MPI_Barrier(communicator_world);
   GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes)
 {
   FlightRecorder::StepLog("BroadcastWorld");
   int ierr= MPI_Bcast(data,
-                      bytes,
+                      (int)bytes,
                       MPI_BYTE,
                       root,
                       communicator_world);
@@ -89,7 +89,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                    int dest,
                                    void *recv,
                                    int from,
-                                   int bytes)
+                                   uint64_t bytes)
 {
   GRID_ASSERT(0);
 }
@@ -99,7 +99,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
                                            int dest,
                                            void *recv,
                                            int from,
-                                           int bytes,int dir)
+                                           uint64_t bytes,int dir)
 {
   GRID_ASSERT(0);
 }
@@ -115,8 +115,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
 
 int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
+void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { }
 void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
@@ -132,7 +132,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                          int xmit_to_rank,int dox,
                                          void *recv,
                                          int recv_from_rank,int dor,
-                                         int bytes, int dir)
+                                         uint64_t bytes, int dir)
 {
   return 2.0*bytes;
 }
@@ -143,7 +143,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                     int xmit_to_rank,int dox,
                                                     void *recv,
                                                     int recv_from_rank,int dor,
-                                                    int xbytes,int rbytes, int dir)
+                                                    uint64_t xbytes,uint64_t rbytes, int dir)
 {
   return 0.0;
 }
@@ -152,7 +152,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                   int xmit_to_rank,int dox,
                                                   void *recv, void *recv_comp,
                                                   int recv_from_rank,int dor,
-                                                  int xbytes,int rbytes, int dir)
+                                                  uint64_t xbytes,uint64_t rbytes, int dir)
 {
   return xbytes+rbytes;
 }
@@ -49,6 +49,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
   // Map to always positive shift modulo global full dimension.
   shift = (shift+fd)%fd;
 
+  if( shift ==0 ) {
+    ret = rhs;
+    return ret;
+  }
+  //
+  // Potential easy fast cases:
+  // Shift is a multiple of the local lattice extent.
+  // Then need only to shift whole subvolumes
+  int L = rhs.Grid()->_ldimensions[dimension];
+  if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
+    Cshift_simple(ret,rhs,dimension,shift);
+    return ret;
+  }
+
   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
 
   // the permute type
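The fast path added above triggers when the (positively mapped) shift is a whole number of local extents, so entire local subvolumes move between ranks with no slicing or permutation. A toy check of the condition and of the rank offset it implies:

```cpp
// Sketch: the fast-path condition for Cshift in one dimension.
int main()
{
  int fd = 32, pd = 4;      // made-up global extent and processor count
  int L  = fd/pd;           // local extent per rank (8)
  int shift = 16;           // example shift, already mapped into [0,fd)

  bool whole_subvolumes = (shift % L) == 0;  // true: 16 is two local extents
  int  comm_proc        = (shift/L) % pd;    // data arrives from 2 ranks away
  return whole_subvolumes ? comm_proc : -1;
}
```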
@@ -73,6 +87,55 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
   return ret;
 }
 
+template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  GridBase *grid=rhs.Grid();
+  int comm_proc, xmit_to_rank, recv_from_rank;
+
+  int fd = rhs.Grid()->_fdimensions[dimension];
+  int rd = rhs.Grid()->_rdimensions[dimension];
+  int ld = rhs.Grid()->_ldimensions[dimension];
+  int pd = rhs.Grid()->_processors[dimension];
+  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
+  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
+
+  comm_proc = ((shift)/ld)%pd;
+
+  grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+  if(comm_dim) {
+
+    int64_t bytes = sizeof(vobj) * grid->oSites();
+
+    autoView(rhs_v , rhs, AcceleratorRead);
+    autoView(ret_v , ret, AcceleratorWrite);
+    void *send_buf = (void *)&rhs_v[0];
+    void *recv_buf = (void *)&ret_v[0];
+
+#ifdef ACCELERATOR_AWARE_MPI
+    grid->SendToRecvFrom(send_buf,
+                         xmit_to_rank,
+                         recv_buf,
+                         recv_from_rank,
+                         bytes);
+#else
+    static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
+    static hostVector<vobj> hret; hret.resize(grid->oSites());
+
+    void *hsend_buf = (void *)&hrhs[0];
+    void *hrecv_buf = (void *)&hret[0];
+
+    acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+
+    grid->SendToRecvFrom(hsend_buf,
+                         xmit_to_rank,
+                         hrecv_buf,
+                         recv_from_rank,
+                         bytes);
+
+    acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+  }
+}
 template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
   int sshift[2];
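`Cshift_simple` sends the whole local volume in one message, taking the `ACCELERATOR_AWARE_MPI` branch when MPI can read GPU memory directly and staging through persistent host buffers otherwise. The staging pattern, reduced to a sketch — the two copy helpers are hypothetical stand-ins for `acceleratorCopyFromDevice`/`acceleratorCopyToDevice`:

```cpp
// Sketch: host-staged exchange for MPI stacks that cannot touch GPU memory.
// d_send/d_recv are device pointers, h_send/h_recv pre-sized host buffers.
void staged_exchange(GridBase *grid,
                     void *d_send, void *d_recv,
                     void *h_send, void *h_recv,
                     int to, int from, int64_t bytes)
{
  copy_device_to_host(h_send, d_send, bytes);            // hypothetical helper
  grid->SendToRecvFrom(h_send, to, h_recv, from, bytes);
  copy_host_to_device(d_recv, h_recv, bytes);            // hypothetical helper
}
```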
@@ -60,12 +60,16 @@ inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
 {
+  double secs = 1.0*now.count()*1.0e-3;
+  stream << secs<<" s";
+  /*
   GridSecs second(1);
   auto secs = now/second ;
   auto subseconds  = now%second ;
   auto fill = stream.fill();
   stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
   stream.fill(fill);
+  */
   return stream;
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
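The rewritten stream operator prints a millisecond duration as a plain decimal seconds count instead of the old seconds.milliseconds split; a standalone sketch of the new behaviour:

```cpp
#include <chrono>
#include <iostream>

// Sketch of the new GridMillisecs printing: 1500 ms -> "1.5 s".
int main()
{
  std::chrono::milliseconds now(1500);
  double secs = 1.0*now.count()*1.0e-3;
  std::cout << secs << " s" << std::endl;
  return 0;
}
```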
@@ -10,12 +10,11 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-fermion-reps \
 --enable-simd=GPU \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
---disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
-LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
+LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft"
@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid.h>
 
 using namespace Grid;
-;
 
 int main (int argc, char ** argv)
 {
@@ -116,10 +115,10 @@ int main (int argc, char ** argv)
 
   Stilde=S;
   std::cout<<" Benchmarking FFT of LatticeSpinMatrix "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
 
   SpinMatrixD Sp;
   Sp = Zero(); Sp = Sp+cVol;
@@ -202,11 +201,16 @@ int main (int argc, char ** argv)
     FFT theFFT5(FGrid);
 
     theFFT5.FFT_dim(result5,tmp5,1,FFT::forward); tmp5 = result5;
+    std::cout<<"Fourier xformed Ddwf 1 "<<norm2(result5)<<std::endl;
     theFFT5.FFT_dim(result5,tmp5,2,FFT::forward); tmp5 = result5;
+    std::cout<<"Fourier xformed Ddwf 2 "<<norm2(result5)<<std::endl;
     theFFT5.FFT_dim(result5,tmp5,3,FFT::forward); tmp5 = result5;
-    theFFT5.FFT_dim(result5,tmp5,4,FFT::forward); result5 = result5*ComplexD(::sqrt(1.0/vol),0.0);
+    std::cout<<"Fourier xformed Ddwf 3 "<<norm2(result5)<<std::endl;
+    theFFT5.FFT_dim(result5,tmp5,4,FFT::forward);
+    std::cout<<"Fourier xformed Ddwf 4 "<<norm2(result5)<<std::endl;
+    result5 = result5*ComplexD(::sqrt(1.0/vol),0.0);
 
-    std::cout<<"Fourier xformed Ddwf"<<std::endl;
+    std::cout<<"Fourier xformed Ddwf "<<norm2(result5)<<std::endl;
 
     tmp5 = src5;
     theFFT5.FFT_dim(src5_p,tmp5,1,FFT::forward); tmp5 = src5_p;
@@ -214,7 +218,7 @@ int main (int argc, char ** argv)
     theFFT5.FFT_dim(src5_p,tmp5,3,FFT::forward); tmp5 = src5_p;
     theFFT5.FFT_dim(src5_p,tmp5,4,FFT::forward); src5_p = src5_p*ComplexD(::sqrt(1.0/vol),0.0);
 
-    std::cout<<"Fourier xformed src5"<<std::endl;
+    std::cout<<"Fourier xformed src5"<< norm2(src5)<<" -> "<<norm2(src5_p)<<std::endl;
 
     /////////////////////////////////////////////////////////////////
     // work out the predicted from Fourier
@@ -251,7 +255,8 @@ int main (int argc, char ** argv)
       Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src5_p);
 
     }
 
+    std::cout << " src5 "<<norm2(src5_p)<<std::endl;
+    std::cout << " Kinetic "<<norm2(Kinetic)<<std::endl;
     // NB implicit sum over mu
     //
     //  1-1/2 Dw = 1 - 1/2 ( eip+emip)
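The instrumented check is assembling the free momentum-space Wilson kernel that the 5d result is compared against. In the notation of the code, `sk2` is the summed half-angle sine term and `W = one - M5 + sk2`, so the operator being built is

\[ \tilde D(p) \;=\; i\sum_\mu \gamma_\mu \sin p_\mu \;+\; \Big(1 - M_5 + \sum_\mu 2\sin^2(p_\mu/2)\Big), \]

plus the mass/chirality boundary terms inserted by the two `Cshift`/`where` lines that follow. The added `norm2` printouts simply expose each contribution for debugging.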
@@ -260,18 +265,23 @@ int main (int argc, char ** argv)
     //    = 2 sink/2 ink/2 = sk2
 
     W = one - M5 + sk2;
+    std::cout << " W "<<norm2(W)<<std::endl;
     Kinetic = Kinetic + W * src5_p;
 
+    std::cout << " Kinetic "<<norm2(Kinetic)<<std::endl;
 
     LatticeCoordinate(scoor,sdir);
 
     tmp5 = Cshift(src5_p,sdir,+1);
     tmp5 = (tmp5 - G5*tmp5)*0.5;
     tmp5 = where(scoor==Integer(Ls-1),mass*tmp5,-tmp5);
+    std::cout << " tmp5 "<<norm2(tmp5)<<std::endl;
     Kinetic = Kinetic + tmp5;
 
     tmp5 = Cshift(src5_p,sdir,-1);
     tmp5 = (tmp5 + G5*tmp5)*0.5;
     tmp5 = where(scoor==Integer(0),mass*tmp5,-tmp5);
+    std::cout << " tmp5 "<<norm2(tmp5)<<std::endl;
     Kinetic = Kinetic + tmp5;
 
     std::cout<<"Momentum space Ddwf "<< norm2(Kinetic)<<std::endl;
@@ -339,7 +349,7 @@ int main (int argc, char ** argv)
     Ddwf.Mdag(src5,tmp5);
     src5=tmp5;
     MdagMLinearOperator<DomainWallFermionD,LatticeFermionD> HermOp(Ddwf);
-    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
     CG(HermOp,src5,result5);
 
     ////////////////////////////////////////////////////////////////////////
@@ -423,7 +433,7 @@ int main (int argc, char ** argv)
     Dov.Mdag(src5,tmp5);
     src5=tmp5;
     MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
-    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
     CG(HermOp,src5,result5);
 
     ////////////////////////////////////////////////////////////////////////