mirror of https://github.com/paboyle/Grid.git synced 2025-08-27 16:37:10 +01:00

FFT offload to GPU and MUCH faster comms.

40x speed up on Frontier
Author: Peter Boyle
Date:   2025-08-21 16:44:55 -04:00
parent 76c0ada1e1
commit fe0db53842

8 changed files with 443 additions and 176 deletions
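For orientation only (not part of the diff): the code path this commit accelerates is driven through Grid's FFT class. A minimal usage sketch, with the grid and field names chosen purely for illustration:

// Illustrative usage sketch; grid and field names are assumptions, not from this commit.
#include <Grid/Grid.h>
using namespace Grid;

void fft_example(GridCartesian *UGrid, LatticeComplexD &in, LatticeComplexD &out)
{
  FFT theFFT(UGrid);
  theFFT.FFT_all_dim(out, in, FFT::forward);     // forward transform in every dimension
  theFFT.FFT_dim(out, out, 3, FFT::backward);    // single-dimension backward transform along dim 3
  std::cout << theFFT.MFlops() << " MFlop/s in the last FFT_dim call" << std::endl;
}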


@@ -28,6 +28,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef GRID_CUDA
#include <cufft.h>
#endif
#ifdef GRID_HIP
#include <hipfft/hipfft.h>
#endif
#ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
@@ -38,85 +46,184 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);

-template<class scalar> struct FFTW { };
-#ifdef HAVE_FFTW
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD  (-1)
+#define FFTW_BACKWARD (+1)
+#define FFTW_ESTIMATE (0)
+#endif
+
+template<class scalar> struct FFTW {
+};
#ifdef GRID_HIP
template<> struct FFTW<ComplexD> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef hipfftDoubleComplex FFTW_scalar;
typedef hipfftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
GRID_ASSERT(rv==HIPFFT_SUCCESS);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
hipfftResult rv;
if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
else rv =hipfftExecZ2Z(p,in,out,HIPFFT_BACKWARD);
accelerator_barrier();
GRID_ASSERT(rv==HIPFFT_SUCCESS);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
hipfftDestroy(p);
}
};
template<> struct FFTW<ComplexF> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef hipfftComplex FFTW_scalar;
typedef hipfftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
GRID_ASSERT(rv==HIPFFT_SUCCESS);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
hipfftResult rv;
if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
else rv =hipfftExecC2C(p,in,out,HIPFFT_BACKWARD);
accelerator_barrier();
GRID_ASSERT(rv==HIPFFT_SUCCESS);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
hipfftDestroy(p);
}
};
#endif
#ifdef GRID_CUDA
template<> struct FFTW<ComplexD> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftDoubleComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
else cufftExecZ2Z(p,in,out,CUFFT_BACKWARD);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
cufftDestroy(p);
}
};
template<> struct FFTW<ComplexF> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
else cufftExecC2C(p,in,out,CUFFT_BACKWARD);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
cufftDestroy(p);
}
};
#endif
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
  typedef fftw_complex FFTW_scalar;
  typedef fftw_plan    FFTW_plan;

- static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-                                     FFTW_scalar *in, const int *inembed,
+ static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                     FFTW_scalar *in, int *inembed,
                                      int istride, int idist,
-                                     FFTW_scalar *out, const int *onembed,
+                                     FFTW_scalar *out, int *onembed,
                                      int ostride, int odist,
                                      int sign, unsigned flags) {
    return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
  }
- static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-   ::fftw_flops(p,add,mul,fmas);
- }
- inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+ inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    ::fftw_execute_dft(p,in,out);
  }
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    ::fftw_destroy_plan(p);
  }
};

template<> struct FFTW<ComplexF> {
public:
  typedef fftwf_complex FFTW_scalar;
  typedef fftwf_plan    FFTW_plan;

- static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-                                     FFTW_scalar *in, const int *inembed,
+ static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                     FFTW_scalar *in, int *inembed,
                                      int istride, int idist,
-                                     FFTW_scalar *out, const int *onembed,
+                                     FFTW_scalar *out, int *onembed,
                                      int ostride, int odist,
                                      int sign, unsigned flags) {
    return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
  }
- static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-   ::fftwf_flops(p,add,mul,fmas);
- }
- inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+ inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    ::fftwf_execute_dft(p,in,out);
  }
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    ::fftwf_destroy_plan(p);
  }
};
-#endif
-#ifndef FFTW_FORWARD
-#define FFTW_FORWARD (-1)
-#define FFTW_BACKWARD (+1)
#endif
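The cuFFT/hipFFT specializations above keep FFTW's plan-many calling convention, so the rest of FFT_dim stays backend-agnostic. Below is a standalone sketch (not code from the commit; sizes are illustrative assumptions) of the batched 1-D plan they create: unit stride within a transform and distance G between transforms, matching the istride/idist choices made further down in FFT_dim.

// Standalone hipFFT sketch of a batched 1-D Z2Z transform; G and howmany are assumed values.
#include <hip/hip_runtime.h>
#include <hipfft/hipfft.h>
#include <cassert>

void batched_fft_sketch(void)
{
  int G = 64;            // transform length (global extent of the FFT dimension)
  int howmany = 1024;    // number of batched transforms (Ncomp * Nperp in FFT_dim)

  hipfftDoubleComplex *buf = nullptr;
  hipMalloc(&buf, sizeof(hipfftDoubleComplex) * size_t(G) * howmany);

  int n[1] = {G};
  hipfftHandle plan;
  // stride 1 inside a transform, distance G between consecutive transforms
  hipfftResult rv = hipfftPlanMany(&plan, 1, n, n, 1, G, n, 1, G, HIPFFT_Z2Z, howmany);
  assert(rv == HIPFFT_SUCCESS);

  rv = hipfftExecZ2Z(plan, buf, buf, HIPFFT_FORWARD);   // in-place forward transform
  assert(rv == HIPFFT_SUCCESS);
  hipDeviceSynchronize();                               // FFT work is asynchronous

  hipfftDestroy(plan);
  hipFree(buf);
}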
class FFT {
private:

- GridCartesian *vgrid;
- GridCartesian *sgrid;
- int Nd;
  double flops;
  double flops_call;
  uint64_t usec;

- Coordinate dimensions;
- Coordinate processors;
- Coordinate processor_coor;

public:

  static const int forward=FFTW_FORWARD;
@@ -126,31 +233,25 @@ public:
  double MFlops(void) {return flops/usec;}
  double USec(void)   {return (double)usec;}

- FFT ( GridCartesian * grid ) :
-   vgrid(grid),
-   Nd(grid->_ndimension),
-   dimensions(grid->_fdimensions),
-   processors(grid->_processors),
-   processor_coor(grid->_processor_coor)
+ FFT ( GridCartesian * grid )
  {
    flops=0;
    usec =0;
-   Coordinate layout(Nd,1);
-   sgrid = new GridCartesian(dimensions,layout,processors,*grid);
  };

  ~FFT ( void)  {
-   delete sgrid;
+   // delete sgrid;
  }

  template<class vobj>
  void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){

-   conformable(result.Grid(),vgrid);
-   conformable(source.Grid(),vgrid);
-   Lattice<vobj> tmp(vgrid);
-   tmp = source;
-   for(int d=0;d<Nd;d++){
+   // vgrid=result.Grid();
+   // conformable(result.Grid(),vgrid);
+   // conformable(source.Grid(),vgrid);
+   const int Ndim = source.Grid()->Nd();
+   Lattice<vobj> tmp = source;
+   for(int d=0;d<Ndim;d++){
      if( mask[d] ) {
        FFT_dim(result,tmp,d,sign);
        tmp=result;
@@ -160,62 +261,70 @@ public:
  template<class vobj>
  void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
-   Coordinate mask(Nd,1);
+   const int Ndim = source.Grid()->Nd();
+   Coordinate mask(Ndim,1);
    FFT_dim_mask(result,source,mask,sign);
  }
  template<class vobj>
  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
-#ifndef HAVE_FFTW
-   std::cerr << "FFTW is not compiled but is called"<<std::endl;
-   GRID_ASSERT(0);
-#else
-   conformable(result.Grid(),vgrid);
-   conformable(source.Grid(),vgrid);
+   const int Ndim = source.Grid()->Nd();
+   GridBase *grid = source.Grid();
+   conformable(result.Grid(),source.Grid());

-   int L = vgrid->_ldimensions[dim];
-   int G = vgrid->_fdimensions[dim];
+   int L = grid->_ldimensions[dim];
+   int G = grid->_fdimensions[dim];

-   Coordinate layout(Nd,1);
-   Coordinate pencil_gd(vgrid->_fdimensions);
-   pencil_gd[dim] = G*processors[dim];
-   // Pencil global vol LxLxGxLxL per node
-   GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
+   Coordinate layout(Ndim,1);

    // Construct pencils
    typedef typename vobj::scalar_object sobj;
-   typedef typename sobj::scalar_type scalar;
+   typedef typename vobj::scalar_type scalar;
+   typedef typename vobj::scalar_type scalar_type;
+   typedef typename vobj::vector_type vector_type;

-   Lattice<sobj> pgbuf(&pencil_g);
-   autoView(pgbuf_v , pgbuf, CpuWrite);
    //std::cout << "CPU view" << std::endl;

    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;

    int Ncomp = sizeof(sobj)/sizeof(scalar);
-   int Nlow  = 1;
+   int64_t Nlow  = 1;
+   int64_t Nhigh = 1;
    for(int d=0;d<dim;d++){
-     Nlow*=vgrid->_ldimensions[d];
+     Nlow*=grid->_ldimensions[d];
    }
+   for(int d=dim+1;d<Ndim;d++){
+     Nhigh*=grid->_ldimensions[d];
+   }
+   int64_t Nperp=Nlow*Nhigh;
+
+   deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
+   pgbuf.resize(Nperp*Ncomp*G);
+   scalar *pgbuf_v = &pgbuf[0];

    int rank = 1;  /* 1d transforms */
    int n[] = {G}; /* 1d transforms of length G */
-   int howmany = Ncomp;
+   int howmany = Ncomp * Nperp;
    int odist,idist,istride,ostride;
-   idist   = odist   = 1;          /* Distance between consecutive FT's */
-   istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+   idist   = odist   = G;          /* Distance between consecutive FT's */
+   istride = ostride = 1;          /* Distance between two elements in the same FT */
    int *inembed = n, *onembed = n;

    scalar div;
    if ( sign == backward ) div = 1.0/G;
    else if ( sign == forward ) div = 1.0;
    else GRID_ASSERT(0);

+   double t_pencil=0;
+   double t_fft   =0;
+   double t_total =-usecond();
    // std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
    FFTW_plan p;
    {
      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,72 +338,154 @@ public:
    }

    // Barrel shift and collect global pencil
    // std::cout << GridLogPerformance<<"Making pencil" << std::endl;
-   Coordinate lcoor(Nd), gcoor(Nd);
+   Coordinate lcoor(Ndim), gcoor(Ndim);
+   double t_copy=0;
+   double t_shift=0;
+   t_pencil = -usecond();
    result = source;
-   int pc = processor_coor[dim];
+   int pc = grid->_processor_coor[dim];
+   const Coordinate ldims = grid->_ldimensions;
+   const Coordinate rdims = grid->_rdimensions;
+   const Coordinate sdims = grid->_simd_layout;
+   Coordinate processors  = grid->_processors;
+   Coordinate pgdims(Ndim);
+   pgdims[0] = G;
+   for(int d=0, dd=1;d<Ndim;d++){
+     if ( d!=dim ) pgdims[dd++] = ldims[d];
+   }
+   int64_t pgvol=1;
+   for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
+   const int Nsimd = vobj::Nsimd();
    for(int p=0;p<processors[dim];p++) {
-     {
-       autoView(r_v,result,CpuRead);
-       autoView(p_v,pgbuf,CpuWrite);
-       thread_for(idx, sgrid->lSites(),{
-         Coordinate cbuf(Nd);
-         sobj s;
-         sgrid->LocalIndexToLocalCoor(idx,cbuf);
-         peekLocalSite(s,r_v,cbuf);
-         cbuf[dim]+=((pc+p) % processors[dim])*L;
-         pokeLocalSite(s,p_v,cbuf);
-       });
-     }
+     t_copy-=usecond();
+     autoView(r_v,result,AcceleratorRead);
+     accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
+#ifdef GRID_SIMT
+       {
+         int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+       for(int lane=0;lane<Nsimd;lane++) {
+#endif
+         Coordinate icoor;
+         Coordinate ocoor;
+         Coordinate pgcoor;
+         Lexicographic::CoorFromIndex(icoor,lane,sdims);
+         Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+         pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
+         for(int d=0,dd=1;d<Ndim;d++){
+           if ( d!=dim ) {
+             pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+             dd++;
+           }
+         }
+         // Map coordinates in lattice layout to FFTW index
+         int64_t pgidx;
+         Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+         vector_type *from = (vector_type *)&r_v[idx];
+         scalar_type stmp;
+         for(int w=0;w<Ncomp;w++){
+           int64_t pg_idx = pgidx + w*pgvol;
+           stmp = getlane(from[w], lane);
+           pgbuf_v[pg_idx] = stmp;
+         }
+#ifdef GRID_SIMT
+       }
+#else
+       }
+#endif
+     });
+     t_copy+=usecond();
      if (p != processors[dim] - 1) {
-       result = Cshift(result,dim,L);
+       Lattice<vobj> temp(grid);
+       t_shift-=usecond();
+       temp = Cshift(result,dim,L); result = temp;
+       t_shift+=usecond();
      }
    }
+   t_pencil += usecond();

-   //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
-   // Loop over orthog coords
-   int NN=pencil_g.lSites();
-   GridStopWatch timer;
-   timer.Start();
-   thread_for( idx,NN,{
-     Coordinate cbuf(Nd);
-     pencil_g.LocalIndexToLocalCoor(idx, cbuf);
-     if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
-       FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
-       FFTW<scalar>::fftw_execute_dft(p,in,out);
-     }
-   });
-   timer.Stop();
+   FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
+   FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
+   t_fft = -usecond();
+   FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
+   t_fft += usecond();

    // performance counting
-   double add,mul,fma;
-   FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-   flops_call = add+mul+2.0*fma;
-   usec += timer.useconds();
-   flops+= flops_call*NN;
+   flops_call = 5.0*howmany*G*log2(G);
+   usec  = t_fft;
+   flops = flops_call;

-   //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
+   result = Zero();
+   double t_insert = -usecond();
    // writing out result
    {
-     autoView(pgbuf_v,pgbuf,CpuRead);
-     autoView(result_v,result,CpuWrite);
-     thread_for(idx,sgrid->lSites(),{
-       Coordinate clbuf(Nd), cgbuf(Nd);
-       sobj s;
-       sgrid->LocalIndexToLocalCoor(idx,clbuf);
-       cgbuf = clbuf;
-       cgbuf[dim] = clbuf[dim]+L*pc;
-       peekLocalSite(s,pgbuf_v,cgbuf);
-       pokeLocalSite(s,result_v,clbuf);
+     autoView(r_v,result,AcceleratorWrite);
+     accelerator_for(idx,grid->oSites(),Nsimd,{
+#ifdef GRID_SIMT
+       {
+         int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+       for(int lane=0;lane<Nsimd;lane++) {
+#endif
+         Coordinate icoor(Ndim);
+         Coordinate ocoor(Ndim);
+         Coordinate pgcoor(Ndim);
+         Lexicographic::CoorFromIndex(icoor,lane,sdims);
+         Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+         pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
+         for(int d=0,dd=1;d<Ndim;d++){
+           if ( d!=dim ) {
+             pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+             dd++;
+           }
+         }
+         // Map coordinates in lattice layout to FFTW index
+         int64_t pgidx;
+         Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+         vector_type *to = (vector_type *)&r_v[idx];
+         scalar_type stmp;
+         for(int w=0;w<Ncomp;w++){
+           int64_t pg_idx = pgidx + w*pgvol;
+           stmp = pgbuf_v[pg_idx];
+           putlane(to[w], stmp, lane);
+         }
+#ifdef GRID_SIMT
+       }
+#else
+       }
+#endif
      });
    }
    result = result*div;
+   t_insert +=usecond();

-   //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
    // destroying plan
    FFTW<scalar>::fftw_destroy_plan(p);
-#endif
+
+   t_total +=usecond();
+   std::cout <<GridLogPerformance<< " FFT took    "<<t_total/1.0e6 <<" s" << std::endl;
+   std::cout <<GridLogPerformance<< " FFT pencil  "<<t_pencil/1.0e6 <<" s" << std::endl;
+   std::cout <<GridLogPerformance<< "   of which copy  "<<t_copy/1.0e6 <<" s" << std::endl;
+   std::cout <<GridLogPerformance<< "   of which shift "<<t_shift/1.0e6 <<" s" << std::endl;
+   std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
+   std::cout <<GridLogPerformance<< " FFT insert  "<<t_insert/1.0e6 <<" s" << std::endl;
  }
};
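A reading aid for the pencil bookkeeping above (not part of the commit): pgdims puts the FFT dimension first, so Lexicographic::IndexFromCoor makes it the unit-stride index, and each of the Ncomp scalar components occupies a contiguous block of pgvol = G*Nperp scalars; the reported rate uses the standard 5 N log2 N flop estimate per complex transform. A toy check with assumed sizes:

// Toy index/flops check for the pencil layout; all numbers are illustrative assumptions.
#include <cstdint>
#include <cmath>
#include <cassert>

int main()
{
  // Assume a 4^4 local volume with the FFT along dim 0, global extent G=8, and Ncomp=12 components.
  int64_t G = 8, Nlow = 1, Nhigh = 4*4*4;   // Nlow: dims below 'dim', Nhigh: dims above it
  int64_t Nperp = Nlow*Nhigh;               // 64 one-dimensional pencils per component
  int     Ncomp = 12;
  int64_t pgvol = G*Nperp;                  // scalars in one component's pencil block

  // Component w, pencil 'perp', depth g along the FFT dimension:
  auto index = [&](int64_t g, int64_t perp, int w) { return (g + G*perp) + w*pgvol; };
  assert(index(G-1, Nperp-1, Ncomp-1) == Nperp*Ncomp*G - 1);   // fills the buffer exactly

  // 5 N log2(N) flops per length-G complex FFT, batched over howmany = Ncomp*Nperp transforms.
  double flops = 5.0 * (Ncomp*Nperp) * G * std::log2((double)G);  // = 5*768*8*3 = 92160
  (void)flops;
  return 0;
}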


@@ -108,7 +108,7 @@ public:
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
- static void BroadcastWorld(int root,void* data, int bytes);
+ static void BroadcastWorld(int root,void* data, uint64_t bytes);
  static void BarrierWorld(void);

  ////////////////////////////////////////////////////////////

@@ -175,27 +175,27 @@ public:
                         int dest,
                         void *recv,
                         int from,
-                        int bytes,int dir);
+                        uint64_t bytes,int dir);

  void SendToRecvFrom(void *xmit,
                      int xmit_to_rank,
                      void *recv,
                      int recv_from_rank,
-                     int bytes);
+                     uint64_t bytes);

  int IsOffNode(int rank);

  double StencilSendToRecvFrom(void *xmit,
                               int xmit_to_rank,int do_xmit,
                               void *recv,
                               int recv_from_rank,int do_recv,
-                              int bytes,int dir);
+                              uint64_t bytes,int dir);

  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
                                      void *xmit,
                                      int xmit_to_rank,int do_xmit,
                                      void *recv,
                                      int recv_from_rank,int do_recv,
-                                     int xbytes,int rbytes,int dir);
+                                     uint64_t xbytes,uint64_t rbytes,int dir);

  // Could do a PollHtoD and have a CommsMerge dependence
  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);

@@ -206,7 +206,7 @@ public:
                                    int xmit_to_rank,int do_xmit,
                                    void *recv,void *recv_comp,
                                    int recv_from_rank,int do_recv,
-                                   int xbytes,int rbytes,int dir);
+                                   uint64_t xbytes,uint64_t rbytes,int dir);

  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);

@@ -220,7 +220,7 @@ public:
  ////////////////////////////////////////////////////////////
  // Broadcast a buffer and composite larger
  ////////////////////////////////////////////////////////////
- void Broadcast(int root,void* data, int bytes);
+ void Broadcast(int root,void* data, uint64_t bytes);

  ////////////////////////////////////////////////////////////
  // All2All down one dimension
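The widening of these byte counts to uint64_t pairs with the switch (in the MPI implementation below) from MPI_CHAR to MPI_INT32_T: MPI's count argument is a 32-bit int, so counting in 4-byte words lets a single message carry up to ~8 GiB. A hypothetical helper that captures the conversion the commit inlines at each call site (not code from the commit):

// Hypothetical helper; mirrors the GRID_ASSERT and the (int)(bytes/sizeof(int32_t)) counts below.
#include <cstdint>
#include <cassert>

inline int mpi_int32_count(uint64_t bytes)
{
  assert(bytes % sizeof(int32_t) == 0);          // payloads are assumed word-aligned
  uint64_t words = bytes / sizeof(int32_t);
  assert(words <= (uint64_t)2*1024*1024*1024);   // must still fit MPI's int 'count'
  return (int)words;                             // pass with datatype MPI_INT32_T
}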


@@ -342,23 +342,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
                                       int dest,
                                       void *recv,
                                       int from,
-                                      int bytes,int dir)
+                                      uint64_t bytes,int dir)
{
  MPI_Request xrq;
  MPI_Request rrq;

  GRID_ASSERT(dest != _processor);
  GRID_ASSERT(from != _processor);

+ GRID_ASSERT(bytes/(sizeof(int32_t))<= 2*1024*1024*1024);
+
  int tag;

  tag= dir+from*32;
- int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
+ int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
  GRID_ASSERT(ierr==0);
  list.push_back(rrq);

  tag= dir+_processor*32;
- ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
+ ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
  GRID_ASSERT(ierr==0);
  list.push_back(xrq);
}

@@ -379,7 +379,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
-                                          int bytes)
+                                          uint64_t bytes)
{
  std::vector<MpiCommsRequest_t> reqs(0);

@@ -392,8 +392,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
- ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-                   recv,bytes,MPI_CHAR,from, from,
+ ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank,
+                   recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from,
                    communicator,MPI_STATUS_IGNORE);
  GRID_ASSERT(ierr==0);

@@ -403,7 +403,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                     int dest, int dox,
                                                     void *recv,
                                                     int from, int dor,
-                                                    int bytes,int dir)
+                                                    uint64_t bytes,int dir)
{
  std::vector<CommsRequest_t> list;
  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);

@@ -426,7 +426,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                           int dest,int dox,
                                                           void *recv,
                                                           int from,int dor,
-                                                          int xbytes,int rbytes,int dir)
+                                                          uint64_t xbytes,uint64_t rbytes,int dir)
{
  return 0.0; // Do nothing -- no preparation required
}

@@ -435,7 +435,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                         int dest,int dox,
                                                         void *recv,void *recv_comp,
                                                         int from,int dor,
-                                                        int xbytes,int rbytes,int dir)
+                                                        uint64_t xbytes,uint64_t rbytes,int dir)
{
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;

@@ -458,7 +458,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
    tag= dir+from*32;
    // std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
-   ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+   ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
    GRID_ASSERT(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=rbytes;

@@ -476,7 +476,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
-     ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+     ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
      GRID_ASSERT(ierr==0);
      list.push_back(xrq);
      off_node_bytes+=xbytes;

@@ -543,7 +543,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                           int dest,int dox,
                                                           void *recv,
                                                           int from,int dor,
-                                                          int xbytes,int rbytes,int dir)
+                                                          uint64_t xbytes,uint64_t rbytes,int dir)
{
  /*
   * Bring sequence from Stencil.h down to lower level.

@@ -584,7 +584,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
    tag= dir+from*32;
    host_recv = this->HostBufferMalloc(rbytes);
-   ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+   ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
    GRID_ASSERT(ierr==0);
    CommsRequest_t srq;
    srq.PacketType = InterNodeRecv;

@@ -686,7 +686,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
  if ( acceleratorEventIsComplete(list[idx].ev) ) {
    void *host_xmit = list[idx].host_buf;
-   uint32_t xbytes = list[idx].bytes;
+   uint64_t xbytes = list[idx].bytes;
    int dest = list[idx].dest;
    int tag  = list[idx].tag;
    int commdir = list[idx].commdir;

@@ -697,7 +697,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
    // std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
    MPI_Request xrq;
-   int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+   int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
    GRID_ASSERT(ierr==0);

    list[idx].req = xrq; // Update the MPI request in the list

@@ -718,7 +718,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                         int dest,int dox,
                                                         void *recv,void *recv_comp,
                                                         int from,int dor,
-                                                        int xbytes,int rbytes,int dir)
+                                                        uint64_t xbytes,uint64_t rbytes,int dir)
{
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;

@@ -884,11 +884,11 @@ void CartesianCommunicator::Barrier(void)
  int ierr = MPI_Barrier(communicator);
  GRID_ASSERT(ierr==0);
}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes)
{
  FlightRecorder::StepLog("Broadcast");
  int ierr=MPI_Bcast(data,
-                    bytes,
+                    (int)bytes,
                     MPI_BYTE,
                     root,
                     communicator);

@@ -904,11 +904,11 @@ void CartesianCommunicator::BarrierWorld(void){
  int ierr = MPI_Barrier(communicator_world);
  GRID_ASSERT(ierr==0);
}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes)
{
  FlightRecorder::StepLog("BroadcastWorld");
  int ierr= MPI_Bcast(data,
-                     bytes,
+                     (int)bytes,
                      MPI_BYTE,
                      root,
                      communicator_world);


@@ -89,7 +89,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
-                                          int bytes)
+                                          uint64_t bytes)
{
  GRID_ASSERT(0);
}

@@ -99,7 +99,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
                                                int dest,
                                                void *recv,
                                                int from,
-                                               int bytes,int dir)
+                                               uint64_t bytes,int dir)
{
  GRID_ASSERT(0);
}

@@ -115,8 +115,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int  CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
+void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }

@@ -132,7 +132,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                     int xmit_to_rank,int dox,
                                                     void *recv,
                                                     int recv_from_rank,int dor,
-                                                    int bytes, int dir)
+                                                    uint64_t bytes, int dir)
{
  return 2.0*bytes;
}

@@ -143,7 +143,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
                                                           int xmit_to_rank,int dox,
                                                           void *recv,
                                                           int recv_from_rank,int dor,
-                                                          int xbytes,int rbytes, int dir)
+                                                          uint64_t xbytes,uint64_t rbytes, int dir)
{
  return 0.0;
}

@@ -152,7 +152,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                         int xmit_to_rank,int dox,
                                                         void *recv, void *recv_comp,
                                                         int recv_from_rank,int dor,
-                                                        int xbytes,int rbytes, int dir)
+                                                        uint64_t xbytes,uint64_t rbytes, int dir)
{
  return xbytes+rbytes;
}


@@ -49,6 +49,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  // Map to always positive shift modulo global full dimension.
  shift = (shift+fd)%fd;
if( shift ==0 ) {
ret = rhs;
return ret;
}
//
// Potential easy fast cases:
// Shift is a multiple of the local lattice extent.
// Then need only to shift whole subvolumes
int L = rhs.Grid()->_ldimensions[dimension];
if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
Cshift_simple(ret,rhs,dimension,shift);
return ret;
}
  ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);

  // the permute type
@@ -73,6 +87,55 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  return ret;
}
template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
GridBase *grid=rhs.Grid();
int comm_proc, xmit_to_rank, recv_from_rank;
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int ld = rhs.Grid()->_ldimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
comm_proc = ((shift)/ld)%pd;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
if(comm_dim) {
int64_t bytes = sizeof(vobj) * grid->oSites();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(ret_v , ret, AcceleratorWrite);
void *send_buf = (void *)&rhs_v[0];
void *recv_buf = (void *)&ret_v[0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom(send_buf,
xmit_to_rank,
recv_buf,
recv_from_rank,
bytes);
#else
static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
static hostVector<vobj> hret; hret.resize(grid->oSites());
void *hsend_buf = (void *)&hrhs[0];
void *hrecv_buf = (void *)&hret[0];
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom(hsend_buf,
xmit_to_rank,
hrecv_buf,
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
}
}
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
  int sshift[2];
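Cshift_simple above is the new fast path: when the shift is an exact multiple of the local extent in an un-checkerboarded dimension, every site keeps its local coordinate and only whole slabs move between ranks, so one SendToRecvFrom of the full local volume replaces the general gather/scatter. A worked example of the rank arithmetic with assumed sizes (not taken from the commit):

// Worked example of the Cshift_simple rank selection; all values are illustrative.
#include <cassert>

int main()
{
  int ld = 8;                        // local extent of the shifted dimension
  int pd = 4;                        // processor grid extent in that dimension
  int fd = ld*pd;                    // global extent = 32
  int shift = 16;                    // a multiple of ld, so the fast path applies
  (void)fd;

  assert(shift % ld == 0);
  int comm_proc = (shift/ld) % pd;   // = 2: each rank exchanges whole slabs with the rank two steps away
  assert(comm_proc == 2);
  return 0;
}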


@@ -60,12 +60,16 @@ inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
}
inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
{
  double secs = 1.0*now.count()*1.0e-3;
  stream << secs<<" s";
  /*
  GridSecs second(1);
  auto secs = now/second ;
  auto subseconds = now%second ;
  auto fill = stream.fill();
  stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
  stream.fill(fill);
  */
  return stream;
}
inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)


@@ -10,12 +10,11 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
-LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
+LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft"


@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid.h>

using namespace Grid;
-;

int main (int argc, char ** argv)
{
@@ -116,10 +115,10 @@ int main (int argc, char ** argv)
  Stilde=S;

  std::cout<<" Benchmarking FFT of LatticeSpinMatrix "<<std::endl;

- theFFT.FFT_dim(Stilde,S,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
- theFFT.FFT_dim(Stilde,S,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
- theFFT.FFT_dim(Stilde,S,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
- theFFT.FFT_dim(Stilde,S,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+ theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+ theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+ theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+ theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;

  SpinMatrixD Sp;
  Sp = Zero(); Sp = Sp+cVol;
@@ -202,11 +201,16 @@ int main (int argc, char ** argv)
  FFT theFFT5(FGrid);

  theFFT5.FFT_dim(result5,tmp5,1,FFT::forward); tmp5 = result5;
+ std::cout<<"Fourier xformed Ddwf 1 "<<norm2(result5)<<std::endl;
  theFFT5.FFT_dim(result5,tmp5,2,FFT::forward); tmp5 = result5;
+ std::cout<<"Fourier xformed Ddwf 2 "<<norm2(result5)<<std::endl;
  theFFT5.FFT_dim(result5,tmp5,3,FFT::forward); tmp5 = result5;
+ std::cout<<"Fourier xformed Ddwf 3 "<<norm2(result5)<<std::endl;
- theFFT5.FFT_dim(result5,tmp5,4,FFT::forward); result5 = result5*ComplexD(::sqrt(1.0/vol),0.0);
- std::cout<<"Fourier xformed Ddwf"<<std::endl;
+ theFFT5.FFT_dim(result5,tmp5,4,FFT::forward);
+ std::cout<<"Fourier xformed Ddwf 4 "<<norm2(result5)<<std::endl;
+ result5 = result5*ComplexD(::sqrt(1.0/vol),0.0);
+ std::cout<<"Fourier xformed Ddwf "<<norm2(result5)<<std::endl;

  tmp5 = src5;
  theFFT5.FFT_dim(src5_p,tmp5,1,FFT::forward); tmp5 = src5_p;

@@ -214,7 +218,7 @@ int main (int argc, char ** argv)
  theFFT5.FFT_dim(src5_p,tmp5,3,FFT::forward); tmp5 = src5_p;
  theFFT5.FFT_dim(src5_p,tmp5,4,FFT::forward); src5_p = src5_p*ComplexD(::sqrt(1.0/vol),0.0);
- std::cout<<"Fourier xformed src5"<<std::endl;
+ std::cout<<"Fourier xformed src5"<< norm2(src5)<<" -> "<<norm2(src5_p)<<std::endl;

  /////////////////////////////////////////////////////////////////
  // work out the predicted from Fourier
@@ -251,7 +255,8 @@ int main (int argc, char ** argv)
    Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src5_p);
  }

  std::cout << " src5    "<<norm2(src5_p)<<std::endl;
  std::cout << " Kinetic "<<norm2(Kinetic)<<std::endl;
  // NB implicit sum over mu
  //
  //  1-1/2 Dw = 1 - 1/2 ( eip+emip)
@@ -260,18 +265,23 @@ int main (int argc, char ** argv)
  //    = 2 sink/2 ink/2 = sk2

  W = one - M5 + sk2;
  std::cout << " W       "<<norm2(W)<<std::endl;

  Kinetic = Kinetic + W * src5_p;
  std::cout << " Kinetic "<<norm2(Kinetic)<<std::endl;

  LatticeCoordinate(scoor,sdir);

  tmp5 = Cshift(src5_p,sdir,+1);
  tmp5 = (tmp5 - G5*tmp5)*0.5;
  tmp5 = where(scoor==Integer(Ls-1),mass*tmp5,-tmp5);
  std::cout << " tmp5    "<<norm2(tmp5)<<std::endl;
  Kinetic = Kinetic + tmp5;

  tmp5 = Cshift(src5_p,sdir,-1);
  tmp5 = (tmp5 + G5*tmp5)*0.5;
  tmp5 = where(scoor==Integer(0),mass*tmp5,-tmp5);
  std::cout << " tmp5    "<<norm2(tmp5)<<std::endl;
  Kinetic = Kinetic + tmp5;

  std::cout<<"Momentum space Ddwf  "<< norm2(Kinetic)<<std::endl;
@@ -339,7 +349,7 @@ int main (int argc, char ** argv)
  Ddwf.Mdag(src5,tmp5);
  src5=tmp5;

  MdagMLinearOperator<DomainWallFermionD,LatticeFermionD> HermOp(Ddwf);
- ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+ ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  CG(HermOp,src5,result5);

  ////////////////////////////////////////////////////////////////////////
@@ -423,7 +433,7 @@ int main (int argc, char ** argv)
  Dov.Mdag(src5,tmp5);
  src5=tmp5;

  MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
- ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+ ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  CG(HermOp,src5,result5);

  ////////////////////////////////////////////////////////////////////////