From fe0db5384212b595ce2cba3192fa0f249a3aa651 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Aug 2025 16:44:55 -0400 Subject: [PATCH] FFT offload to GPU and MUCH faster comms. 40x speed up on Frontier --- Grid/algorithms/FFT.h | 447 ++++++++++++++++++------- Grid/communicator/Communicator_base.h | 14 +- Grid/communicator/Communicator_mpi3.cc | 42 +-- Grid/communicator/Communicator_none.cc | 14 +- Grid/cshift/Cshift_mpi.h | 63 ++++ Grid/perfmon/Timer.h | 4 + systems/Frontier/config-command | 3 +- tests/core/Test_fft.cc | 32 +- 8 files changed, 443 insertions(+), 176 deletions(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 36a30d38..1c0cc058 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -28,6 +28,14 @@ Author: Peter Boyle #ifndef _GRID_FFT_H_ #define _GRID_FFT_H_ +#ifdef GRID_CUDA +#include +#endif + +#ifdef GRID_HIP +#include +#endif + #ifdef HAVE_FFTW #if defined(USE_MKL) || defined(GRID_SYCL) #include @@ -38,85 +46,184 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template struct FFTW { }; +#ifndef FFTW_FORWARD +#define FFTW_FORWARD (-1) +#define FFTW_BACKWARD (+1) +#define FFTW_ESTIMATE (0) +#endif -#ifdef HAVE_FFTW +template struct FFTW { +}; + +#ifdef GRID_HIP template<> struct FFTW { public: + static const int forward=FFTW_FORWARD; + static const int backward=FFTW_BACKWARD; + typedef hipfftDoubleComplex FFTW_scalar; + typedef hipfftHandle FFTW_plan; + static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany, + FFTW_scalar *in, int *inembed, + int istride, int idist, + FFTW_scalar *out, int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + FFTW_plan p; + auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany); + GRID_ASSERT(rv==HIPFFT_SUCCESS); + return p; + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { + hipfftResult rv; + if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD); + 
else rv =hipfftExecZ2Z(p,in,out,HIPFFT_BACKWARD); + accelerator_barrier(); + GRID_ASSERT(rv==HIPFFT_SUCCESS); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + hipfftDestroy(p); + } +}; +template<> struct FFTW { +public: + static const int forward=FFTW_FORWARD; + static const int backward=FFTW_BACKWARD; + typedef hipfftComplex FFTW_scalar; + typedef hipfftHandle FFTW_plan; + static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany, + FFTW_scalar *in, int *inembed, + int istride, int idist, + FFTW_scalar *out, int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + FFTW_plan p; + auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany); + GRID_ASSERT(rv==HIPFFT_SUCCESS); + return p; + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { + hipfftResult rv; + if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD); + else rv =hipfftExecC2C(p,in,out,HIPFFT_BACKWARD); + accelerator_barrier(); + GRID_ASSERT(rv==HIPFFT_SUCCESS); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + hipfftDestroy(p); + } +}; +#endif + +#ifdef GRID_CUDA +template<> struct FFTW { +public: + static const int forward=FFTW_FORWARD; + static const int backward=FFTW_BACKWARD; + typedef cufftDoubleComplex FFTW_scalar; + typedef cufftHandle FFTW_plan; + + static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany, + FFTW_scalar *in, int *inembed, + int istride, int idist, + FFTW_scalar *out, int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + FFTW_plan p; + cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany); + return p; + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { + if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD); + else cufftExecZ2Z(p,in,out,CUFFT_BACKWARD); + accelerator_barrier(); + } + inline static void fftw_destroy_plan(const 
FFTW_plan p) { + cufftDestroy(p); + } +}; +template<> struct FFTW { +public: + static const int forward=FFTW_FORWARD; + static const int backward=FFTW_BACKWARD; + typedef cufftComplex FFTW_scalar; + typedef cufftHandle FFTW_plan; + + static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany, + FFTW_scalar *in, int *inembed, + int istride, int idist, + FFTW_scalar *out, int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + FFTW_plan p; + cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany); + return p; + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { + if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD); + else cufftExecC2C(p,in,out,CUFFT_BACKWARD); + accelerator_barrier(); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + cufftDestroy(p); + } +}; +#endif + +#ifdef HAVE_FFTW +template<> struct FFTW { +public: typedef fftw_complex FFTW_scalar; typedef fftw_plan FFTW_plan; - - static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, - FFTW_scalar *in, const int *inembed, + static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany, + FFTW_scalar *in, int *inembed, int istride, int idist, - FFTW_scalar *out, const int *onembed, + FFTW_scalar *out, int *onembed, int ostride, int odist, int sign, unsigned flags) { return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); } - static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ - ::fftw_flops(p,add,mul,fmas); - } - - inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { ::fftw_execute_dft(p,in,out); } inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftw_destroy_plan(p); } }; - template<> struct FFTW { public: - typedef fftwf_complex FFTW_scalar; 
typedef fftwf_plan FFTW_plan; - - static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, - FFTW_scalar *in, const int *inembed, + static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany, + FFTW_scalar *in, int *inembed, int istride, int idist, - FFTW_scalar *out, const int *onembed, + FFTW_scalar *out, int *onembed, int ostride, int odist, int sign, unsigned flags) { return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); } - static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ - ::fftwf_flops(p,add,mul,fmas); - } - - inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { ::fftwf_execute_dft(p,in,out); } inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftwf_destroy_plan(p); } }; - -#endif - -#ifndef FFTW_FORWARD -#define FFTW_FORWARD (-1) -#define FFTW_BACKWARD (+1) #endif class FFT { private: - GridCartesian *vgrid; - GridCartesian *sgrid; - - int Nd; double flops; double flops_call; uint64_t usec; - Coordinate dimensions; - Coordinate processors; - Coordinate processor_coor; - public: static const int forward=FFTW_FORWARD; @@ -126,31 +233,25 @@ public: double MFlops(void) {return flops/usec;} double USec(void) {return (double)usec;} - FFT ( GridCartesian * grid ) : - vgrid(grid), - Nd(grid->_ndimension), - dimensions(grid->_fdimensions), - processors(grid->_processors), - processor_coor(grid->_processor_coor) + FFT ( GridCartesian * grid ) { flops=0; usec =0; - Coordinate layout(Nd,1); - sgrid = new GridCartesian(dimensions,layout,processors,*grid); }; ~FFT ( void) { - delete sgrid; + // delete sgrid; } template void FFT_dim_mask(Lattice &result,const Lattice &source,Coordinate mask,int sign){ - conformable(result.Grid(),vgrid); - conformable(source.Grid(),vgrid); - Lattice tmp(vgrid); - tmp = source; - for(int 
d=0;dNd(); + Lattice tmp = source; + for(int d=0;d void FFT_all_dim(Lattice &result,const Lattice &source,int sign){ - Coordinate mask(Nd,1); + const int Ndim = source.Grid()->Nd(); + Coordinate mask(Ndim,1); FFT_dim_mask(result,source,mask,sign); } template void FFT_dim(Lattice &result,const Lattice &source,int dim, int sign){ -#ifndef HAVE_FFTW - std::cerr << "FFTW is not compiled but is called"<Nd(); + GridBase *grid = source.Grid(); + conformable(result.Grid(),source.Grid()); - int L = vgrid->_ldimensions[dim]; - int G = vgrid->_fdimensions[dim]; - - Coordinate layout(Nd,1); - Coordinate pencil_gd(vgrid->_fdimensions); - - pencil_gd[dim] = G*processors[dim]; - - // Pencil global vol LxLxGxLxL per node - GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid); + int L = grid->_ldimensions[dim]; + int G = grid->_fdimensions[dim]; + Coordinate layout(Ndim,1); + // Construct pencils typedef typename vobj::scalar_object sobj; - typedef typename sobj::scalar_type scalar; + typedef typename vobj::scalar_type scalar; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; - Lattice pgbuf(&pencil_g); - autoView(pgbuf_v , pgbuf, CpuWrite); //std::cout << "CPU view" << std::endl; typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; int Ncomp = sizeof(sobj)/sizeof(scalar); - int Nlow = 1; + int64_t Nlow = 1; + int64_t Nhigh = 1; + for(int d=0;d_ldimensions[d]; + Nlow*=grid->_ldimensions[d]; } + for(int d=dim+1;d_ldimensions[d]; + } + int64_t Nperp=Nlow*Nhigh; + + deviceVector pgbuf; // Layout is [perp][component][dim] + pgbuf.resize(Nperp*Ncomp*G); + scalar *pgbuf_v = &pgbuf[0]; int rank = 1; /* 1d transforms */ int n[] = {G}; /* 1d transforms of length G */ - int howmany = Ncomp; + int howmany = Ncomp * Nperp; int odist,idist,istride,ostride; - idist = odist = 1; /* Distance between consecutive FT's */ - istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ + 
idist = odist = G; /* Distance between consecutive FT's */ + istride = ostride = 1; /* Distance between two elements in the same FT */ int *inembed = n, *onembed = n; scalar div; if ( sign == backward ) div = 1.0/G; else if ( sign == forward ) div = 1.0; else GRID_ASSERT(0); - - //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; + + double t_pencil=0; + double t_fft =0; + double t_total =-usecond(); + // std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; + /* + * + */ FFTW_plan p; { FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; @@ -229,72 +338,154 @@ public: } // Barrel shift and collect global pencil - //std::cout << GridLogPerformance<<"Making pencil" << std::endl; - Coordinate lcoor(Nd), gcoor(Nd); + // std::cout << GridLogPerformance<<"Making pencil" << std::endl; + Coordinate lcoor(Ndim), gcoor(Ndim); + double t_copy=0; + double t_shift=0; + t_pencil = -usecond(); result = source; - int pc = processor_coor[dim]; + int pc = grid->_processor_coor[dim]; + + const Coordinate ldims = grid->_ldimensions; + const Coordinate rdims = grid->_rdimensions; + const Coordinate sdims = grid->_simd_layout; + + Coordinate processors = grid->_processors; + Coordinate pgdims(Ndim); + pgdims[0] = G; + for(int d=0, dd=1;doSites(), vobj::Nsimd(), { +#ifdef GRID_SIMT { - autoView(r_v,result,CpuRead); - autoView(p_v,pgbuf,CpuWrite); - thread_for(idx, sgrid->lSites(),{ - Coordinate cbuf(Nd); - sobj s; - sgrid->LocalIndexToLocalCoor(idx,cbuf); - peekLocalSite(s,r_v,cbuf); - cbuf[dim]+=((pc+p) % processors[dim])*L; - pokeLocalSite(s,p_v,cbuf); - }); + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane temp(grid); + t_shift-=usecond(); + temp = Cshift(result,dim,L); result = temp; + t_shift+=usecond(); } } + t_pencil += usecond(); - //std::cout <::fftw_execute_dft(p,in,out); - } - }); - timer.Stop(); - + FFTW_scalar *in = (FFTW_scalar *)pgbuf_v; + FFTW_scalar *out= (FFTW_scalar *)pgbuf_v; + t_fft = -usecond(); + 
FFTW::fftw_execute_dft(p,in,out,sign); + t_fft += usecond(); + // performance counting - double add,mul,fma; - FFTW::fftw_flops(p,&add,&mul,&fma); - flops_call = add+mul+2.0*fma; - usec += timer.useconds(); - flops+= flops_call*NN; - - //std::cout <lSites(),{ - Coordinate clbuf(Nd), cgbuf(Nd); - sobj s; - sgrid->LocalIndexToLocalCoor(idx,clbuf); - cgbuf = clbuf; - cgbuf[dim] = clbuf[dim]+L*pc; - peekLocalSite(s,pgbuf_v,cgbuf); - pokeLocalSite(s,result_v,clbuf); + autoView(r_v,result,AcceleratorWrite); + accelerator_for(idx,grid->oSites(),Nsimd,{ +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane::fftw_destroy_plan(p); -#endif + + t_total +=usecond(); + + std::cout < &list, void *xmit, int xmit_to_rank,int do_xmit, void *recv, int recv_from_rank,int do_recv, - int xbytes,int rbytes,int dir); + uint64_t xbytes,uint64_t rbytes,int dir); // Could do a PollHtoD and have a CommsMerge dependence void StencilSendToRecvFromPollDtoH (std::vector &list); @@ -206,7 +206,7 @@ public: int xmit_to_rank,int do_xmit, void *recv,void *recv_comp, int recv_from_rank,int do_recv, - int xbytes,int rbytes,int dir); + uint64_t xbytes,uint64_t rbytes,int dir); void StencilSendToRecvFromComplete(std::vector &waitall,int i); @@ -220,7 +220,7 @@ public: //////////////////////////////////////////////////////////// // Broadcast a buffer and composite larger //////////////////////////////////////////////////////////// - void Broadcast(int root,void* data, int bytes); + void Broadcast(int root,void* data, uint64_t bytes); //////////////////////////////////////////////////////////// // All2All down one dimension diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 750a9152..b725a681 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -342,23 +342,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector & int dest, void *recv, int from, - int 
bytes,int dir) + uint64_t bytes,int dir) { MPI_Request xrq; MPI_Request rrq; GRID_ASSERT(dest != _processor); GRID_ASSERT(from != _processor); - + GRID_ASSERT(bytes/(sizeof(int32_t)) < 2ULL*1024*1024*1024); int tag; tag= dir+from*32; - int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq); + int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq); GRID_ASSERT(ierr==0); list.push_back(rrq); tag= dir+_processor*32; - ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq); + ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq); GRID_ASSERT(ierr==0); list.push_back(xrq); } @@ -379,7 +379,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int dest, void *recv, int from, - int bytes) + uint64_t bytes) { std::vector reqs(0); @@ -392,8 +392,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, // Give the CPU to MPI immediately; can use threads to overlap optionally // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); - ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, - recv,bytes,MPI_CHAR,from, from, + ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank, + recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from, communicator,MPI_STATUS_IGNORE); GRID_ASSERT(ierr==0); @@ -403,7 +403,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int dest, int dox, void *recv, int from, int dor, - int bytes,int dir) + uint64_t bytes,int dir) { std::vector list; double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); @@ -426,7 +426,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(rbytes); - ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq); GRID_ASSERT(ierr==0); CommsRequest_t 
srq; srq.PacketType = InterNodeRecv; @@ -686,7 +686,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector &lis int dest, void *recv, int from, - int bytes,int dir) + uint64_t bytes,int dir) { GRID_ASSERT(0); } @@ -115,8 +115,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t int CartesianCommunicator::RankWorld(void){return 0;} void CartesianCommunicator::Barrier(void){} -void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} -void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } +void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {} +void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { } void CartesianCommunicator::BarrierWorld(void) { } int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;} void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; } @@ -132,7 +132,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank,int dox, void *recv, int recv_from_rank,int dor, - int bytes, int dir) + uint64_t bytes, int dir) { return 2.0*bytes; } @@ -143,7 +143,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector Lattice Cshift(const Lattice &rhs,int dimension // Map to always positive shift modulo global full dimension. shift = (shift+fd)%fd; + if( shift ==0 ) { + ret = rhs; + return ret; + } + // + // Potential easy fast cases: + // Shift is a multiple of the local lattice extent. 
+ Then need only to shift whole subvolumes + int L = rhs.Grid()->_ldimensions[dimension]; + if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) { + Cshift_simple(ret,rhs,dimension,shift); + return ret; + } + ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); // the permute type @@ -73,6 +87,55 @@ template Lattice Cshift(const Lattice &rhs,int dimension return ret; } +template void Cshift_simple(Lattice& ret,const Lattice &rhs,int dimension,int shift) +{ + GridBase *grid=rhs.Grid(); + int comm_proc, xmit_to_rank, recv_from_rank; + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int ld = rhs.Grid()->_ldimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + + comm_proc = ((shift)/ld)%pd; + + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + if(comm_dim) { + + int64_t bytes = sizeof(vobj) * grid->oSites(); + + autoView(rhs_v , rhs, AcceleratorRead); + autoView(ret_v , ret, AcceleratorWrite); + void *send_buf = (void *)&rhs_v[0]; + void *recv_buf = (void *)&ret_v[0]; + +#ifdef ACCELERATOR_AWARE_MPI + grid->SendToRecvFrom(send_buf, + xmit_to_rank, + recv_buf, + recv_from_rank, + bytes); +#else + static hostVector hrhs; hrhs.resize(grid->oSites()); + static hostVector hret; hret.resize(grid->oSites()); + + void *hsend_buf = (void *)&hrhs[0]; + void *hrecv_buf = (void *)&hret[0]; + + acceleratorCopyFromDevice(send_buf,hsend_buf,bytes); + + grid->SendToRecvFrom(hsend_buf, + xmit_to_rank, + hrecv_buf, + recv_from_rank, + bytes); + + acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes); +#endif + } +} template void Cshift_comms(Lattice& ret,const Lattice &rhs,int dimension,int shift) { int sshift[2]; diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h index 321c18ee..7434322f 100644 --- 
a/Grid/perfmon/Timer.h +++ b/Grid/perfmon/Timer.h @@ -60,12 +60,16 @@ inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time) } inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now) { + double secs = 1.0*now.count()*1.0e-3; + stream << secs<<" s"; + /* GridSecs second(1); auto secs = now/second ; auto subseconds = now%second ; auto fill = stream.fill(); stream << secs<<"."< #include using namespace Grid; - ; int main (int argc, char ** argv) { @@ -116,10 +115,10 @@ int main (int argc, char ** argv) Stilde=S; std::cout<<" Benchmarking FFT of LatticeSpinMatrix "< "< HermOp(Ddwf); - ConjugateGradient CG(1.0e-16,10000); + ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src5,result5); //////////////////////////////////////////////////////////////////////// @@ -423,7 +433,7 @@ int main (int argc, char ** argv) Dov.Mdag(src5,tmp5); src5=tmp5; MdagMLinearOperator HermOp(Dov); - ConjugateGradient CG(1.0e-16,10000); + ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src5,result5); ////////////////////////////////////////////////////////////////////////