diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 2cbc895c..dc972537 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -168,6 +168,7 @@ public: template void FFT_dim(Lattice &result,const Lattice &source,int dim, int sign){ #ifndef HAVE_FFTW + std::cerr << "FFTW is not compiled but is called"< pgbuf(&pencil_g); autoView(pgbuf_v , pgbuf, CpuWrite); - + std::cout << "CPU view" << std::endl; + typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; @@ -213,6 +215,7 @@ public: else if ( sign == forward ) div = 1.0; else assert(0); + std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; FFTW_plan p; { FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; @@ -226,6 +229,7 @@ public: } // Barrel shift and collect global pencil + std::cout << GridLogPerformance<<"Making pencil" << std::endl; Coordinate lcoor(Nd), gcoor(Nd); result = source; int pc = processor_coor[dim]; @@ -247,6 +251,7 @@ public: } } + std::cout <::fftw_destroy_plan(p); #endif diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index f4092bc5..f4245319 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid); typedef cublasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL - typedef cl::sycl::queue *gridblasHandle_t; + typedef sycl::queue *gridblasHandle_t; #endif #ifdef GRID_ONE_MKL - typedef cl::sycl::queue *gridblasHandle_t; + typedef sycl::queue *gridblasHandle_t; #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) typedef int32_t gridblasHandle_t; @@ -89,9 +89,9 @@ public: gridblasHandle = theGridAccelerator; #endif #ifdef GRID_ONE_MKL - cl::sycl::gpu_selector selector; - cl::sycl::device selectedDevice { selector }; - cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; + sycl::gpu_selector selector; + sycl::device selectedDevice { selector }; + 
sycl::property_list q_prop{sycl::property::queue::in_order()}; gridblasHandle =new sycl::queue (selectedDevice,q_prop); #endif gridblasInit=1; diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index 27fee791..c434b9ef 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid); //Compute double precision rsd and also new RHS vector. Linop_d.HermOp(sol_d, tmp_d); RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector - + std::cout< bs(nshift); + std::vector rsq(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h index 23baff61..c6102eb2 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h @@ -123,11 +123,11 @@ public: assert(mresidual.size()==nshift); // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD rsqf[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector rsqf(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index d3fb282a..24a3228a 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -156,11 +156,11 @@ public: assert(mresidual.size()==nshift); // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD rsqf[nshift]; - RealD 
z[nshift][2]; - int converged[nshift]; + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector rsqf(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/multigrid/CoarsenedMatrix.h b/Grid/algorithms/multigrid/CoarsenedMatrix.h index 42634004..60a5920c 100644 --- a/Grid/algorithms/multigrid/CoarsenedMatrix.h +++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h @@ -99,7 +99,7 @@ public: CoarseMatrix AselfInvEven; CoarseMatrix AselfInvOdd; - Vector dag_factor; + deviceVector dag_factor; /////////////////////// // Interface @@ -124,9 +124,13 @@ public: int npoint = geom.npoint; typedef LatticeView Aview; - Vector AcceleratorViewContainer; + deviceVector AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); - for(int p=0;p Aview; - Vector AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;poSites(); - Vector points(geom.npoint, 0); - for(int p=0; p points(geom.npoint); + for(int p=0; p Aview; - Vector AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;p &out) { @@ -469,14 +484,20 @@ public: // determine in what order we need the points int npoint = geom.npoint-1; - Vector points(npoint, 0); - for(int p=0; p points(npoint); + for(int p=0; p AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;p h_dag_factor(nbasis*nbasis); thread_for(i, nbasis*nbasis, { int j = i/nbasis; int k = i%nbasis; - dag_factor[i] = dag_factor_eigen(j, k); + h_dag_factor[i] = dag_factor_eigen(j, k); }); + acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD)); } void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase > &linop, diff --git 
a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 293ce2fb..8946a364 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -174,21 +174,11 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -#ifdef ACCELERATOR_CSHIFT -// Cshift on device -template using cshiftAllocator = devAllocator; -#else -// Cshift on host -template using cshiftAllocator = std::allocator; -#endif +template using hostVector = std::vector >; // Needs autoview +template using Vector = std::vector >; // +template using uvmVector = std::vector >; // auto migrating page +template using deviceVector = std::vector >; // device vector -template using Vector = std::vector >; -template using stencilVector = std::vector >; -template using commVector = std::vector >; -template using deviceVector = std::vector >; -template using cshiftVector = std::vector >; - -/* template class vecView { protected: @@ -197,8 +187,9 @@ template class vecView ViewMode mode; void * cpu_ptr; public: + // Rvalue accessor accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; - vecView(std::vector &refer_to_me,ViewMode _mode) + vecView(Vector &refer_to_me,ViewMode _mode) { cpu_ptr = &refer_to_me[0]; size = refer_to_me.size(); @@ -214,26 +205,15 @@ template class vecView } }; -template vecView VectorView(std::vector &vec,ViewMode _mode) +template vecView VectorView(Vector &vec,ViewMode _mode) { vecView ret(vec,_mode); // does the open return ret; // must be closed } -// Little autoscope assister -template -class VectorViewCloser -{ - View v; // Take a copy of view and call view close when I go out of scope automatically - public: - VectorViewCloser(View &_v) : v(_v) {}; - ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); 
MemoryManager::NotifyDeletion(ptr);} -}; - #define autoVecView(v_v,v,mode) \ auto v_v = VectorView(v,mode); \ ViewCloser _autoView##v_v(v_v); -*/ NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index c610fb9c..b53e1510 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,17 +1,15 @@ #include #ifndef GRID_UVM -#warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); #define MAXLINE 512 static char print_buffer [ MAXLINE ]; -#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; -#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; +#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl; +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl; //#define dprintf(...) 
- //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -169,7 +167,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -184,7 +182,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n", + (uint64_t)AccCache.bytes, + (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -265,7 +265,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", + dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc index 0d1707d9..37269785 
100644 --- a/Grid/allocator/MemoryStats.cc +++ b/Grid/allocator/MemoryStats.cc @@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES) uint64_t virt_pfn = (uint64_t)Buf / page_size; off_t offset = sizeof(uint64_t) * virt_pfn; uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; + std::vector pagedata(npages); uint64_t ret = lseek(fd, offset, SEEK_SET); assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages); assert(ret == sizeof(uint64_t) * npages); int nhugepages = npages / 512; int n4ktotal, nnothuge; diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc index 79efb90c..f9a4c442 100644 --- a/Grid/communicator/Communicator_base.cc +++ b/Grid/communicator/Communicator_base.cc @@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return // very VERY rarely (Log, serial RNG) we need world without a grid //////////////////////////////////////////////////////////////////////////////// +#ifdef USE_GRID_REDUCTION +void CartesianCommunicator::GlobalSum(ComplexF &c) +{ + GlobalSumP2P(c); +} +void CartesianCommunicator::GlobalSum(ComplexD &c) +{ + GlobalSumP2P(c); +} +#else void CartesianCommunicator::GlobalSum(ComplexF &c) { GlobalSumVector((float *)&c,2); } -void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) -{ - GlobalSumVector((float *)c,2*N); -} void CartesianCommunicator::GlobalSum(ComplexD &c) { GlobalSumVector((double *)&c,2); } +#endif +void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) +{ + GlobalSumVector((float *)c,2*N); +} void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) { GlobalSumVector((double *)c,2*N); diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index c5e333f4..c72fcc79 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -136,7 +136,7 @@ public: for(int 
d=0;d<_ndimension;d++){ column.resize(_processors[d]); column[0] = accum; - std::vector list; + std::vector list; for(int p=1;p<_processors[d];p++){ ShiftedRanks(d,p,source,dest); SendToRecvFromBegin(list, @@ -166,8 +166,8 @@ public: //////////////////////////////////////////////////////////// // Face exchange, buffer swap in translational invariant way //////////////////////////////////////////////////////////// - void CommsComplete(std::vector &list); - void SendToRecvFromBegin(std::vector &list, + void CommsComplete(std::vector &list); + void SendToRecvFromBegin(std::vector &list, void *xmit, int dest, void *recv, @@ -186,6 +186,12 @@ public: int recv_from_rank,int do_recv, int bytes,int dir); + double StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int xmit_to_rank,int do_xmit, + void *recv, + int recv_from_rank,int do_recv, + int xbytes,int rbytes,int dir); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int do_xmit, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 5fa70da4..7dc706df 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator() } } } +#ifdef USE_GRID_REDUCTION +void CartesianCommunicator::GlobalSum(float &f){ + CartesianCommunicator::GlobalSumP2P(f); +} +void CartesianCommunicator::GlobalSum(double &d) +{ + CartesianCommunicator::GlobalSumP2P(d); +} +#else +void CartesianCommunicator::GlobalSum(float &f){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); + assert(ierr==0); +} +#endif void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); @@ -287,27 +306,18 @@ void 
CartesianCommunicator::GlobalMax(double &d) int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); assert(ierr==0); } -void CartesianCommunicator::GlobalSum(float &f){ - int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); - assert(ierr==0); -} void CartesianCommunicator::GlobalSumVector(float *f,int N) { int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); } -void CartesianCommunicator::GlobalSum(double &d) -{ - int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); - assert(ierr==0); -} void CartesianCommunicator::GlobalSumVector(double *d,int N) { int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); assert(ierr==0); } -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, +void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, int dest, void *recv, @@ -332,7 +342,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis assert(ierr==0); list.push_back(xrq); } -void CartesianCommunicator::CommsComplete(std::vector &list) +void CartesianCommunicator::CommsComplete(std::vector &list) { int nreq=list.size(); @@ -351,7 +361,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int from, int bytes) { - std::vector reqs(0); + std::vector reqs(0); unsigned long xcrc = crc32(0L, Z_NULL, 0); unsigned long rcrc = crc32(0L, Z_NULL, 0); @@ -381,12 +391,224 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } -#undef NVLINK_GET // Define to use get instead of put DMA + +#ifdef ACCELERATOR_AWARE_MPI 
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ + return 0.0; // Do nothing -- no preparation required +} +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ + int ncomm =communicator_halo.size(); + int commdir=dir%ncomm; + + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + int gdest = ShmRanks[dest]; + int gfrom = ShmRanks[from]; + int gme = ShmRanks[_processor]; + + assert(dest != _processor); + assert(from != _processor); + assert(gme == ShmRank); + double off_node_bytes=0.0; + int tag; + + if ( dor ) { + if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+from*32; + ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + assert(ierr==0); + list.push_back(rrq); + off_node_bytes+=rbytes; + } + } + + if (dox) { + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+_processor*32; + ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + assert(ierr==0); + list.push_back(xrq); + off_node_bytes+=xbytes; + } else { + void *shm = (void *) this->ShmBufferTranslate(dest,recv); + assert(shm!=NULL); + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); + } + } + return off_node_bytes; +} + +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) +{ + int nreq=list.size(); + + acceleratorCopySynchronise(); + + if (nreq==0) return; + std::vector status(nreq); + int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + assert(ierr==0); + list.resize(0); + this->StencilBarrier(); +} + +#else /* NOT ... 
ACCELERATOR_AWARE_MPI */ +/////////////////////////////////////////// +// Pipeline mode through host memory +/////////////////////////////////////////// + /* + * In prepare (phase 1): + * PHASE 1: (prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + * PHASE 2: (Begin) + * - complete all copies + * - post MPI send asynch + * - post device - device transfers + * PHASE 3: (Complete) + * - MPI_waitall + * - host-device transfers + * + ********************************* + * NB could split this further: + *-------------------------------- + * PHASE 1: (Prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + * PHASE 2: (BeginInterNode) + * - complete all copies + * - post MPI send asynch + * PHASE 3: (BeginIntraNode) + * - post device - device transfers + * PHASE 4: (Complete) + * - MPI_waitall + * - host-device transfers asynch + * - (complete all copies) + */ +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ +/* + * Bring sequence from Stencil.h down to lower level. 
+ * Assume using XeLink is ok + */ + int ncomm =communicator_halo.size(); + int commdir=dir%ncomm; + + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + int gdest = ShmRanks[dest]; + int gfrom = ShmRanks[from]; + int gme = ShmRanks[_processor]; + + assert(dest != _processor); + assert(from != _processor); + assert(gme == ShmRank); + double off_node_bytes=0.0; + int tag; + + void * host_recv = NULL; + void * host_xmit = NULL; + + /* + * PHASE 1: (Prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + */ + + if ( dor ) { + if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+from*32; + host_recv = this->HostBufferMalloc(rbytes); + ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + assert(ierr==0); + CommsRequest_t srq; + srq.PacketType = InterNodeRecv; + srq.bytes = rbytes; + srq.req = rrq; + srq.host_buf = host_recv; + srq.device_buf = recv; + list.push_back(srq); + off_node_bytes+=rbytes; + } + } + + if (dox) { + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { +#undef DEVICE_TO_HOST_CONCURRENT // pipeline +#ifdef DEVICE_TO_HOST_CONCURRENT + tag= dir+_processor*32; + + host_xmit = this->HostBufferMalloc(xbytes); + acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch + + // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + // assert(ierr==0); + // off_node_bytes+=xbytes; + + CommsRequest_t srq; + srq.PacketType = InterNodeXmit; + srq.bytes = xbytes; + // srq.req = xrq; + srq.host_buf = host_xmit; + srq.device_buf = xmit; + list.push_back(srq); +#else + tag= dir+_processor*32; + + host_xmit = this->HostBufferMalloc(xbytes); + const int chunks=1; + for(int n=0;n &list, void *xmit, int dest,int dox, @@ -411,54 +633,86 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(from,xmit); - assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); -#endif - } 
+ void * host_xmit = NULL; + + //////////////////////////////// + // Receives already posted + // Copies already started + //////////////////////////////// + /* + * PHASE 2: (Begin) + * - complete all copies + * - post MPI send asynch + */ + + // static int printed; + // if((printed<8) && this->IsBoss() ) { + // printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes); + // printed++; + // } if (dox) { - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { +#ifdef DEVICE_TO_HOST_CONCURRENT tag= dir+_processor*32; - ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + // Find the send in the prepared list + int list_idx=-1; + for(int idx = 0; idxShmBufferTranslate(dest,recv); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); -#endif - } } - return off_node_bytes; } void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) { int nreq=list.size(); - acceleratorCopySynchronise(); - if (nreq==0) return; - std::vector status(nreq); - int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + std::vector MpiRequests(nreq); + + for(int r=0;rHostBufferFreeAll(); // Clean up the buffer allocs + this->StencilBarrier(); } +#endif +//////////////////////////////////////////// +// END PIPELINE MODE / NO CUDA AWARE MPI +//////////////////////////////////////////// + void CartesianCommunicator::StencilBarrier(void) { MPI_Barrier (ShmComm); diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 7e7dfac8..8e6206ef 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, { return 2.0*bytes; } +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int xmit_to_rank,int dox, + void *recv, + int recv_from_rank,int dor, + int xbytes,int 
rbytes, int dir) +{ + return xbytes+rbytes; +} double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int dox, diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index 94e9741e..422be8aa 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -46,8 +46,22 @@ NAMESPACE_BEGIN(Grid); #if defined (GRID_COMMS_MPI3) typedef MPI_Comm Grid_MPI_Comm; +typedef MPI_Request MpiCommsRequest_t; +#ifdef ACCELERATOR_AWARE_MPI typedef MPI_Request CommsRequest_t; +#else +enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv }; +typedef struct { + PacketType_t PacketType; + void *host_buf; + void *device_buf; + unsigned long bytes; + MpiCommsRequest_t req; +} CommsRequest_t; +#endif + #else +typedef int MpiCommsRequest_t; typedef int CommsRequest_t; typedef int Grid_MPI_Comm; #endif diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 2600ce9c..dc22aee0 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -42,6 +42,11 @@ Author: Christoph Lehner #ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS +#else +#ifdef HAVE_NUMAIF_H + #warning " Using NUMAIF " +#include +#endif #endif #include #endif @@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI - HostCommBuf= malloc(bytes); + printf("Host buffer allocate for GPU non-aware MPI\n"); +#if 0 + HostCommBuf= acceleratorAllocHost(bytes); +#else + HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host +#ifdef HAVE_NUMAIF_H + #warning "Moving host buffers to specific NUMA domain" + int numa; + char *numa_name=(char *)getenv("MPI_BUF_NUMA"); + if(numa_name) { + unsigned 
long page_size = sysconf(_SC_PAGESIZE); + numa = atoi(numa_name); + unsigned long page_count = bytes/page_size; + std::vector pages(page_count); + std::vector nodes(page_count,numa); + std::vector status(page_count,-1); + for(unsigned long p=0;p(theGridAccelerator->get_device()); - auto zeContext = cl::sycl::get_native(theGridAccelerator->get_context()); + auto zeDevice = sycl::get_native(theGridAccelerator->get_device()); + auto zeContext = sycl::get_native(theGridAccelerator->get_context()); ze_ipc_mem_handle_t ihandle; clone_mem_t handle; diff --git a/Grid/cshift/Cshift.h b/Grid/cshift/Cshift.h index c7b9e3cb..ae1dea51 100644 --- a/Grid/cshift/Cshift.h +++ b/Grid/cshift/Cshift.h @@ -51,7 +51,6 @@ Author: Peter Boyle #endif NAMESPACE_BEGIN(Grid); - template::value,void>::type * = nullptr> auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) { diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 309517b2..fdb98cd4 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -30,12 +30,11 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); extern std::vector > Cshift_table; -extern commVector > Cshift_table_device; +extern deviceVector > Cshift_table_device; inline std::pair *MapCshiftTable(void) { // GPU version -#ifdef ACCELERATOR_CSHIFT uint64_t sz=Cshift_table.size(); if (Cshift_table_device.size()!=sz ) { Cshift_table_device.resize(sz); @@ -45,16 +44,13 @@ inline std::pair *MapCshiftTable(void) sizeof(Cshift_table[0])*sz); return &Cshift_table_device[0]; -#else - return &Cshift_table[0]; -#endif // CPU version use identify map } /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,deviceVector &buffer,int 
dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dim { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for(i,ent,{ - buffer_p[table[i].first]=rhs_v[table[i].second]; - }); -#endif } } @@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; @@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for2d(n,e1,b,e2,{ - int o = n*n1; - int offset = b+n*e2; - - vobj temp =rhs_v[so+o+b]; - extract(temp,pointers,offset); - }); -#endif } else { Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for2d(n,e1,b,e2,{ - - Coordinate coor; - - int o=n*n1; - int oindex = o+b; - - int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); - - int ocb=1<(temp,pointers,offset); - } - }); -#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,deviceVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -245,17 +202,10 @@ template 
void Scatter_plane_simple (Lattice &rhs,cshiftVector< { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); }); -#else - autoView( rhs_v, rhs, CpuWrite); - thread_for(i,ent,{ - rhs_v[table[i].first]=buffer_p[table[i].second]; - }); -#endif } } @@ -278,7 +228,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; -#ifdef ACCELERATOR_CSHIFT autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; @@ -287,14 +236,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); -#else - autoView( rhs_v , rhs, CpuWrite); - thread_for2d(n,e1,b,e2,{ - int o = n*_slice_stride; - int offset = b+n*_slice_block; - merge(rhs_v[so+o+b],pointers,offset); - }); -#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -360,19 +301,11 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs { auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); -#else - autoView(rhs_v , rhs, CpuRead); - autoView(lhs_v , lhs, CpuWrite); - thread_for(i,ent,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; - }); -#endif } } @@ -412,19 +345,11 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice NAMESPACE_BEGIN(Grid); - +const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { typedef typename vobj::vector_type vector_type; @@ -55,20 +55,20 @@ template Lattice Cshift(const 
Lattice &rhs,int dimension RealD t1,t0; t0=usecond(); if ( !comm_dim ) { - //std::cout << "CSHIFT: Cshift_local" < void Cshift_comms(Lattice& ret,const Lattice &rhs,int dimension,int shift) { int sshift[2]; @@ -94,18 +94,16 @@ template void Cshift_comms_simd(Lattice& ret,const LatticeCheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - //std::cout << "Cshift_comms_simd dim "< void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -125,8 +123,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf; send_buf.resize(buffer_size); - static cshiftVector recv_buf; recv_buf.resize(buffer_size); + static deviceVector send_buf; send_buf.resize(buffer_size); + static deviceVector recv_buf; recv_buf.resize(buffer_size); int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -161,7 +159,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); - // grid->Barrier(); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, @@ -169,7 +167,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); xbytes+=bytes; - // grid->Barrier(); + grid->Barrier(); tcomms+=usecond(); tscatter-=usecond(); @@ -177,13 +175,13 @@ template void Cshift_comms(Lattice &ret,const Lattice &r tscatter+=usecond(); } } - /* - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -201,9 +199,9 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_simd_layout[dimension]; int comm_dim = grid->_processors[dimension] >1 ; - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "< void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; @@ -281,7 +279,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); - // grid->Barrier(); + grid->Barrier(); send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0]; @@ -292,7 +290,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + grid->Barrier(); tcomms+=usecond(); rpointers[i] = 
&recv_buf_extract[i][0]; @@ -305,13 +303,13 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -400,13 +398,13 @@ template void Cshift_comms(Lattice &ret,const Lattice &r tscatter+=usecond(); } } - /* - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -532,15 +530,16 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice NAMESPACE_BEGIN(Grid); std::vector > Cshift_table; -commVector > Cshift_table_device; +deviceVector > Cshift_table_device; NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 5b37532f..f40d23da 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -257,17 +257,30 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice }); } +#define FAST_AXPY_NORM template inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y) { GRID_TRACE("axpy_norm"); - return axpy_norm_fast(ret,a,x,y); +#ifdef FAST_AXPY_NORM + return axpy_norm_fast(ret,a,x,y); +#else + ret = a*x+y; + RealD nn=norm2(ret); + return nn; +#endif } template inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y) { GRID_TRACE("axpby_norm"); - return axpby_norm_fast(ret,a,b,x,y); +#ifdef FAST_AXPY_NORM + return axpby_norm_fast(ret,a,b,x,y); +#else + ret = a*x+b*y; + RealD nn=norm2(ret); + return nn; +#endif } /// Trace product diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index ecb67b59..515c847f 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -237,9 +237,12 @@ public: vobj vtmp; vtmp = r; #if 0 + deviceVector vvtmp(1); + acceleratorPut(vvtmp[0],vtmp); + vobj *vvtmp_p = & vvtmp[0]; auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ - auto stmp=coalescedRead(vtmp); + auto 
stmp=coalescedRead(*vvtmp_p); coalescedWrite(me[ss],stmp); }); #else diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 03a869fb..c9c65928 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) typedef decltype(basis[0]) Field; typedef decltype(basis[0].View(AcceleratorRead)) View; - Vector basis_v; basis_v.reserve(basis.size()); - typedef typename std::remove_reference::type vobj; + hostVector h_basis_v(basis.size()); + deviceVector d_basis_v(basis.size()); + typedef typename std::remove_reference::type vobj; typedef typename std::remove_reference::type Coeff_t; + GridBase* grid = basis[0].Grid(); for(int k=0;k Bt(Nm * max_threads); - thread_region - { - vobj* B = &Bt[Nm * thread_num()]; - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - Vector Bt(siteBlock * nrot); + deviceVector Bt(siteBlock * nrot); auto Bp=&Bt[0]; // GPU readable copy of matrix - Vector Qt_jv(Nm*Nm); + hostVector h_Qt_jv(Nm*Nm); + deviceVector Qt_jv(Nm*Nm); Coeff_t *Qt_p = & Qt_jv[0]; thread_for(i,Nm*Nm,{ int j = i/Nm; int k = i%Nm; - Qt_p[i]=Qt(j,k); + h_Qt_jv[i]=Qt(j,k); }); + acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t)); // Block the loop to keep storage footprint down for(uint64_t s=0;s &basis,Eigen::MatrixXd& Qt,in result.Checkerboard() = basis[0].Checkerboard(); - Vector basis_v; basis_v.reserve(basis.size()); + hostVector h_basis_v(basis.size()); + deviceVector d_basis_v(basis.size()); for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; - for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ vobj zzz=Zero(); @@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in } coalescedWrite(result_v[ss], B); }); - for(int k=0;k diff --git 
a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 58004eac..53a592d1 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) // const int Nsimd = vobj::Nsimd(); const int nthread = GridThread::GetThreads(); - Vector sumarray(nthread); + std::vector sumarray(nthread); for(int i=0;i sumarray(nthread); + std::vector sumarray(nthread); for(int i=0;i inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); + bool ok; #ifdef GRID_SYCL uint64_t csum=0; + uint64_t csum2=0; if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) { // Hack @@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); uint64_t *base= (uint64_t *)&l_v[0]; csum=svm_xor(base,words); + ok = FlightRecorder::CsumLog(csum); + if ( !ok ) { + csum2=svm_xor(base,words); + std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<GlobalSumP2P(nrm); grid->GlobalSum(nrm); + FlightRecorder::StepLog("Finished global sum"); + // std::cout << " norm "<< nrm << " p2p norm "< &z,sobj a,sobj b,const Lattice &x,const Latt autoView( x_v, x, AcceleratorRead); autoView( y_v, y, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); -#if 0 - typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); - }); - nrm = real(TensorRemove(sum(inner_tmp_v,sites))); -#else typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; deviceVector inner_tmp; inner_tmp.resize(sites); @@ -365,9 +375,44 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt 
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(z_v[ss],tmp); }); - nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); + bool ok; +#ifdef GRID_SYCL + uint64_t csum=0; + uint64_t csum2=0; + if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) + { + // z_v + { + Integer words = sites*sizeof(vobj)/sizeof(uint64_t); + uint64_t *base= (uint64_t *)&z_v[0]; + csum=svm_xor(base,words); + ok = FlightRecorder::CsumLog(csum); + if ( !ok ) { + csum2=svm_xor(base,words); + std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<GlobalSum(nrm); + FlightRecorder::ReductionLog(local,real(nrm)); return nrm; } @@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti conformable(left,right); typedef typename vobj::vector_typeD vector_type; - Vector tmp(2); + std::vector tmp(2); GridBase *grid = left.Grid(); @@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti // GPU typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t; - Vector inner_tmp(sites); - Vector norm_tmp(sites); + deviceVector inner_tmp(sites); + deviceVector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; { @@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression & expr) // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... 
////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) +template inline void sliceSum(const Lattice &Data, + std::vector &result, + int orthogdim) { /////////////////////////////////////////////////////// // FIXME precision promoted summation @@ -460,8 +507,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,Zero()); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,Zero()); // sum across these down to scalars ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node @@ -509,6 +556,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< scalar_type * ptr = (scalar_type *) &result[0]; int words = fd*sizeof(sobj)/sizeof(scalar_type); grid->GlobalSumVector(ptr, words); + // std::cout << GridLogMessage << " sliceSum local"< inline std::vector @@ -552,8 +601,8 @@ static void sliceInnerProductVector( std::vector & result, const Latti int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars ExtractBuffer > extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index e82494f5..91cb8226 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ 
b/Grid/lattice/Lattice_reduction_gpu.h @@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi // Move out of UVM // Turns out I had messed up the synchronise after move to compute stream // as running this on the default stream fools the synchronise -#undef UVM_BLOCK_BUFFER -#ifndef UVM_BLOCK_BUFFER - commVector buffer(numBlocks); + deviceVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); -#else - Vector buffer(numBlocks); - sobj *buffer_v = &buffer[0]; - sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); - accelerator_barrier(); - result = *buffer_v; -#endif return result; } @@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi const int words = sizeof(vobj)/sizeof(vector); - Vector buffer(osites); + deviceVector buffer(osites); vector *dat = (vector *)lat; vector *buf = &buffer[0]; iScalar *tbuf =(iScalar *) &buffer[0]; diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index b8dc5378..bc9257b9 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid); // Possibly promote to double and sum ///////////////////////////////////////////////////////////////////////////////////////////////////////// + template inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; - static Vector mysum; - mysum.resize(1); - sobj *mysum_p = & mysum[0]; + sobj identity; zeroit(identity); - mysum[0] = identity; - sobj ret ; - + sobj ret; zeroit(ret); Integer nsimd= vobj::Nsimd(); - - const cl::sycl::property_list PropList ({ 
cl::sycl::property::reduction::initialize_to_identity() }); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); - cgh.parallel_for(cl::sycl::range<1>{osites}, - Reduction, - [=] (cl::sycl::id<1> item, auto &sum) { - auto osite = item[0]; - sum +=Reduce(lat[osite]); - }); - }); - theGridAccelerator->wait(); - ret = mysum[0]; - // free(mysum,*theGridAccelerator); + { + sycl::buffer abuff(&ret, {1}); + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>()); + cgh.parallel_for(sycl::range<1>{osites}, + Reduction, + [=] (sycl::id<1> item, auto &sum) { + auto osite = item[0]; + sum +=Reduce(lat[osite]); + }); + }); + } sobjD dret; convertType(dret,ret); return dret; } @@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite template Word svm_xor(Word *vec,uint64_t L) { - Word xorResult; xorResult = 0; - static Vector d_sum; - d_sum.resize(1); - Word *d_sum_p=&d_sum[0]; Word identity; identity=0; - d_sum[0] = identity; - const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); - cgh.parallel_for(cl::sycl::range<1>{L}, - Reduction, - [=] (cl::sycl::id<1> index, auto &sum) { - sum^=vec[index]; - }); - }); + Word ret = 0; + { + sycl::buffer abuff(&ret, {1}); + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); + cgh.parallel_for(sycl::range<1>{L}, + Reduction, + [=] (sycl::id<1> index, auto &sum) { + sum ^=vec[index]; + }); + }); + } theGridAccelerator->wait(); - Word ret = d_sum[0]; - // free(d_sum,*theGridAccelerator); return ret; } NAMESPACE_END(Grid); -/* - -template -inline typename vobj::scalar_objectD 
sumD_gpu_repack(const vobj *lat, Integer osites) -{ - typedef typename vobj::vector_type vector; - typedef typename vobj::scalar_type scalar; - - typedef typename vobj::scalar_typeD scalarD; - typedef typename vobj::scalar_objectD sobjD; - - sobjD ret; - scalarD *ret_p = (scalarD *)&ret; - - const int nsimd = vobj::Nsimd(); - const int words = sizeof(vobj)/sizeof(vector); - - Vector buffer(osites*nsimd); - scalar *buf = &buffer[0]; - vector *dat = (vector *)lat; - - for(int w=0;w inline void sliceSumReduction_cub_small(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template +inline void sliceSumReduction_cub_small(const vobj *Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) +{ size_t subvol_size = e1*e2; - commVector reduction_buffer(rd*subvol_size); + deviceVector reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; vobj zero_init; zeroit(zero_init); @@ -94,7 +103,15 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V #if defined(GRID_SYCL) -template inline void sliceSumReduction_sycl_small(const vobj *Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template +inline void sliceSumReduction_sycl_small(const vobj *Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { size_t subvol_size = e1*e2; @@ -105,7 +122,7 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, mysum[r] = vobj_zero; } - commVector reduction_buffer(rd*subvol_size); + deviceVector reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; @@ -124,11 +141,11 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, }); for (int r = 0; r < rd; r++) { - 
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); - cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(&mysum[r],std::plus<>()); + cgh.parallel_for(sycl::range<1>{subvol_size}, Reduction, - [=](cl::sycl::id<1> item, auto &sum) { + [=](sycl::id<1> item, auto &sum) { auto s = item[0]; sum += rb_p[r*subvol_size+s]; }); @@ -144,14 +161,23 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, } #endif -template inline void sliceSumReduction_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template +inline void sliceSumReduction_large(const vobj *Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) +{ typedef typename vobj::vector_type vector; const int words = sizeof(vobj)/sizeof(vector); const int osites = rd*e1*e2; - commVectorbuffer(osites); + deviceVectorbuffer(osites); vector *dat = (vector *)Data; vector *buf = &buffer[0]; - Vector lvSum_small(rd); + std::vector lvSum_small(rd); vector *lvSum_ptr = (vector *)&lvSum[0]; for (int w = 0; w < words; w++) { @@ -168,13 +194,18 @@ template inline void sliceSumReduction_large(const vobj *Data, Vecto for (int r = 0; r < rd; r++) { lvSum_ptr[w+words*r]=lvSum_small[r]; } - } - - } -template inline void sliceSumReduction_gpu(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +template +inline void sliceSumReduction_gpu(const Lattice &Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) { autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. 
if constexpr (sizeof(vobj) <= 256) { @@ -192,7 +223,15 @@ template inline void sliceSumReduction_gpu(const Lattice &Data } -template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template +inline void sliceSumReduction_cpu(const Lattice &Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction @@ -208,16 +247,20 @@ template inline void sliceSumReduction_cpu(const Lattice &Data }); } -template inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template inline void sliceSumReduction(const Lattice &Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { - #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) - +#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #else +#else sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #endif +#endif } diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index ad1496f5..fb533212 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase inline void ScatterSlice(const cshiftVector &buf, +template inline void ScatterSlice(const deviceVector &buf, Lattice &lat, int x, int dim, @@ -140,7 +140,7 @@ template inline void ScatterSlice(const cshiftVector &buf, }); } -template inline void GatherSlice(cshiftVector &buf, +template inline void GatherSlice(deviceVector &buf, const Lattice &lat, int x, int dim, @@ -462,13 
+462,13 @@ public: int rNsimd = Nsimd / simd[dimension]; assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); - static cshiftVector send_buf; - static cshiftVector recv_buf; + static deviceVector send_buf; + static deviceVector recv_buf; send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); - std::vector fwd_req; - std::vector bwd_req; + std::vector fwd_req; + std::vector bwd_req; int words = buffer_size; int bytes = words * sizeof(vobj); diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 8acae81b..c3a46729 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -98,7 +98,7 @@ public: virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative - + ///////////////////////////////////////////////////////////// // virtual smeared interface through configuration container ///////////////////////////////////////////////////////////// @@ -132,6 +132,10 @@ public: template class EmptyAction : public Action { + using Action::refresh; + using Action::Sinitial; + using Action::deriv; + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative diff --git a/Grid/qcd/action/fermion/AbstractEOFAFermion.h b/Grid/qcd/action/fermion/AbstractEOFAFermion.h index 18bcb394..3c203d17 100644 --- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h +++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h @@ -55,6 +55,11 @@ public: RealD alpha; // Mobius scale 
RealD k; // EOFA normalization constant + // Device resident + deviceVector d_shift_coefficients; + deviceVector d_MooeeInv_shift_lc; + deviceVector d_MooeeInv_shift_norm; + virtual void Instantiatable(void) = 0; // EOFA-specific operations @@ -92,6 +97,11 @@ public: this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); + + d_shift_coefficients.resize(Ls); + d_MooeeInv_shift_lc.resize(Ls); + d_MooeeInv_shift_norm.resize(Ls); + }; }; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index cf39ec99..ec80b692 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -90,16 +90,16 @@ public: void M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - Vector &lower, - Vector &diag, - Vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - Vector &lower, - Vector &diag, - Vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); virtual void Instantiatable(void)=0; @@ -119,35 +119,51 @@ public: RealD mass_plus, mass_minus; // Save arguments to SetCoefficientsInternal - Vector _gamma; + std::vector _gamma; RealD _zolo_hi; RealD _b; RealD _c; + // possible boost + std::vector qmu; + void set_qmu(std::vector _qmu) { qmu=_qmu; assert(qmu.size()==Nd);}; + void addQmu(const FermionField &in, FermionField &out, int dag); + // Cayley form Moebius (tanh and zolotarev) - Vector omega; - Vector bs; // S dependent coeffs - Vector cs; - Vector as; + std::vector omega; + std::vector bs; // S dependent coeffs + std::vector cs; + std::vector as; // For preconditioning Cayley form - Vector bee; - Vector cee; - Vector aee; - Vector beo; - Vector ceo; - Vector aeo; + std::vector bee; + 
std::vector cee; + std::vector aee; + std::vector beo; + std::vector ceo; + std::vector aeo; // LDU factorisation of the eeoo matrix - Vector lee; - Vector leem; - Vector uee; - Vector ueem; - Vector dee; + std::vector lee; + std::vector leem; + std::vector uee; + std::vector ueem; + std::vector dee; + + // Device memory + deviceVector d_diag; + deviceVector d_upper; + deviceVector d_lower; + + deviceVector d_lee; + deviceVector d_dee; + deviceVector d_uee; + deviceVector d_leem; + deviceVector d_ueem; // Matrices of 5d ee inverse params - Vector > MatpInv; - Vector > MatmInv; - Vector > MatpInvDag; - Vector > MatmInvDag; + // std::vector > MatpInv; + // std::vector > MatmInv; + // std::vector > MatpInvDag; + // std::vector > MatmInvDag; /////////////////////////////////////////////////////////////// // Conserved current utilities @@ -187,7 +203,7 @@ public: protected: virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); - virtual void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c); + virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c); }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 2300afd3..3fb84cd5 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -60,6 +60,50 @@ public: // virtual void Instantiatable(void)=0; virtual void Instantiatable(void) =0; + void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector boundary, std::vector twist) + { + std::cout << "Free Propagator for PartialFraction"<_fdimensions[nu+shift]))); + //momenta for propagator shifted by twist+boundary + twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI)); + } + in_buf = exp(ci*ph*(-1.0))*in; + + 
theFFT.FFT_all_dim(in_k,in,FFT::forward); + this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist); + theFFT.FFT_all_dim(out,prop_k,FFT::backward); + + //phase for boundary condition + out = out * exp(ci*ph); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + std::vector boundary; + for(int i=0;i &out); @@ -90,12 +134,12 @@ protected: RealD mass; RealD R; RealD ZoloHiInv; - Vector Beta; - Vector cc;; - Vector cc_d;; - Vector sqrt_cc; - Vector See; - Vector Aee; + std::vector Beta; + std::vector cc;; + std::vector cc_d;; + std::vector sqrt_cc; + std::vector See; + std::vector Aee; }; diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h index bcc97176..ff2420d5 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h @@ -69,10 +69,10 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); virtual void RefreshShiftCoefficients(RealD new_shift); @@ -83,7 +83,7 @@ public: RealD _M5, const ImplParams& p=ImplParams()); protected: - void SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c); + void SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c); }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index 60cfc727..f7655f24 100644 --- 
a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -102,11 +102,11 @@ public: GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); ////////////////////////////////////////////////////////////////////////// @@ -164,8 +164,6 @@ public: DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; /////////////////////////////////////////////////////////////// // Conserved current utilities diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 5b26b35c..2641a6b8 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -100,7 +100,6 @@ public: int dag); void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -108,7 +107,6 @@ public: int dag); void DhopInternalOverlappedComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -116,7 +114,6 @@ public: int dag); void DhopInternalSerialComms(StencilImpl & st, - 
LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -192,8 +189,6 @@ public: DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; // Comms buffer // std::vector > comm_buf; diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h index 6e4f79eb..39c21643 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h +++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h @@ -42,11 +42,11 @@ public: public: // Shift operator coefficients for red-black preconditioned Mobius EOFA - Vector Mooee_shift; - Vector MooeeInv_shift_lc; - Vector MooeeInv_shift_norm; - Vector MooeeInvDag_shift_lc; - Vector MooeeInvDag_shift_norm; + std::vector Mooee_shift; + std::vector MooeeInv_shift_lc; + std::vector MooeeInv_shift_norm; + std::vector MooeeInvDag_shift_lc; + std::vector MooeeInvDag_shift_norm; virtual void Instantiatable(void) {}; @@ -74,18 +74,18 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper, - Vector& shift_coeffs); + std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper, - Vector& shift_coeffs); + std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs); virtual void 
RefreshShiftCoefficients(RealD new_shift); diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index 5f69c2b1..9ec6be90 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -102,11 +102,11 @@ public: GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); ////////////////////////////////////////////////////////////////////////// @@ -152,9 +152,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - /////////////////////////////////////////////////////////////// // Conserved current utilities /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h index 350e89e2..8f0c91eb 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h @@ -42,7 +42,7 @@ public: void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { this->MomentumSpacePropagatorHw(out,in,_m,twist); - }; + }; // Constructors OverlapWilsonCayleyTanhFermion(GaugeField &_Umu, diff --git 
a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h index d15690fa..33e59b88 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h @@ -41,6 +41,10 @@ public: public: // Constructors + virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h index 9d1a9a86..5b603017 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h @@ -41,6 +41,9 @@ public: public: virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; // Constructors OverlapWilsonContFracTanhFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h index ce796d4a..747cb508 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h @@ -40,6 +40,9 @@ public: INHERIT_IMPL_TYPES(Impl); virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; // Constructors OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h 
b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h index f2fb46cd..7210d6af 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h @@ -41,6 +41,9 @@ public: public: virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; // Constructors OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h index f98b64a9..f0be4388 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h @@ -40,6 +40,11 @@ public: INHERIT_IMPL_TYPES(Impl); virtual void Instantiatable(void){}; + + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; + // Constructors OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index 54f8547f..a71fc3f3 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D public: INHERIT_IMPL_TYPES(Impl); - const int part_frac_chroma_convention=1; + const int part_frac_chroma_convention=0; void Meooe_internal(const FermionField &in, FermionField &out,int dag); void Mooee_internal(const FermionField &in, FermionField &out,int dag); @@ -83,19 +83,78 @@ public: GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD M5,const ImplParams &p= 
ImplParams()); + PartialFractionFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass,RealD M5,std::vector &_qmu,const ImplParams &p= ImplParams()); + + void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector boundary, std::vector twist) + { + std::cout << "Free Propagator for PartialFraction"<_fdimensions[nu+shift]))); + //momenta for propagator shifted by twist+boundary + twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI)); + } + in_buf = exp(ci*ph*(-1.0))*in; + + theFFT.FFT_all_dim(in_k,in,FFT::forward); + if ( this->qmu.size() ){ + this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu); + } else { + this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist); + } + theFFT.FFT_all_dim(out,prop_k,FFT::backward); + + //phase for boundary condition + out = out * exp(ci*ph); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + std::vector boundary; + for(int i=0;i _qmu) { qmu=_qmu; assert(qmu.size()==Nd);}; + void addQmu(const FermionField &in, FermionField &out, int dag); + protected: virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale); virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata); + std::vector qmu; + // Part frac RealD mass; RealD dw_diag; RealD R; RealD amax; RealD scale; - Vector p; - Vector q; + std::vector p; + std::vector q; }; diff --git a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h index 1545c245..00ac222f 100644 --- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h +++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h @@ -35,7 +35,7 @@ template class KappaSimilarityTransform { public: INHERIT_IMPL_TYPES(Matrix); - Vector kappa, kappaDag, kappaInv, kappaInvDag; + 
std::vector kappa, kappaDag, kappaInv, kappaInvDag; KappaSimilarityTransform (Matrix &zmob) { for (int i=0;i<(int)zmob.bs.size();i++) { diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index d67105bb..c609be03 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -49,10 +49,10 @@ template class StaggeredKernels : public FermionOperator , pub public: - void DhopImproved(StencilImpl &st, LebesgueOrder &lo, + void DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior); - void DhopNaive(StencilImpl &st, LebesgueOrder &lo, + void DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior); diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 186fa278..605bdcec 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -47,7 +47,7 @@ public: static int PartialCompressionFactor(GridBase *grid) { return 1;} #endif template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -109,7 +109,7 @@ public: // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. 
//////////////////////////////////////////////////////////////////////////////////////////// template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -197,7 +197,7 @@ public: #endif template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -208,7 +208,7 @@ public: else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); } template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -402,7 +402,6 @@ public: typedef CartesianStencil Base; typedef typename Base::View_type View_type; - typedef typename Base::StencilVector StencilVector; // Vector surface_list; WilsonStencil(GridBase *grid, @@ -415,29 +414,6 @@ public: // surface_list.resize(0); this->same_node.resize(npoints); }; - - /* - void BuildSurfaceList(int Ls,int vol4){ - - // find same node for SHM - // Here we know the distance is 1 for WilsonStencil - for(int point=0;point_npoints;point++){ - this->same_node[point] = this->SameNode(point); - } - - for(int site = 0 ;site< vol4;site++){ - int local = 1; - for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ - local = 0; - } - } - if(local == 0) { - surface_list.push_back(site); - } - } - } - */ template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index a7a1bb69..16320a93 100644 --- 
a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -126,14 +126,17 @@ public: void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternal(StencilImpl &st, + DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - const FermionField &in, FermionField &out, int dag); + void DhopInternalSerial(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, @@ -168,9 +171,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - WilsonAnisotropyCoefficients anisotropyCoeff; /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 0b07d320..40c1871f 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -109,6 +109,8 @@ public: void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector twist, + std::vector qmu) ; 
// Implement hopping term non-hermitian hopping term; half cb or both // Implement s-diagonal DW @@ -117,6 +119,9 @@ public: void DhopOE(const FermionField &in, FermionField &out,int dag); void DhopEO(const FermionField &in, FermionField &out,int dag); + void DhopComms (const FermionField &in, FermionField &out); + void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids); + // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); @@ -135,21 +140,18 @@ public: int dag); void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); void DhopInternalOverlappedComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); void DhopInternalSerialComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, @@ -203,9 +205,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - // Comms buffer // std::vector > comm_buf; diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 2d868c27..ad077dd3 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -57,6 +57,10 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + uint64_t *ids); + static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h index 
fc8a7439..f8d1f11f 100644 --- a/Grid/qcd/action/fermion/ZMobiusFermion.h +++ b/Grid/qcd/action/fermion/ZMobiusFermion.h @@ -58,7 +58,7 @@ public: { // RealD eps = 1.0; std::cout< zgamma(this->Ls); + std::vector zgamma(this->Ls); for(int s=0;sLs;s++){ zgamma[s] = gamma[s]; } diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h similarity index 99% rename from Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h rename to Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h index e3bf67db..478fbb8b 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h @@ -1,3 +1,5 @@ +#if 0 + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -818,3 +820,5 @@ CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi, } NAMESPACE_END(Grid); + +#endif diff --git a/Grid/stencil/Lebesgue.cc b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc similarity index 99% rename from Grid/stencil/Lebesgue.cc rename to Grid/qcd/action/fermion/deprecated/Lebesgue.cc index 656ecca8..480483ed 100644 --- a/Grid/stencil/Lebesgue.cc +++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc @@ -1,3 +1,4 @@ +#if 0 /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void) } NAMESPACE_END(Grid); +#endif diff --git a/Grid/stencil/Lebesgue.h b/Grid/qcd/action/fermion/deprecated/Lebesgue.h similarity index 97% rename from Grid/stencil/Lebesgue.h rename to Grid/qcd/action/fermion/deprecated/Lebesgue.h index 25fa772e..0416ad80 100644 --- a/Grid/stencil/Lebesgue.h +++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.h @@ -72,7 +72,7 @@ public: void ThreadInterleave(void); private: - Vector _LebesgueReorder; + deviceVector 
_LebesgueReorder; }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index 2b8a3a18..2ace6c18 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -48,7 +48,8 @@ CayleyFermion5D::CayleyFermion5D(GaugeField &_Umu, FourDimGrid, FourDimRedBlackGrid,_M5,p), mass_plus(_mass), mass_minus(_mass) -{ +{ + // qmu defaults to zero size; } /////////////////////////////////////////////////////////////// @@ -156,18 +157,18 @@ template void CayleyFermion5D::M5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag (Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1]=mass_minus; - Vector lower(Ls,-1.0); lower[0] =mass_plus; + std::vector diag (Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1]=mass_minus; + std::vector lower(Ls,-1.0); lower[0] =mass_plus; M5D(psi,chi,chi,lower,diag,upper); } template void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - Vector diag = bs; - Vector upper= cs; - Vector lower= cs; + std::vector diag = bs; + std::vector upper= cs; + std::vector lower= cs; upper[Ls-1]=-mass_minus*upper[Ls-1]; lower[0] =-mass_plus*lower[0]; M5D(psi,psi,Din,lower,diag,upper); @@ -176,9 +177,9 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = beo; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = beo; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::Mooee (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::MooeeDag (const 
FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for (int s=0;s void CayleyFermion5D::M5Ddag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); - Vector lower(Ls,-1.0); + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); + std::vector lower(Ls,-1.0); upper[Ls-1]=-mass_plus*upper[Ls-1]; lower[0] =-mass_minus*lower[0]; M5Ddag(psi,chi,chi,lower,diag,upper); @@ -248,9 +249,9 @@ template void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - Vector diag =bs; - Vector upper=cs; - Vector lower=cs; + std::vector diag =bs; + std::vector upper=cs; + std::vector lower=cs; for (int s=0;s::MeooeDag5D (const FermionField &psi, FermionField M5Ddag(psi,psi,Din,lower,diag,upper); } +template +void CayleyFermion5D::addQmu(const FermionField &psi,FermionField &chi, int dag) +{ + if ( qmu.size() ) { + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + std::vector coeff(Nd); + ComplexD ci(0,1); + + assert(qmu.size()==Nd); + + for(int mu=0;mu void CayleyFermion5D::M (const FermionField &psi, FermionField &chi) { @@ -277,8 +306,12 @@ void CayleyFermion5D::M (const FermionField &psi, FermionField &chi) // Assemble Din Meooe5D(psi,Din); - + this->DW(Din,chi,DaggerNo); + + // add i q_mu gamma_mu here + addQmu(Din,chi,DaggerNo); + // ((b D_W + D_w hop terms +1) on s-diag axpby(chi,1.0,1.0,chi,psi); @@ -295,6 +328,9 @@ void CayleyFermion5D::Mdag (const FermionField &psi, FermionField &chi) FermionField Din(psi.Grid()); // Apply Dw this->DW(psi,Din,DaggerYes); + + // add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not. 
+ addQmu(psi,Din,DaggerYes); MeooeDag5D(Din,chi); @@ -394,7 +430,7 @@ void CayleyFermion5D::MeoDeriv(GaugeField &mat,const FermionField &U,const template void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) { - Vector gamma(this->Ls); + std::vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(1.0,gamma,b,c); } @@ -402,13 +438,13 @@ void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re template void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) { - Vector gamma(this->Ls); + std::vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(zolo_hi,gamma,b,c); } //Zolo template -void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) +void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c) { int Ls=this->Ls; @@ -488,7 +524,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,Vector::SetCoefficientsInternal(RealD zolo_hi,VectorMooeeInternalCompute(0,inv,MatpInv,MatmInv); // this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 0d2516c4..5fbc7612 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -43,9 +43,9 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, - Vector &diag, - Vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); @@ -55,12 +55,16 @@ CayleyFermion5D::M5D(const FermionField &psi_i, autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto 
pupper = &upper[0]; - auto plower = &lower[0]; - int Ls =this->Ls; + acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // 10 = 3 complex mult + 2 complex add // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) uint64_t nloop = grid->oSites(); @@ -82,9 +86,9 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, - Vector &diag, - Vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); @@ -93,12 +97,16 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - int Ls=this->Ls; + acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -126,11 +134,17 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; - auto plee = & lee [0]; - auto pdee = & dee [0]; - auto puee = & uee [0]; - auto pleem = & leem[0]; - auto pueem = & ueem[0]; + acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + 
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -182,11 +196,17 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi autoView(psi , psi_i,AcceleratorRead); autoView(chi , chi_i,AcceleratorWrite); - auto plee = & lee [0]; - auto pdee = & dee [0]; - auto puee = & uee [0]; - auto pleem = & leem[0]; - auto pueem = & ueem[0]; + acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; assert(psi.Checkerboard() == psi.Checkerboard()); diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index 6687800e..4bfbd31e 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -42,13 +42,13 @@ template void ContinuedFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata) { // How to check Ls matches?? 
- // std::cout<n << " - n"<da << " -da "<db << " -db"<dn << " -dn"<dd << " -dd"<n << " - n"<da << " -da "<db << " -db"<dn << " -dn"<dd << " -dd"<Ls; + std::cout<db==Ls);// Beta has Ls coeffs R=(1+this->mass)/(1-this->mass); @@ -320,7 +320,7 @@ ContinuedFractionFermion5D::ContinuedFractionFermion5D( int Ls = this->Ls; conformable(solution5d.Grid(),this->FermionGrid()); conformable(exported4d.Grid(),this->GaugeGrid()); - ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); + ExtractSlice(exported4d, solution5d, Ls-1, 0); } template void ContinuedFractionFermion5D::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) @@ -330,7 +330,7 @@ ContinuedFractionFermion5D::ContinuedFractionFermion5D( conformable(input4d.Grid() ,this->GaugeGrid()); FermionField tmp(this->FermionGrid()); tmp=Zero(); - InsertSlice(input4d, tmp, Ls-1, Ls-1); + InsertSlice(input4d, tmp, Ls-1, 0); tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; this->Dminus(tmp,imported5d); } diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 6b8336cc..ae126bb5 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid); // Pplus backwards.. 
template void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - Vector& lower, Vector& diag, Vector& upper) + std::vector& lower, std::vector& diag, std::vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; @@ -50,9 +50,15 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi autoView( psi , psi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + // Flops = 6.0*(Nc*Ns) *Ls*vol auto nloop=grid->oSites()/Ls; @@ -73,7 +79,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, - Vector& lower, Vector& diag, Vector& upper) + std::vector& lower, std::vector& diag, std::vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -83,9 +89,14 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio autoView( phi , phi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + + acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); // Flops = 
6.0*(Nc*Ns) *Ls*vol @@ -114,12 +125,17 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; - auto plee = & this->lee[0]; - auto pdee = & this->dee[0]; - auto puee = & this->uee[0]; - - auto pleem = & this->leem[0]; - auto pueem = & this->ueem[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; + + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); uint64_t nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h index 64ee4033..53b44ca2 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h @@ -131,9 +131,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, FermionField& chi else{ shiftm = -shift*(mq3-mq2); } } - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; - Vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; + std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; #if(0) std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; @@ -168,9 +168,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, FermionField& else{ shiftm = -shift*(mq3-mq2); } } - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; - 
Vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; + std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -181,9 +181,9 @@ void DomainWallEOFAFermion::Mooee(const FermionField& psi, FermionField& c { int Ls = this->Ls; - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; @@ -200,9 +200,9 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField { int Ls = this->Ls; - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; @@ -218,7 +218,7 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField //Zolo template -void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c) +void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c) { int Ls = this->Ls; int pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index d235abbb..d2b4450e 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian UUUmu(&FourDimGrid), UUUmuEven(&FourDimRedBlackGrid), UUUmuOdd(&FourDimRedBlackGrid), - Lebesgue(&FourDimGrid), - LebesgueEvenOdd(&FourDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid) { @@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, /*CHANGE */ template -void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, 
LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + DhopInternalOverlappedComms(st,U,UUU,in,out,dag); else - DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopInternalSerialComms(st,U,UUU,in,out,dag); } template -void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { @@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & { int interior=1; int exterior=0; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } st.CommsMerge(compressor); @@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & { int interior=0; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } template -void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { @@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, { int interior=1; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } /*CHANGE END*/ @@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionFie 
assert(in.Checkerboard()==Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag); + DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag); } template void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) @@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionFie assert(in.Checkerboard()==Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag); + DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag); } template void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) @@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); + DhopInternal(Stencil,Umu,UUUmu,in,out,dag); } ///////////////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 4c80a1d5..bd9dd132 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -48,8 +48,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, G StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), @@ -339,7 +337,7 @@ void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField & out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag); + DhopInternal(Stencil, Umu, UUUmu, in, out, dag); } template @@ -351,7 +349,7 
@@ void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag); } template @@ -363,7 +361,7 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag); } template @@ -394,19 +392,19 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel template -void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + DhopInternalOverlappedComms(st,U,UUU,in,out,dag); else - DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopInternalSerialComms(st,U,UUU,in,out,dag); } template -void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -429,7 +427,7 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st { int interior=1; int exterior=0; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } st.CommunicateComplete(requests); @@ -440,13 +438,13 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st { int interior=0; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + 
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } template -void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -460,7 +458,7 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le { int interior=1; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } }; diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 617a18df..b9165edb 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid); template void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper) + std::vector &lower, std::vector &diag, std::vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -50,10 +50,14 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -74,8 +78,8 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField template void 
MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper, - Vector &shift_coeffs) + std::vector &lower, std::vector &diag, std::vector &upper, + std::vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -86,13 +90,18 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator - + assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - auto pshift_coeffs = &shift_coeffs[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + auto pshift_coeffs = &this->d_shift_coefficients[0]; + + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t)); // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -119,7 +128,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion template void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper) + std::vector &lower, std::vector &diag, std::vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -129,10 +138,14 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + 
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -154,8 +167,8 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie template void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper, - Vector &shift_coeffs) + std::vector &lower, std::vector &diag, std::vector &upper, + std::vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -167,11 +180,16 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - auto pshift_coeffs = &shift_coeffs[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + auto pshift_coeffs = &this->d_shift_coefficients[0]; + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t)); + // Flops = 6.0*(Nc*Ns) *Ls*vol auto pm = this->pm; @@ -212,11 +230,17 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; + + 
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } @@ -268,14 +292,23 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); + // Move into object and constructor auto pm = this->pm; - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; - auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0]; - auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; + auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0]; + auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0]; + + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -333,11 +366,17 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - 
auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; + auto plee = &this->d_lee [0]; + auto pdee = &this->d_dee [0]; + auto puee = &this->d_uee [0]; + auto pleem = &this->d_leem[0]; + auto pueem = &this->d_ueem[0]; + + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -387,13 +426,25 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi int Ls = this->Ls; auto pm = this->pm; - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; - auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; - auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; + + auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0]; + auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0]; + + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t)); + 
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); + + // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; + // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h index 9b9db178..70f06dfc 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h @@ -196,9 +196,9 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - Vector lower(Ls,-1.0); lower[0] = this->mq1; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + std::vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } @@ -212,9 +212,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - Vector lower(Ls,-1.0); lower[0] = this->mq1; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + std::vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -230,9 +230,9 @@ void MobiusEOFAFermion::Mooee(const FermionField& psi, FermionField& chi) int Ls = this->Ls; // coefficients of Mooee - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; lower[s] = -this->cee[s]; @@ -253,9 +253,9 @@ void MobiusEOFAFermion::MooeeDag(const FermionField& psi, FermionField& ch 
int Ls = this->Ls; // coefficients of MooeeDag - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s+1]; @@ -314,10 +314,10 @@ void MobiusEOFAFermion::SetCoefficientsPrecondShiftOps() // Tridiagonal solve for MooeeInvDag_shift_lc { Coeff_t m(0.0); - Vector d = Mooee_shift; - Vector u(Ls,0.0); - Vector y(Ls,0.0); - Vector q(Ls,0.0); + std::vector d = Mooee_shift; + std::vector u(Ls,0.0); + std::vector y(Ls,0.0); + std::vector q(Ls,0.0); if(pm == 1){ u[0] = 1.0; } else{ u[Ls-1] = 1.0; } diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h index bf23d99d..b596dc44 100644 --- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -48,8 +48,6 @@ NaiveStaggeredFermion::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), @@ -268,7 +266,7 @@ void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); + DhopInternal(Stencil, Umu, in, out, dag); } template @@ -280,7 +278,7 @@ void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &o assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, in, out, dag); } template @@ -292,7 +290,7 @@ void NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &o assert(in.Checkerboard() == Odd); 
out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, in, out, dag); } template @@ -323,18 +321,18 @@ void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField & template -void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else - DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopInternalSerialComms(st,U,in,out,dag); } template -void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -356,7 +354,7 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L { int interior=1; int exterior=0; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } st.CommunicateComplete(requests); @@ -367,12 +365,12 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L { int interior=0; int exterior=1; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } } template -void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -385,7 +383,7 @@ void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Lebes { int interior=1; int exterior=1; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + 
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } }; diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index 0206828b..84884c6d 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -237,7 +237,32 @@ void PartialFractionFermion5D::M_internal(const FermionField &psi, Fermi // ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H // - this->DW(psi,D,DaggerNo); + this->DW(psi,D,DaggerNo); + + // DW - DW+iqslash + // (g5 Dw)^dag = g5 Dw + // (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu + if ( qmu.size() ) { + + std::cout<< "Mat" << "qmu ("<::M_internal(const FermionField &psi, Fermi } { + // The 'conventional' Cayley overlap operator is + // + // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw + // + // + // With massless limit 1/2(1+g5 sgnHw) + // + // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2) + // + // However, the conventional normalisation has both a leading order factor of 2 in Zq + // at tree level AND a mass dependent (1-m) that are convenient to absorb. 
+ // + // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is + // + // num = -i sin kmu gmu + // + // denom ( sqrt(sk^2 + (2shk^2 - 1)^2 + // b_k = sk2 - M5; + // + // w_k = sqrt(sk + b_k*b_k); + // + // denom= ( w_k + b_k + mass*mass) ; + // + // denom= one/denom; + // out = num*denom; + // + // Chroma, and Grid define partial fraction via 4d operator + // + // Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw + // + // Now since: + // + // (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m) + // + // This corresponds to a modified mass parameter + // + // It has an annoying + // + // double R=(1+this->mass)/(1-this->mass); - //R g5 psi[Ls] + p[0] H + //R g5 psi[Ls] + p[0] Hw ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1); - + for(int b=0;b::SetCoefficientsZolotarev(RealD zolo_hi,App int Ls = this->Ls; conformable(solution5d.Grid(),this->FermionGrid()); conformable(exported4d.Grid(),this->GaugeGrid()); - ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); + ExtractSlice(exported4d, solution5d, Ls-1, 0); } template void PartialFractionFermion5D::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) { + //void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) int Ls = this->Ls; conformable(imported5d.Grid(),this->FermionGrid()); conformable(input4d.Grid() ,this->GaugeGrid()); FermionField tmp(this->FermionGrid()); tmp=Zero(); - InsertSlice(input4d, tmp, Ls-1, Ls-1); + InsertSlice(input4d, tmp, Ls-1, 0); tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; this->Dminus(tmp,imported5d); } @@ -442,7 +508,7 @@ PartialFractionFermion5D::PartialFractionFermion5D(GaugeField &_Umu, { int Ls = this->Ls; - + qmu.resize(0); assert((Ls&0x1)==1); // Odd Ls required int nrational=Ls-1; @@ -460,6 +526,22 @@ PartialFractionFermion5D::PartialFractionFermion5D(GaugeField &_Umu, Approx::zolotarev_free(zdata); } +template +PartialFractionFermion5D::PartialFractionFermion5D(GaugeField &_Umu, + GridCartesian 
&FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass,RealD M5, + std::vector &_qmu, + const ImplParams &p) + : PartialFractionFermion5D(_Umu, + FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,M5,p) +{ + qmu=_qmu; +} NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h index 2b6087bc..04337671 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h @@ -375,23 +375,6 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, } } -/* -#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ - \ - template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ - \ - template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ -*/ #undef LOAD_CHI #undef HAND_DECLARATIONS diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index a39b529f..05dbf3b2 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -256,7 +256,7 @@ void 
StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie }); template -void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior) { @@ -294,7 +294,7 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, assert(0 && " Kernel optimisation case not covered "); } template -void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 95af4c38..3d4e5cc5 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -58,15 +58,9 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, Umu(_FourDimGrid), UmuEven(_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid), - Lebesgue(_FourDimGrid), - LebesgueEvenOdd(_FourDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid), Dirichlet(0) { - Stencil.lo = &Lebesgue; - StencilEven.lo = &LebesgueEvenOdd; - StencilOdd.lo = &LebesgueEvenOdd; - // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); @@ -305,19 +299,19 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, } template -void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternal(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else - 
DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopInternalSerialComms(st,U,in,out,dag); } template -void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { @@ -331,22 +325,22 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Start comms // Gather intranode and extra node differentiated?? ///////////////////////////// { + // std::cout << " WilsonFermion5D gather " < > requests; - auto id=traceStart("Communicate overlapped"); - st.CommunicateBegin(requests); +#if 1 ///////////////////////////// // Overlap with comms ///////////////////////////// - { - GRID_TRACE("MergeSHM"); - st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms - } - + st.CommunicateBegin(requests); + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms +#endif + ///////////////////////////// // do the compute interior ///////////////////////////// @@ -358,22 +352,35 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg GRID_TRACE("DhopInterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); } - + + //ifdef GRID_ACCELERATED +#if 0 + ///////////////////////////// + // Overlap with comms -- on GPU the interior kernel call is nonblocking + ///////////////////////////// + st.CommunicateBegin(requests); + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms +#endif + + ///////////////////////////// // Complete comms ///////////////////////////// + // std::cout << " WilsonFermion5D Comms Complete " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg GRID_TRACE("DhopExterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); } + // std::cout << " WilsonFermion5D Done " < -void 
WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) @@ -395,11 +403,13 @@ void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOr int LLs = in.Grid()->_rdimensions[0]; + // std::cout << " WilsonFermion5D Halo exch " <::DhopInternalSerialComms(StencilImpl & st, LebesgueOr GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); } + // std::cout << " WilsonFermion5D Done " <::DhopOE(const FermionField &in, FermionField &out,int assert(in.Checkerboard()==Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag); + DhopInternal(StencilEven,UmuOdd,in,out,dag); } template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) @@ -431,8 +442,31 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int assert(in.Checkerboard()==Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag); + DhopInternal(StencilOdd,UmuEven,in,out,dag); } +template +void WilsonFermion5D::DhopComms(const FermionField &in, FermionField &out) +{ + int dag =0 ; + conformable(in.Grid(),FermionGrid()); // verifies full grid + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + Compressor compressor(dag); + Stencil.HaloExchangeOpt(in,compressor); +} +template +void WilsonFermion5D::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids) +{ + conformable(in.Grid(),FermionGrid()); // verifies full grid + conformable(in.Grid(),out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + int LLs = in.Grid()->_rdimensions[0]; + int Opt = WilsonKernelsStatic::Opt; + Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids); +} + template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { @@ 
-441,7 +475,7 @@ void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int d out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil,Lebesgue,Umu,in,out,dag); + DhopInternal(Stencil,Umu,in,out,dag); } template void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag) @@ -735,6 +769,15 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe template void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) +{ + std::vector empty_q(Nd,0.0); + MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q); +} +template +void WilsonFermion5D::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in, + RealD mass, + std::vector twist, + std::vector qmu) { Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, @@ -750,6 +793,7 @@ void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const Fe typedef typename FermionField::scalar_type ScalComplex; typedef Lattice > LatComplex; + typedef iSpinMatrix SpinMat; Coordinate latt_size = _grid->_fdimensions; @@ -767,8 +811,10 @@ void WilsonFermion5D::MomentumSpacePropagatorHw(FermionField &out,const Fe LatComplex kmu(_grid); ScalComplex ci(0.0,1.0); + std::cout<< "Feynman Rule" << "qmu ("<::MomentumSpacePropagatorHw(FermionField &out,const Fe kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); - sk = sk + sin(kmu)*sin(kmu); - num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); + sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]); + + // Terms for boosted Fermion + // 1/2 [ -i gamma.(sin p + q ) ] + // [ --------------------- + 1 ] + // [ wq + b ] + // + // wq = sqrt( (sinp+q)^2 + b^2 ) + // + + num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in); } num = num + mass * in ; diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 
1a262533..8c58f692 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -52,17 +52,12 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), _tmp(&Hgrid), anisotropyCoeff(anis) { - Stencil.lo = &Lebesgue; - StencilEven.lo = &LebesgueEvenOdd; - StencilOdd.lo = &LebesgueEvenOdd; // Allocate the required comms buffer ImportGauge(_Umu); if (anisotropyCoeff.isAnisotropic){ @@ -314,7 +309,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); + DhopInternal(Stencil, Umu, in, out, dag); } template @@ -326,7 +321,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, in, out, dag); } template @@ -338,7 +333,7 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, in, out, dag); } template @@ -391,21 +386,21 @@ void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out, }; template -void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void WilsonFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) - 
DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else #endif - DhopInternalSerial(st,lo,U,in,out,dag); + DhopInternalSerial(st,U,in,out,dag); } template -void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -474,10 +469,10 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO template -void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, int dag) +void WilsonFermion::DhopInternalSerial(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { GRID_TRACE("DhopSerial"); assert((dag == DaggerNo) || (dag == DaggerYes)); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h index e025ba41..2633c127 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h @@ -40,11 +40,11 @@ Author: paboyle /// Switch off the 5d vectorised code optimisations #undef DWFVEC5D -static Vector signsF; +static std::vector signsF; template - int setupSigns(Vector& signs ){ - Vector bother(2); + int setupSigns(std::vector& signs ){ + std::vector bother(2); signs = bother; vrsign(signs[0]); visign(signs[1]); @@ -364,7 +364,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled #include -static Vector signsD; +static std::vector signsD; static int signInitD = setupSigns(signsD); #define MAYBEPERM(A,perm) if (perm) { A ; } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 90defc54..1d0dfb61 100644 --- 
a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -411,6 +411,46 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #undef LoopBody } +#ifdef GRID_SYCL +extern "C" { + ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void ); + void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value ); +} +#ifdef GRID_SIMT +#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()) +#else +#define MAKE_ID(A) (0) +#endif + +#else + +#define MAKE_ID(A) (0) + +#endif + + +#define KERNEL_CALL_ID(A) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=acceleratorSIMTlane(Nsimd); \ + int idx=sF*Nsimd+lane; \ + uint64_t id = MAKE_ID(); \ + ids[idx]=id; \ + }); \ + accelerator_barrier(); #define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ @@ -418,7 +458,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ss; \ 
int sU = ss/Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ - }); + }); #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); @@ -434,7 +474,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #define ASM_CALL(A) \ thread_for( sss, Nsite, { \ - int ss = st.lo->Reorder(sss); \ + int ss = sss; /*st.lo->Reorder(sss);*/ \ int sU = ss; \ int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ @@ -451,6 +491,8 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ });} + + template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, @@ -485,6 +527,18 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField } assert(0 && " Kernel optimisation case not covered "); } + +template +void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + uint64_t *ids) +{ + autoView(U_v , U,AcceleratorRead); + autoView(in_v , in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v , st,AcceleratorRead); + KERNEL_CALL_ID(GenericDhopSite); +} template void WilsonKernels::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index b9d6ac16..b7f31d0e 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -40,6 +40,11 @@ public: INHERIT_GIMPL_TYPES(Gimpl); + using Action::S; + using Action::Sinitial; + using Action::deriv; + using Action::refresh; + private: RealD c_plaq; RealD c_rect; diff --git a/Grid/qcd/action/gauge/WilsonGaugeAction.h 
b/Grid/qcd/action/gauge/WilsonGaugeAction.h index f535b54f..22c792cc 100644 --- a/Grid/qcd/action/gauge/WilsonGaugeAction.h +++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h @@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action { public: INHERIT_GIMPL_TYPES(Gimpl); + using Action::S; + using Action::Sinitial; + using Action::deriv; + using Action::refresh; + /////////////////////////// constructors explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; diff --git a/Grid/qcd/representations/adjoint.h b/Grid/qcd/representations/adjoint.h index ee54b465..8d7e9e3c 100644 --- a/Grid/qcd/representations/adjoint.h +++ b/Grid/qcd/representations/adjoint.h @@ -40,7 +40,7 @@ public: U = Zero(); LatticeColourMatrix tmp(Uin.Grid()); - Vector::Matrix> ta(Dimension); + std::vector::Matrix> ta(Dimension); // Debug lines // LatticeMatrix uno(Uin.Grid()); diff --git a/Grid/qcd/representations/two_index.h b/Grid/qcd/representations/two_index.h index 24d6d7cb..c9c1db94 100644 --- a/Grid/qcd/representations/two_index.h +++ b/Grid/qcd/representations/two_index.h @@ -43,7 +43,7 @@ public: U = Zero(); LatticeColourMatrix tmp(Uin.Grid()); - Vector::Matrix> eij(Dimension); + std::vector::Matrix> eij(Dimension); for (int a = 0; a < Dimension; a++) GaugeGroupTwoIndex::base(a, eij[a]); diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index b63d8571..7089fd1b 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -6,6 +6,34 @@ NAMESPACE_BEGIN(Grid); #undef DELTA_F_EQ_2 +/////////////////////////////////////////////////////////////////// +//Meson +// Interested in +// +// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] +// +// Conventional meson field: +// +// = sum_x,y Trace[ sum_j G |v_j(y,ty)> +// = sum_ij PI_ji(tx) PI_ij(ty) +// +// G5-Hermiticity +// +// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] +// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ] +// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> +// = sum_ij PionVV(ty) PionWW(tx) +// +// (*) 
is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker +// expectation value. Otherwise biased. +//////////////////////////////////////////////////////////////////// + template class A2Autils { @@ -26,7 +54,9 @@ public: typedef iSpinColourMatrix SpinColourMatrix_v; - template // output: rank 5 tensor, e.g. Eigen::Tensor + + // output: rank 5 tensor, e.g. Eigen::Tensor + template static void MesonField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, @@ -34,31 +64,6 @@ public: const std::vector &mom, int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); - static void PionFieldWVmom(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - const std::vector &mom, - int orthogdim); - - static void PionFieldXX(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim, - int g5); - - static void PionFieldWV(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim); - static void PionFieldWW(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *wj, - int orthogdim); - static void PionFieldVV(Eigen::Tensor &mat, - const FermionField *vi, - const FermionField *vj, - int orthogdim); - template // output: rank 5 tensor, e.g. 
Eigen::Tensor static void AslashField(TensorType &mat, const FermionField *lhs_wi, @@ -118,6 +123,211 @@ private: const int Ns, const int ss); }; +const int A2Ablocking=8; + +template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; +typedef iVecSpinMatrix VecSpinMatrix; +typedef iVecSpinMatrix vVecSpinMatrix; +typedef Lattice LatticeVecSpinMatrix; + +template using iVecComplex = iVector >, A2Ablocking>; +typedef iVecComplex VecComplex; +typedef iVecComplex vVecComplex; +typedef Lattice LatticeVecComplex; + +#define A2A_GPU_KERNELS +#ifdef A2A_GPU_KERNELS +template +template +void A2Autils::MesonField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + // assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + // const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecSpinMatrix MomSpinMat(grid); + + std::vector sliced; + for(int i=0;ioSites(),(size_t)Nsimd,{ + auto left = conjugate(lhs_v(ss)); + auto right = rhs_v(ss); + auto vv = SpinMat_v(ss); + for(int s1=0;s1(sliced[t],jj); + auto trSG = trace(tmp*Gamma(gammas[mu])); + mat(m,mu,t,i,j) = trSG()(); + } + } + } + } + }//jo + } +} + +// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) +// +// With: +// +// B_0 = A_0 + i A_1 +// B_1 = A_2 + i A_3 +// +// then in spin space +// +// ( 0 0 -conj(B_1) -B_0 ) +// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 ) +// ( B_1 B_0 0 0 ) +// ( 
conj(B_0) -conj(B_1) 0 0 ) + +template +template +void A2Autils::AslashField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + int Nem = emB0.size(); + assert(emB1.size() == Nem); + + // assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecComplex Aslash(grid); + std::vector sliced; + + for(int i=0;ioSites(),(size_t)Nsimd,{ + auto left = conjugate(lhs_v(ss)); + auto right = rhs_v(ss); + auto vv = SpinMat_v(ss); + for(int s1=0;s1oSites(),(size_t)Nsimd,{ + auto vv = SpinMat_v(ss); + auto b0 = emB0_v(ss); + auto b1 = emB1_v(ss); + auto cb0 = conjugate(b0); + auto cb1 = conjugate(b1); + auto asl = Aslash_v(ss); + for(int j=jo;j template void A2Autils::MesonField(TensorType &mat, @@ -158,15 +368,15 @@ void A2Autils::MesonField(TensorType &mat, int MFrvol = rd*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom; - Vector lvSum(MFrvol); - thread_for( r, MFrvol,{ + std::vector lvSum(MFrvol); + for(int r=0;r lsSum(MFlvol); - thread_for(r,MFlvol,{ + std::vector lsSum(MFlvol); + for(int r=0;r_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; @@ -174,7 +384,7 @@ void A2Autils::MesonField(TensorType &mat, // potentially wasting cores here if local time extent too small if (t_kernel) *t_kernel = -usecond(); - thread_for(r,rd,{ + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -213,10 +423,10 @@ void 
A2Autils::MesonField(TensorType &mat, } } } - }); + }; // Sum across simd lanes in the plane, breaking out orthog dir. - thread_for(rt,rd,{ + for(int rt=0;rt extracted(Nsimd); @@ -241,7 +451,7 @@ void A2Autils::MesonField(TensorType &mat, } }}} - }); + } if (t_kernel) *t_kernel += usecond(); assert(mat.dimension(0) == Nmom); assert(mat.dimension(1) == Ngamma); @@ -290,423 +500,54 @@ void A2Autils::MesonField(TensorType &mat, if (t_gsum) *t_gsum += usecond(); } - -/////////////////////////////////////////////////////////////////// -//Meson -// Interested in -// -// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] -// -// Conventional meson field: -// -// = sum_x,y Trace[ sum_j G |v_j(y,ty)> -// = sum_ij PI_ji(tx) PI_ij(ty) -// -// G5-Hermiticity -// -// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] -// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ] -// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> -// = sum_ij PionVV(ty) PionWW(tx) -// -// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker -// expectation value. Otherwise biased. 
-//////////////////////////////////////////////////////////////////// - -template -void A2Autils::PionFieldXX(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim, - int g5) -{ - int Lblock = mat.dimension(1); - int Rblock = mat.dimension(2); - - GridBase *grid = wi[0].Grid(); - - const int nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - // will locally sum vectors first - // sum across these down to scalars - // splitting the SIMD - int MFrvol = rd*Lblock*Rblock; - int MFlvol = ld*Lblock*Rblock; - - Vector lvSum(MFrvol); - thread_for(r,MFrvol,{ - lvSum[r] = Zero(); - }); - - Vector lsSum(MFlvol); - thread_for(r,MFlvol,{ - lsSum[r]=scalar_type(0.0); - }); - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - - thread_for(r,rd,{ - - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - - for(int n=0;n temp; - ExtractBuffer > extracted(Nsimd); - - for(int i=0;iiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; - - } - }} - }); - - assert(mat.dimension(0) == Nt); - // ld loop and local only?? 
- int pd = grid->_processors[orthogdim]; - int pc = grid->_processor_coor[orthogdim]; - thread_for_collapse(2,lt,ld,{ - for(int pt=0;ptGlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); -} - -template -void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - const std::vector &mom, - int orthogdim) -{ - int Lblock = mat.dimension(2); - int Rblock = mat.dimension(3); - - GridBase *grid = wi[0].Grid(); - - const int nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - int Nmom = mom.size(); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - // will locally sum vectors first - // sum across these down to scalars - // splitting the SIMD - int MFrvol = rd*Lblock*Rblock*Nmom; - int MFlvol = ld*Lblock*Rblock*Nmom; - - Vector lvSum(MFrvol); - thread_for(r,MFrvol,{ - lvSum[r] = Zero(); - }); - - Vector lsSum(MFlvol); - thread_for(r,MFlvol,{ - lsSum[r]=scalar_type(0.0); - }); - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - - thread_for(r,rd,{ - - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - - for(int n=0;n temp; - ExtractBuffer > extracted(Nsimd); - - for(int i=0;iiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; - - } - }}} - }); - - assert(mat.dimension(0) == Nmom); - assert(mat.dimension(1) == Nt); - - int pd = grid->_processors[orthogdim]; - int pc = grid->_processor_coor[orthogdim]; - thread_for_collapse(2,lt,ld,{ - for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); -} - -template -void A2Autils::PionFieldWV(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim) -{ - const int g5=1; - 
PionFieldXX(mat,wi,vj,orthogdim,g5); -} -template -void A2Autils::PionFieldWW(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *wj, - int orthogdim) -{ - const int nog5=0; - PionFieldXX(mat,wi,wj,orthogdim,nog5); -} -template -void A2Autils::PionFieldVV(Eigen::Tensor &mat, - const FermionField *vi, - const FermionField *vj, - int orthogdim) -{ - const int nog5=0; - PionFieldXX(mat,vi,vj,orthogdim,nog5); -} - -// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) -// -// With: -// -// B_0 = A_0 + i A_1 -// B_1 = A_2 + i A_3 -// -// then in spin space -// -// ( 0 0 -conj(B_1) -B_0 ) -// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 ) -// ( B_1 B_0 0 0 ) -// ( conj(B_0) -conj(B_1) 0 0 ) template template void A2Autils::AslashField(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - const std::vector &emB0, - const std::vector &emB1, - int orthogdim, double *t_kernel, double *t_gsum) + const FermionField *lhs_wi, + const FermionField *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, double *t_kernel, double *t_gsum) { - typedef typename FermionField::vector_object vobj; - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; + typedef typename FermionField::vector_object vobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; - typedef iSpinMatrix SpinMatrix_v; - typedef iSpinMatrix SpinMatrix_s; - typedef iSinglet Singlet_v; - typedef iSinglet Singlet_s; + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + typedef iSinglet Singlet_v; + typedef iSinglet Singlet_s; - int Lblock = mat.dimension(3); - int Rblock = mat.dimension(4); + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = 
grid->Nsimd(); - GridBase *grid = lhs_wi[0].Grid(); - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - int Nem = emB0.size(); - assert(emB1.size() == Nem); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nem = emB0.size(); + assert(emB1.size() == Nem); + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + // will locally sum vectors first // sum across these down to scalars // splitting the SIMD int MFrvol = rd*Lblock*Rblock*Nem; int MFlvol = ld*Lblock*Rblock*Nem; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol, { lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol, { lsSum[r] = scalar_type(0.0); @@ -719,7 +560,7 @@ void A2Autils::AslashField(TensorType &mat, // Nested parallelism would be ok // Wasting cores here. Test case r if (t_kernel) *t_kernel = -usecond(); - thread_for(r,rd, + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -746,8 +587,8 @@ void A2Autils::AslashField(TensorType &mat, + left()(s2)(1) * right()(s1)(1) + left()(s2)(2) * right()(s1)(2); } - - // After getting the sitewise product do the mom phase loop + + // After getting the sitewise product do the mom phase loop int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, } } } - }); + } // Sum across simd lanes in the plane, breaking out orthog dir. 
thread_for(rt,rd, @@ -833,7 +674,7 @@ void A2Autils::AslashField(TensorType &mat, grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock); if (t_gsum) *t_gsum += usecond(); } - +#endif //////////////////////////////////////////// // Schematic thoughts about more generalised four quark insertion // @@ -992,9 +833,9 @@ typename std::enable_if<(std::is_same, TensorType>::va std::is_same>, TensorType>::value), void>::type A2Autils::ContractWWVV(std::vector &WWVV, - const TensorType &WW_sd, - const FermionField *vs, - const FermionField *vd) + const TensorType &WW_sd, + const FermionField *vs, + const FermionField *vd) { GridBase *grid = vs[0].Grid(); @@ -1062,7 +903,6 @@ A2Autils::ContractWWVV(std::vector &WWVV, } for (int t = 0; t < N_t; t++){ - std::cout << GridLogMessage << "Contraction t = " << t << std::endl; buf = WW_sd[t]; thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o::DeltaFeq2(int dt_min,int dt_max, } #endif + /* + static void PionFieldWVmom(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + const std::vector &mom, + int orthogdim); + + static void PionFieldXX(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim, + int g5); + + static void PionFieldWV(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim); + static void PionFieldWW(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *wj, + int orthogdim); + static void PionFieldVV(Eigen::Tensor &mat, + const FermionField *vi, + const FermionField *vj, + int orthogdim); + */ + +/* + +template +template +void A2Autils::MesonField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + 
typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nmom; + int MFlvol = ld*Lblock*Rblock*Nmom; + + std::vector lvSum(MFrvol); + for(int r=0;r lsSum(MFlvol); + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + // potentially wasting cores here if local time extent too small + if (t_kernel) *t_kernel = -usecond(); + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; + + } + }}} + } + if (t_kernel) *t_kernel += usecond(); + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Ngamma); + assert(mat.dimension(2) == Nt); + + // ld loop and local only?? 
+ int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock); + if (t_gsum) *t_gsum += usecond(); +} + +template +void A2Autils::PionFieldXX(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim, + int g5) +{ + int Lblock = mat.dimension(1); + int Rblock = mat.dimension(2); + + GridBase *grid = wi[0].Grid(); + + const int nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock; + int MFlvol = ld*Lblock*Rblock; + + std::vector lvSum(MFrvol); + thread_for(r,MFrvol,{ + lvSum[r] = Zero(); + }); + + std::vector lsSum(MFlvol); + thread_for(r,MFlvol,{ + lsSum[r]=scalar_type(0.0); + }); + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + thread_for(r,rd,{ + + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n temp; + ExtractBuffer > extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; + + } + }} + }); + + assert(mat.dimension(0) == Nt); + // ld loop and local only?? 
+ int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); +} + +template +void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + const std::vector &mom, + int orthogdim) +{ + int Lblock = mat.dimension(2); + int Rblock = mat.dimension(3); + + GridBase *grid = wi[0].Grid(); + + const int nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nmom; + int MFlvol = ld*Lblock*Rblock*Nmom; + + std::vector lvSum(MFrvol); + thread_for(r,MFrvol,{ + lvSum[r] = Zero(); + }); + + std::vector lsSum(MFlvol); + thread_for(r,MFlvol,{ + lsSum[r]=scalar_type(0.0); + }); + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + thread_for(r,rd,{ + + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n temp; + ExtractBuffer > extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; + + } + }}} + }); + + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Nt); + + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); +} + +template +void A2Autils::PionFieldWV(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim) +{ + const int g5=1; + 
PionFieldXX(mat,wi,vj,orthogdim,g5); +} +template +void A2Autils::PionFieldWW(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *wj, + int orthogdim) +{ + const int nog5=0; + PionFieldXX(mat,wi,wj,orthogdim,nog5); +} +template +void A2Autils::PionFieldVV(Eigen::Tensor &mat, + const FermionField *vi, + const FermionField *vj, + int orthogdim) +{ + const int nog5=0; + PionFieldXX(mat,vi,vj,orthogdim,nog5); +} +*/ + NAMESPACE_END(Grid); diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 9d9cb508..9a1d312b 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -971,7 +971,9 @@ void BaryonUtils::BaryonGamma3pt( autoView( vq_ti , q_ti , AcceleratorRead); autoView( vq_tf , q_tf , AcceleratorRead); - Vector my_Dq_spec{Dq_spec1,Dq_spec2}; + deviceVector my_Dq_spec(2); + acceleratorPut(my_Dq_spec[0],Dq_spec1); + acceleratorPut(my_Dq_spec[1],Dq_spec2); mobj * Dq_spec_p = &my_Dq_spec[0]; if (group == 1) { @@ -1300,7 +1302,8 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - Vector my_Dq_spec{Du_spec}; + deviceVector my_Dq_spec(1); + acceleratorPut(my_Dq_spec[0],Du_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ @@ -1353,7 +1356,8 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, autoView( vd_tf , qd_tf , AcceleratorRead ); autoView( vs_ti , qs_ti , AcceleratorRead ); - Vector my_Dq_spec{Du_spec}; + deviceVector my_Dq_spec(1); + acceleratorPut(my_Dq_spec[0],Du_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ @@ -1544,7 +1548,9 @@ void BaryonUtils::XiToSigmaEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - Vector my_Dq_spec{Dd_spec,Ds_spec}; + deviceVector my_Dq_spec(2); + acceleratorPut(my_Dq_spec[0],Dd_spec); + acceleratorPut(my_Dq_spec[1],Ds_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; 
if(op == "Q1"){ diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h index 02fa161b..d049fcd0 100644 --- a/Grid/qcd/utils/SUn.impl.h +++ b/Grid/qcd/utils/SUn.impl.h @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix &ta) { //////////////////////////////////////////////////////////////////////// // Map a su2 subgroup number to the pair of rows that are non zero //////////////////////////////////////////////////////////////////////// -static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { +static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); int spare = su2_index; diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h index 84c7278c..cfc48bbf 100644 --- a/Grid/qcd/utils/SUnAdjoint.h +++ b/Grid/qcd/utils/SUnAdjoint.h @@ -62,7 +62,7 @@ public: // returns i(T_Adj)^index necessary for the projectors // see definitions above iAdjTa = Zero(); - Vector > ta(ncolour * ncolour - 1); + iSUnMatrix ta[ncolour * ncolour - 1]; iSUnMatrix tmp; // FIXME not very efficient to get all the generators everytime diff --git a/Grid/qcd/utils/Sp2n.impl.h b/Grid/qcd/utils/Sp2n.impl.h index 4c660d3a..196aba7e 100644 --- a/Grid/qcd/utils/Sp2n.impl.h +++ b/Grid/qcd/utils/Sp2n.impl.h @@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix &ta) { // Map a su2 subgroup number to the pair of rows that are non zero //////////////////////////////////////////////////////////////////////// template -static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { +static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { const int nsp=ncolour/2; assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2)); diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index b6848977..66d25bc4 100644 --- 
a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -72,7 +72,7 @@ public: } // Resident in managed memory - Vector _entries; + deviceVector _entries; GeneralLocalStencil(GridBase *grid, const std::vector &shifts) { @@ -141,7 +141,7 @@ public: //////////////////////////////////////////////// // Store in look up table //////////////////////////////////////////////// - this->_entries[lex] = SE; + acceleratorPut(this->_entries[lex],SE); } }); } diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index dabd70a6..eca9cd3c 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -19,7 +19,7 @@ public: static int PartialCompressionFactor(GridBase *grid) {return 1;}; // Decompress is after merge so ok template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -35,7 +35,7 @@ public: rhs_v.ViewClose(); } template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -83,25 +83,6 @@ public: // Wilson compressor will add alternate policies for Dirichlet // and possibly partial Dirichlet for DWF //////////////////////////////////// -/* -class FaceGatherDirichlet -{ - // If it's dirichlet we don't assemble comms buffers - // - // Rely on zeroes in gauge field to drive the correct result - // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute - template - static void Gather_plane_simple (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so){}; - template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, - compressor 
&compress,int type) {} - template - static void Merge(decompressor decompress,Merge &mm) { } - template - static void Decompress(decompressor decompress,Decompression &dd) {} -}; -*/ template class SimpleCompressorGather : public FaceGather { diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 80acb4ae..1142891a 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -31,7 +31,6 @@ #define STENCIL_MAX (16) #include // subdir aggregate -#include // subdir aggregate #include ////////////////////////////////////////////////////////////////////////////////////////// @@ -122,17 +121,22 @@ class CartesianStencilAccelerator { StencilVector same_node; Coordinate _simd_layout; Parameters parameters; + ViewMode mode; StencilEntry* _entries_p; + StencilEntry* _entries_host_p; cobj* u_recv_buf_p; cobj* u_send_buf_p; accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) const { - return this->_entries_p[point+this->_npoints*osite]._is_local; + // Not a device function + inline int GetNodeLocal(int osite,int point) const { + StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite]; + return SE._is_local; } accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { - ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; + ptype = this->_permute_type[point]; + return & this->_entries_p[point+this->_npoints*osite]; } accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { @@ -165,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator &refer_to_me,ViewMode _mode) - : CartesianStencilAccelerator(refer_to_me), - cpu_ptr(this->_entries_p), - mode(_mode) + : CartesianStencilAccelerator(refer_to_me) { - this->_entries_p =(StencilEntry *) - MemoryManager::ViewOpen(this->_entries_p, - this->_npoints*this->_osites*sizeof(StencilEntry), - mode, - 
AdviseDefault); + this->ViewOpen(_mode); + } + void ViewOpen(ViewMode _mode) + { + this->mode = _mode; } - void ViewClose(void) - { - MemoryManager::ViewClose(this->cpu_ptr,this->mode); - } + void ViewClose(void) { } }; @@ -256,7 +254,6 @@ protected: GridBase * _grid; public: GridBase *Grid(void) const { return _grid; } - LebesgueOrder *lo; //////////////////////////////////////////////////////////////////////// // Needed to conveniently communicate gparity parameters into GPU memory @@ -273,11 +270,11 @@ public: int face_table_computed; int partialDirichlet; int fullDirichlet; - std::vector > > face_table ; - Vector surface_list; + std::vector > > face_table ; + deviceVector surface_list; - stencilVector _entries; // Resident in managed memory - commVector _entries_device; // Resident in device memory + std::vector _entries; // Resident in host memory + deviceVector _entries_device; // Resident in device memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -366,11 +363,20 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + FlightRecorder::StepLog("Communicate begin"); // All GPU kernel tasks must complete // accelerator_barrier(); // All kernels should ALREADY be complete // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. 
-#ifdef ACCELERATOR_AWARE_MPI + for(int i=0;iStencilSendToRecvFromPrepare(MpiReqs, + Packets[i].send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } + acceleratorCopySynchronise(); for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -379,23 +385,6 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } -#else -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferMalloc(Packets[i].xbytes); - Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); - if ( Packets[i].do_send ) { - acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); - } - _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].host_send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].host_recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); - } -#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) 
for(int i=0;i > &reqs) { + FlightRecorder::StepLog("Start communicate complete"); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete + // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // accelerator_barrier(); - _grid->StencilBarrier(); -#ifndef ACCELERATOR_AWARE_MPI -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferFreeAll(); -#endif - // run any checksums for(int i=0;iStencilBarrier();// Synch shared memory on a single nodes face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); } @@ -651,10 +632,10 @@ public: //////////////////////////////////////// void PrecomputeByteOffsets(void){ for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); + if( this->_entries[i]._is_local ) { + this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj); } else { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); + this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj); } } }; @@ -668,7 +649,7 @@ public: for(int point=0;point_npoints;point++){ this->same_node[point] = this->SameNode(point); } - + int32_t surface_list_size=0; for(int site = 0 ;site< vol4;site++){ int local = 1; for(int point=0;point_npoints;point++){ @@ -678,11 +659,30 @@ public: } if(local == 0) { for(int s=0;s surface_list_host(surface_list_size); + int32_t ss=0; + for(int site = 0 ;site< vol4;site++){ + int local = 1; + for(int point=0;point_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + for(int s=0;s_osites = _grid->oSites(); _entries.resize(this->_npoints* this->_osites); - this->_entries_p = &_entries[0]; + 
_entries_device.resize(this->_npoints* this->_osites); + this->_entries_host_p = &_entries[0]; + this->_entries_p = &_entries_device[0]; + + std::cout << GridLogMessage << " Stencil object allocated for "<_osites + <<" sites table "<_entries_p<< " GridPtr "<<_grid<ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } PrecomputeByteOffsets(); + acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry)); } void Local (int point, int dimension,int shiftpm,int cbmask) @@ -1002,10 +1009,10 @@ public: for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ int idx=point+(lo+o+b)*this->_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._permute=permute; - _entries[idx]._is_local=1; - _entries[idx]._around_the_world=wrap; + this->_entries[idx]._offset =ro+o+b; + this->_entries[idx]._permute=permute; + this->_entries[idx]._is_local=1; + this->_entries[idx]._around_the_world=wrap; } o +=_grid->_slice_stride[dimension]; } @@ -1023,10 +1030,10 @@ public: if ( ocb&cbmask ) { int idx = point+(lo+o+b)*this->_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._is_local=1; - _entries[idx]._permute=permute; - _entries[idx]._around_the_world=wrap; + this->_entries[idx]._offset =ro+o+b; + this->_entries[idx]._is_local=1; + this->_entries[idx]._permute=permute; + this->_entries[idx]._around_the_world=wrap; } } @@ -1050,10 +1057,10 @@ public: for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ int idx=point+(so+o+b)*this->_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute=0; - _entries[idx]._around_the_world=wrap; + this->_entries[idx]._offset =offset+(bo++); + this->_entries[idx]._is_local=0; + this->_entries[idx]._permute=0; + this->_entries[idx]._around_the_world=wrap; } o +=_grid->_slice_stride[dimension]; } @@ -1070,10 +1077,10 @@ public: int 
ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { int idx = point+(so+o+b)*this->_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute =0; - _entries[idx]._around_the_world=wrap; + this->_entries[idx]._offset =offset+(bo++); + this->_entries[idx]._is_local=0; + this->_entries[idx]._permute =0; + this->_entries[idx]._around_the_world=wrap; } } o +=_grid->_slice_stride[dimension]; diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 95823de5..74d1f585 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -202,15 +202,15 @@ void acceleratorInit(void) #ifdef GRID_SYCL -cl::sycl::queue *theGridAccelerator; -cl::sycl::queue *theCopyAccelerator; +sycl::queue *theGridAccelerator; +sycl::queue *theCopyAccelerator; void acceleratorInit(void) { int nDevices = 1; - cl::sycl::gpu_selector selector; - cl::sycl::device selectedDevice { selector }; - theGridAccelerator = new sycl::queue (selectedDevice); - theCopyAccelerator = new sycl::queue (selectedDevice); + // sycl::gpu_selector selector; + // sycl::device selectedDevice { selector }; + theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); + theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. 
#ifdef GRID_SYCL_LEVEL_ZERO_IPC @@ -242,14 +242,14 @@ void acceleratorInit(void) gethostname(hostname, HOST_NAME_MAX+1); if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); - auto devices = cl::sycl::device::get_devices(); + auto devices = sycl::device::get_devices(); for(int d = 0;d().c_str()); + printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info().c_str()); #define GPU_PROP_FMT(prop,FMT) \ - printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); + printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); if ( world_rank == 0) { diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1cb56ddd..c28ca201 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -132,27 +132,17 @@ inline void cuda_mem(void) #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ { \ - int nt=acceleratorThreads(); \ - typedef uint64_t Iterator; \ - auto lambda = [=] accelerator \ - (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ - __VA_ARGS__; \ - }; \ - dim3 cu_threads(nsimd,acceleratorThreads(),1); \ - dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ - } -#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ - { \ - int nt=acceleratorThreads(); \ - typedef uint64_t Iterator; \ - auto lambda = [=] accelerator \ - (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ - __VA_ARGS__; \ - }; \ - dim3 cu_threads(nsimd,acceleratorThreads(),1); \ - dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - ProfileLambdaApply<<>>(num1,num2,nsimd,lambda); \ + if ( num1*num2 ) { \ + int nt=acceleratorThreads(); \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ + __VA_ARGS__; \ + }; \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ + dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ + } \ } #define accelerator_for6dNB(iter1, num1, \ @@ -175,19 +165,6 @@ inline void cuda_mem(void) } -#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ - { \ - int nt=acceleratorThreads(); \ - typedef uint64_t Iterator; \ - auto lambda = [=] accelerator \ - (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ - __VA_ARGS__; \ - }; \ - dim3 cu_threads(nsimd,acceleratorThreads(),1); \ - dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ - } - template __global__ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) { @@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) Lambda(x,y,z); } } -template __global__ -void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) -{ - // Weird permute is to make lane coalesce for large blocks - uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; - uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; - uint64_t z = threadIdx.x; - if ( (x < num1) && (y __global__ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, @@ -243,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, } \ } +inline void *acceleratorAllocHost(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMallocHost((void 
**)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err)); + assert(0); + } + return ptr; +} inline void *acceleratorAllocShared(size_t bytes) { void *ptr=NULL; @@ -264,8 +241,10 @@ inline void *acceleratorAllocDevice(size_t bytes) } return ptr; }; + inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; +inline void acceleratorFreeHost(void *ptr){ cudaFreeHost(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} @@ -302,7 +281,7 @@ NAMESPACE_END(Grid); // Force deterministic reductions #define SYCL_REDUCTION_DETERMINISTIC -#include +#include #include #include #include @@ -314,8 +293,8 @@ inline void acceleratorMem(void) std::cout <<" SYCL acceleratorMem not implemented"<>()[2]; + return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; #else return 0; #endif } // SYCL specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ + theGridAccelerator->submit([&](sycl::handler &cgh) { \ unsigned long nt=acceleratorThreads(); \ if(nt < 8)nt=8; \ unsigned long unum1 = num1; \ unsigned long unum2 = num2; \ unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ - cl::sycl::range<3> local {nt,1,nsimd}; \ - cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ + sycl::range<3> local {nt,1,nsimd}; \ + sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + sycl::nd_range<3>(global,local), \ + [=] (sycl::nd_item<3> item) /*mutable*/ \ [[intel::reqd_sub_group_size(16)]] \ { \ auto iter1 = item.get_global_id(0); \ @@ -356,12 +335,17 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; +inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; +inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } + inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} +inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); } +inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); } inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { 
theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} @@ -369,8 +353,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele inline int acceleratorIsCommunicable(void *ptr) { #if 0 - auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); - if ( uvm = cl::sycl::usm::alloc::shared ) return 1; + auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); + if ( uvm = sycl::usm::alloc::shared ) return 1; else return 0; #endif return 1; @@ -472,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) } \ } +inline void *acceleratorAllocHost(size_t bytes) +{ + void *ptr=NULL; + auto err = hipMallocHost((void **)&ptr,bytes); + if( err != hipSuccess ) { + ptr = (void *) NULL; + fprintf(stderr," hipMallocHost failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr); + } + return ptr; +}; inline void *acceleratorAllocShared(size_t bytes) { void *ptr=NULL; @@ -495,12 +489,12 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; +inline void acceleratorFreeHost(void *ptr){ auto discard=hipHostFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} -//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, 
hipMemcpyDeviceToDevice);} -//inline void acceleratorCopySynchronise(void) { } + inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch @@ -517,15 +511,19 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize #endif +inline void acceleratorPin(void *ptr,unsigned long bytes) +{ +#ifdef GRID_SYCL + sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context()); +#endif +} + ////////////////////////////////////////////// // Common on all GPU targets ////////////////////////////////////////////// #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) // FIXME -- the non-blocking nature got broken March 30 2023 by PAB #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} ); -#define prof_accelerator_for( iter1, num1, nsimd, ... ) \ - prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\ - accelerator_barrier(dummy); #define accelerator_for( iter, num, nsimd, ... 
) \ accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \ @@ -574,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} #ifdef HAVE_MM_MALLOC_H +inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; +inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);}; inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);}; inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);}; #else diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 60d18fb6..139e7957 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail; int FlightRecorder::LoggingMode; int FlightRecorder::ChecksumComms; int FlightRecorder::ChecksumCommsSend; +const char * FlightRecorder::StepName; +int32_t FlightRecorder::StepLoggingCounter; int32_t FlightRecorder::XmitLoggingCounter; int32_t FlightRecorder::RecvLoggingCounter; int32_t FlightRecorder::CsumLoggingCounter; @@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void) CsumLoggingCounter=0; NormLoggingCounter=0; ReductionLoggingCounter=0; + StepName = "No steps started"; + StepLoggingCounter=0; } void FlightRecorder::Truncate(void) { @@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) assert(0); } } +bool FlightRecorder::StepLog(const char *name) +{ + StepName = name; + StepLoggingCounter ++; + return true; +} void FlightRecorder::SetLoggingModePrint(void) { @@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void) { return ErrorCounter; } -void FlightRecorder::NormLog(double value) +bool FlightRecorder::NormLog(double 
value) { uint64_t hex = * ( (uint64_t *)&value ); if(LoggingMode == LoggingModePrint) { std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "< 0); - } -#endif - if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ - arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); - GridCmdOptionIntVector(arg,LebesgueOrder::Block); - } if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){ GridLogTimestamp(0); } else { @@ -573,8 +549,34 @@ void GridLogLayout() { void * Grid_backtrace_buffer[_NBACKTRACE]; +void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr) +{ + fprintf(stderr,"Signal handler on host %s\n",hostname); + fprintf(stderr,"FlightRecorder step %d stage %s \n", + FlightRecorder::StepLoggingCounter, + FlightRecorder::StepName); + fprintf(stderr,"Caught signal %d\n",si->si_signo); + fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); + fprintf(stderr," code %d\n",si->si_code); + // x86 64bit +#ifdef __linux__ +#ifdef __x86_64__ + ucontext_t * uc= (ucontext_t *)ptr; + struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; + fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); +#endif +#endif + fflush(stderr); + BACKTRACEFP(stderr); + fprintf(stderr,"Called backtrace\n"); + fflush(stdout); + fflush(stderr); + return; +} + void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { + fprintf(stderr,"Signal handler on host %s\n",hostname); fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," code %d\n",si->si_code); @@ -585,7 +587,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) ucontext_t * uc= (ucontext_t *)ptr; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); -#define REG(A) printf(" %s %lx\n",#A,sc-> A); +#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A); REG(rdi); REG(rsi); REG(rbp); @@ 
-618,8 +620,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_exit_handler(void) { - BACKTRACEFP(stdout); - fflush(stdout); + // BACKTRACEFP(stdout); + // fflush(stdout); } void Grid_debug_handler_init(void) { @@ -627,10 +629,10 @@ void Grid_debug_handler_init(void) sigemptyset (&sa.sa_mask); sa.sa_sigaction= Grid_sa_signal_handler; sa.sa_flags = SA_SIGINFO; - sigaction(SIGSEGV,&sa,NULL); + // sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); - sigaction(SIGUSR2,&sa,NULL); + // sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); @@ -638,7 +640,14 @@ void Grid_debug_handler_init(void) sigaction(SIGKILL,&sa,NULL); sigaction(SIGILL,&sa,NULL); - atexit(Grid_exit_handler); + // Non terminating SIGUSR1/2 handler + struct sigaction sa_ping; + sigemptyset (&sa_ping.sa_mask); + sa_ping.sa_sigaction= Grid_usr_signal_handler; + sa_ping.sa_flags = SA_SIGINFO; + sigaction(SIGHUP,&sa_ping,NULL); + + // atexit(Grid_exit_handler); } NAMESPACE_END(Grid); diff --git a/Makefile.am b/Makefile.am index d2a1a326..9addcbf5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,5 +1,5 @@ # additional include paths necessary to compile the C++ library -SUBDIRS = Grid HMC benchmarks tests examples +SUBDIRS = Grid benchmarks tests examples HMC include $(top_srcdir)/doxygen.inc diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 2b1f6261..c42136b6 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -644,11 +644,6 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); -#ifdef KNL - LebesgueOrder::Block = std::vector({8,2,2,2}); -#else - LebesgueOrder::Block = std::vector({2,2,2,2}); -#endif Benchmark::Decomposition(); int do_su4=1; diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index ce4fcfab..cbe1ee23 100644 --- 
a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -52,7 +52,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); - int Ls=16; + int Ls=8; for(int i=0;i> Ls; diff --git a/benchmarks/Benchmark_memory_asynch.cc b/benchmarks/Benchmark_memory_asynch.cc index 97825144..4c27fc2c 100644 --- a/benchmarks/Benchmark_memory_asynch.cc +++ b/benchmarks/Benchmark_memory_asynch.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(std::vector({56,17,89,101})); std::vector stop(threads); - Vector sum(threads); + std::vector sum(threads); std::vector x(threads,&Grid); for(int t=0;t diag = Dw.bs; - Vector upper= Dw.cs; - Vector lower= Dw.cs; + std::vector diag = Dw.bs; + std::vector upper= Dw.cs; + std::vector lower= Dw.cs; upper[Ls-1]=-Dw.mass_minus*upper[Ls-1]; lower[0] =-Dw.mass_plus*lower[0]; diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index 0fdf2fbd..e400138b 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -118,7 +118,7 @@ public: fprintf(FP,"Packet bytes, direction, GB/s per node\n"); for(int lat=16;lat<=maxlat;lat+=8){ // for(int Ls=8;Ls<=8;Ls*=2){ - { int Ls=12; + { int Ls=8; Coordinate latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -861,7 +861,7 @@ int main (int argc, char ** argv) } CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); - LebesgueOrder::Block = std::vector({2,2,2,2}); + // LebesgueOrder::Block = std::vector({2,2,2,2}); Benchmark::Decomposition(); @@ -872,7 +872,7 @@ int main (int argc, char ** argv) int do_dslash=1; int sel=4; - std::vector L_list({8,12,16,24,32}); + std::vector L_list({8,12,16,24}); int selm1=sel-1; std::vector clover; diff --git a/configure.ac b/configure.ac index 8e8d67af..e4b553bf 100644 --- a/configure.ac +++ b/configure.ac @@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h) AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(endian.h) 
AC_CHECK_HEADERS(execinfo.h) +AC_CHECK_HEADERS(numaif.h) AC_CHECK_DECLS([ntohll],[], [], [[#include ]]) AC_CHECK_DECLS([be64toh],[], [], [[#include ]]) @@ -128,6 +129,20 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; esac +############### internal reduction +AC_ARG_ENABLE([reduction], + [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])], + [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid]) + +case ${ac_REDUCTION} in + mpi) + ;; + grid) + AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);; + *) + AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);; +esac + ############### tracing AC_ARG_ENABLE([tracing], [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])], @@ -225,19 +240,21 @@ case ${ac_SFW_FP16} in AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; esac -############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons + +############### MPI BOUNCE TO HOST AC_ARG_ENABLE([accelerator-aware-mpi], [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) +# Force accelerator CSHIFT now +AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device]) + case ${ac_ACCELERATOR_AWARE_MPI} in yes) - AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; *);; esac - ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], @@ -664,16 +681,6 @@ case ${ac_SHM_FAST_PATH} in *) ;; esac -############### communication type selection -AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) - -case 
${ac_COMMS_THREADS} in - yes) - AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] ) - ;; - *) ;; -esac - ############### communication type selection AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) diff --git a/examples/Example_taku.cc b/examples/Example_taku.cc deleted file mode 100644 index b9ad272e..00000000 --- a/examples/Example_taku.cc +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Warning: This code illustrative only: not well tested, and not meant for production use - * without regression / tests being applied - */ - -#include - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - 
typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-8,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=3; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, - {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, - {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} - }; - - Gamma 
G5(Gamma::Algebra::Gamma5); - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=1.8; - if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - RealD P=0.5871119; // 48I - // RealD P=0.6153342; // 64I - // RealD P=0.6388238 // 32Ifine - RealD u0 = sqrt(sqrt(P)); - RealD M5mf = M5 - 4.0*(1.0-u0); - RealD w0 = 1.0 - M5mf; -#if 0 - // M5=1.8 with U=u0 - Umu = Umu * u0; - LLscale = 1.0; - LCscale = 1.0; - std::cout< PointProps(nmass,UGrid); - // std::vector GaussProps(nmass,UGrid); - // std::vector Z2Props (nmass,UGrid); - - for(int m=0;m - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate 
mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = source.Grid(); - GridBase *FGrid = D.FermionGrid(); - bool fiveD = true; //calculate 5d free propagator - RealD mass = D.Mass(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - LatticeFermion result5(FGrid); - LatticeFermion src5(FGrid); - LatticePropagator prop5(FGrid); - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - D.FreePropagator(src5,result5,mass,true); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - - LatticePropagator Vector_mu(UGrid); - LatticeComplex VV (UGrid); - std::vector sumVV; - Gamma::Algebra GammaV[3] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ - }; - for( int mu=0;mu<3;mu++ ) { - Gamma gV(GammaV[mu]); - D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); - VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current - sliceSum(VV,sumVV,Tdir); - int Nt = sumVV.size(); - for(int t=0;t -void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - bool fiveD = false; //calculate 4d free propagator - RealD mass = D.Mass(); - GridBase *UGrid = source.Grid(); - LatticeFermion 
src4 (UGrid); - LatticeFermion result4 (UGrid); - for(int s=0;s(src4,source,s,c); - D.FreePropagator(src4,result4,mass,false); - FermToProp(propagator,result4,s,c); - } - } -} - -template -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-10,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=4; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5}, - {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5}, - {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5}, - {Gamma::Algebra::Identity,Gamma::Algebra::Identity} - }; - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - 
std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=atof(getenv("M5")); - RealD mq = atof(getenv("mass")); - int tadpole = atof(getenv("tadpole")); - std::vector masses({ mq} ); // u/d, s, c ?? - if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - // RealD P=0.6388238 // 32Ifine - // RealD P=0.6153342; // 64I - RealD P=0.5871119; // 48I - RealD u0 = sqrt(sqrt(P)); - RealD w0 = 1 - M5; - std::cout< boundary = {1,1,1,-1}; - FermionActionD::ImplParams Params(boundary); - RealD b=1.5; - RealD c=0.5; - std::cout< PointProps(nmass,UGrid); - // std::vector FreeProps(nmass,UGrid); - // LatticePropagator delta(UGrid); - - for(int m=0;m - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - 
auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = source.Grid(); - GridBase *FGrid = D.FermionGrid(); - bool fiveD = true; //calculate 4d free propagator - RealD mass = D.Mass(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - LatticeFermion result5(FGrid); - LatticeFermion src5(FGrid); - LatticePropagator prop5(FGrid); - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - D.FreePropagator(src5,result5,mass,true); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - - LatticePropagator Vector_mu(UGrid); - LatticeComplex VV (UGrid); - std::vector sumVV; - Gamma::Algebra GammaV[3] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ - }; - for( int mu=0;mu<3;mu++ ) { - Gamma gV(GammaV[mu]); - D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); - VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current - sliceSum(VV,sumVV,Tdir); - int Nt = sumVV.size(); - for(int t=0;t -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - 
LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-6,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=3; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, - {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, - // {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} - {Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5} - }; - - Gamma G5(Gamma::Algebra::Gamma5); - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=atof(getenv("M5")); - RealD mq = atof(getenv("mass")); - std::vector masses({ mq} ); // u/d, s, c ?? 
- if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - // RealD P=0.6153342; // 64I - // RealD P=0.6388238 // 32Ifine - // RealD P=0.5871119; // 48I - // RealD u0 = sqrt(sqrt(P)); - // Umu = Umu * u0; - RealD w0 = 1 - M5; - LLscale = 1.0/(1-w0*w0)/(1-w0*w0); - LCscale = 1.0/(1-w0*w0)/(1-w0*w0); - std::cout< PointProps(nmass,UGrid); - std::vector FreeProps(nmass,UGrid); - LatticePropagator delta(UGrid); - - for(int m=0;m pingjob < command-line +env > environment +$CMD +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs index b53327f0..95141bfe 100644 --- a/systems/Aurora/benchmarks/bench1.pbs +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -1,27 +1,24 @@ #!/bin/bash +##PBS -q EarlyAppAccess #PBS -q debug #PBS -l select=1 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA -#export OMP_PROC_BIND=spread -#unset OMP_PLACES - cd $PBS_O_WORKDIR source ../sourceme.sh -module load pti-gpu -#cat $PBS_NODEFILE +cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 -export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPICH_OFI_NIC_POLICY=GPU +#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST - #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 @@ -29,39 +26,11 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 -export MPICH_OFI_NIC_POLICY=GPU - -# 12 ppn, 2 nodes, 24 ranks -# -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_usqcd --mpi 
1.1.1.1 --grid 24.32.32.24 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD | tee usqcd.log - - -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -#$CMD | tee 1tile.dwf CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.32.32.32.48.dwf + ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 " - -CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.64.64.32.96.dwf - -CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.64.32.32.48.dwf +echo $CMD +$CMD diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index ea469cda..aebed04e 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -1,55 +1,48 @@ #!/bin/bash -#PBS -q workq +##PBS -q EarlyAppAccess +#PBS -q debug #PBS -l select=2 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA -#export OMP_PROC_BIND=spread -#unset OMP_PLACES - cd $PBS_O_WORKDIR source ../sourceme.sh -module load pti-gpu -#cat $PBS_NODEFILE +cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 -export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPICH_OFI_NIC_POLICY=GPU +#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 -export MPICH_OFI_NIC_POLICY=GPU - -# 12 ppn, 2 nodes, 24 ranks # +# Local vol 16.16.16.32 +# + +#VOL=32.64.64.96 + +for VOL in 32.32.32.96 32.64.64.96 +do +for AT in 32 +do CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD | tee 2node.comms + ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap " - -CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 2node.32.32.64.48.dwf - - -CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 2node.64.64.64.96.dwf +echo $CMD +$CMD +done +done diff --git 
a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile.sh similarity index 63% rename from systems/Aurora/benchmarks/gpu_tile_compact.sh rename to systems/Aurora/benchmarks/gpu_tile.sh index 099a0ded..a622ba3e 100755 --- a/systems/Aurora/benchmarks/gpu_tile_compact.sh +++ b/systems/Aurora/benchmarks/gpu_tile.sh @@ -4,10 +4,12 @@ #export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); #export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) -export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ); +export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ); +export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 ); export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) -export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]} export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} unset EnableWalkerPartition @@ -17,18 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 -#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 +#export MPI_BUF_NUMA=$NUMAH + echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " if [ $PALS_RANKID = "0" ] then -# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@" -# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" - numactl -m $NUMA -N $NUMA "$@" + numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" +# numactl -p $NUMAP -N $NUMAP "$@" else - numactl -m $NUMA -N $NUMA "$@" + numactl -p $NUMAP -N $NUMAP "$@" fi diff --git 
a/systems/Aurora/config-command b/systems/Aurora/config-command index 5b4e378c..6e5512ff 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,8 +1,15 @@ +#Ahead of time compile for PVC + +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/" + +#JIT compile +#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions " ../../configure \ --enable-simd=GPU \ + --enable-reduction=grid \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ --enable-debug \ @@ -11,7 +18,7 @@ export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include - --with-lime=$CLIME \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-accelerator-aware-mpi=yes\ + --enable-accelerator-aware-mpi=no\ --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7abe667f..89126f5b 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ 
-1,40 +1,16 @@ +#module load oneapi/release/2023.12.15.001 +#module load mpich/icc-all-debug-pmix-gpu/52.2 +#module load mpich-config/mode/deterministic +#module load intel_compute_runtime/release/821.35 +module load pti-gpu + source ~/spack/share/spack/setup-env.sh spack load c-lime +spack load openssl export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` -#spack load libefence -#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` -#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH -#spack load gperftools -export TCMALLOC=/home/paboyle/gperftools/install -export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH -export INTELGT_AUTO_ATTACH_DISABLE=1 - -#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -#module load oneapi/release/2023.12.15.001 -#module use /soft/modulefiles -#module load intel_compute_runtime/release/agama-devel-682.22 - -#export FI_CXI_DEFAULT_CQ_SIZE=131072 -#export FI_CXI_CQ_FILL_PERCENT=20 -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" - -# -# -ftarget-register-alloc-mode=pvc:default -# -ftarget-register-alloc-mode=pvc:small -# -ftarget-register-alloc-mode=pvc:large -# -ftarget-register-alloc-mode=pvc:auto -#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 - export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 git config --global http.proxy http://proxy.alcf.anl.gov:3128 - -#source ~/spack/share/spack/setup-env.sh -#spack load gperftools -#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` -#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH - export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs index fa37ae09..5d5314c1 100644 --- 
a/systems/Aurora/tests/repro16.pbs +++ b/systems/Aurora/tests/repro16.pbs @@ -2,7 +2,8 @@ ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 -#PBS -l select=16:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=01:00:00 #PBS -N dwf @@ -13,19 +14,14 @@ cd $PBS_O_WORKDIR -#source ../sourceme.sh +source ../sourceme.sh cat $PBS_NODEFILE -#export MPICH_COLL_SYNC=1 -#export MPICH_ENV_DISPLAY=1 -export MPICH_ export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu -#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH +#module load mpich/51.2/icc-all-deterministic-pmix-gpu #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs index 283a9343..f8e52705 100644 --- a/systems/Aurora/tests/repro1gpu.pbs +++ b/systems/Aurora/tests/repro1gpu.pbs @@ -1,6 +1,7 @@ #!/bin/bash -#PBS -l select=16:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 #PBS -N repro1gpu @@ -9,8 +10,9 @@ #export OMP_PROC_BIND=spread #unset OMP_PLACES -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu + +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 @@ -34,6 +36,8 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" cd $PBS_O_WORKDIR +source ../sourceme.sh + NN=`cat $PBS_NODEFILE | wc -l` echo $PBS_NODEFILE cat $PBS_NODEFILE diff --git a/systems/Aurora/tests/reproBigJob.pbs 
b/systems/Aurora/tests/reproBigJob.pbs new file mode 100644 index 00000000..1d880f0d --- /dev/null +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -0,0 +1,74 @@ +#!/bin/bash + +#PBS -l select=32 +#PBS -q EarlyAppAccess +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproBigJob +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +cp $PBS_NODEFILE nodefile + +DIR=reproBigJob.$PBS_JOBID + +mkdir -p $DIR +cd $DIR + +cp $PBS_NODEFILE nodefile + +BINARY=../Test_dwf_mixedcg_prec + +echo > pingjob < command-line +env > environment +$CMD +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* diff --git a/systems/Aurora/tests/reproN.pbs 
b/systems/Aurora/tests/reproN.pbs index 293e7ade..be10558b 100644 --- a/systems/Aurora/tests/reproN.pbs +++ b/systems/Aurora/tests/reproN.pbs @@ -1,6 +1,7 @@ #!/bin/bash -#PBS -l select=32:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 #PBS -N reproN @@ -9,8 +10,8 @@ #export OMP_PROC_BIND=spread #unset OMP_PLACES -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 diff --git a/systems/Linux-cuda/config-command b/systems/Linux-cuda/config-command new file mode 100644 index 00000000..94e2287c --- /dev/null +++ b/systems/Linux-cuda/config-command @@ -0,0 +1,18 @@ +../../configure \ + --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-shm=nvlink \ + --with-lime=$CLIME \ + --with-hdf5=$HDF5 \ + --with-fftw=$FFTW \ + --with-gmp=$GMP \ + --with-mpfr=$MPFR \ + --enable-accelerator=cuda \ + --disable-gparity \ + --disable-fermion-reps \ + --disable-unified \ + CXX=nvcc \ + LDFLAGS="-cudart shared -L$NVIDIALIB -lcublas" \ + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared" + diff --git a/systems/Linux-cuda/sourceme.sh b/systems/Linux-cuda/sourceme.sh new file mode 100644 index 00000000..207a1371 --- /dev/null +++ b/systems/Linux-cuda/sourceme.sh @@ -0,0 +1,16 @@ +. 
/home/paboyle/spack/share/spack/setup-env.sh +spack load cuda@12.0.0 +spack load c-lime +spack load gmp +spack load mpfr +spack load hdf5 +spack load fftw +spack load openmpi +export FFTW=`spack find --paths fftw | grep fftw | cut -c 14-` +export HDF5=`spack find --paths hdf5 | grep hdf5 | cut -c 14-` +export CUDA=`spack find --paths cuda@11.8.0 | grep cuda | cut -c 14-` +export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-` +export GMP=`spack find --paths gmp | grep gmp | cut -c 12-` +export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-` +export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/ +export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH:$HDF5/lib:$FFTW/lib:$CLIME/lib/:$MPFR/lib diff --git a/systems/Lumi/config-command b/systems/Lumi/config-command index 5e596285..76854edc 100644 --- a/systems/Lumi/config-command +++ b/systems/Lumi/config-command @@ -1,7 +1,7 @@ spack load c-lime spack load gmp spack load mpfr -CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-` +CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-` GMP=`spack find --paths gmp | grep gmp | cut -c 12-` MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-` echo clime X$CLIME diff --git a/tests/Test_dwf_dslash_repro.cc b/tests/Test_dwf_dslash_repro.cc new file mode 100644 index 00000000..1bf813d9 --- /dev/null +++ b/tests/Test_dwf_dslash_repro.cc @@ -0,0 +1,239 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_cg_prec.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + +typedef LatticeFermionD FermionField; + +int VerifyOnDevice(const FermionField &res, FermionField &ref) +{ + deviceVector Fails(1); + int * Fail = &Fails[0]; + int FailHost=0; + + typedef typename FermionField::vector_object vobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const uint64_t NN = res.Grid()->oSites(); + + acceleratorPut(*Fail,FailHost); + + accelerator_barrier(); + // Inject an error + + int injection=0; + if(getenv("GRID_ERROR_INJECT")) injection=1; + autoView(res_v,res,AcceleratorWrite); + autoView(ref_v,ref,AcceleratorRead); + if ( res.Grid()->ThisRank()== 0 ) + { + if (((random()&0xF)==0)&&injection) { + uint64_t sF = random()%(NN); + int lane=0; + printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank()); + auto vv = acceleratorGet(res_v[sF]); + double *dd = (double *)&vv; + *dd=M_PI; + acceleratorPut(res_v[sF],vv); + } + } + + accelerator_for( sF, NN, vobj::Nsimd(), { +#ifdef GRID_SIMT + { + int blane = acceleratorSIMTlane(vobj::Nsimd()); +#else + for(int blane;blaneoSites(); + + /////////////////////////////// + // Pull back to host + /////////////////////////////// + autoView(res_v,res,CpuRead); + 
autoView(ref_v,ref,CpuRead); + + std::vector ids_host(NN*Nsimd); + + acceleratorCopyFromDevice(ids,&ids_host[0],NN*Nsimd*sizeof(uint64_t)); + + ////////////////////////////////////////////////////////////// + // Redo check on host and print IDs + ////////////////////////////////////////////////////////////// + + for(int ss=0;ss< NN; ss++){ + int sF = ss; + for(int lane=0;lane>0 )&0xFF; + int slice =(id>>8 )&0xFF; + int eu =(id>>16)&0xFF; + std::cout << GridHostname()<<" miscompare site "< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD junk(FGrid); random(RNG5,junk); + + LatticeFermionD result(FGrid); result=Zero(); + LatticeFermionD ref(FGrid); ref=Zero(); + + SU::HotConfiguration(RNG4,Umu); + + RealD mass=0.1; + RealD M5=1.8; + + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl; + UGrid->Barrier(); + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl; + + std::cout << GridLogMessage << "::::::::::::: Starting DWF repro for "<Broadcast(0,(void *)&start,sizeof(start)); + + FlightRecorder::ContinueOnFail = 0; + FlightRecorder::PrintEntireLog = 0; + FlightRecorder::ChecksumComms = 0; + FlightRecorder::ChecksumCommsSend=0; + + if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s); + + const uint64_t NN = 
FGrid->oSites()*vComplexD::Nsimd(); + + deviceVector ids_device(NN); + uint64_t *ids = &ids_device[0]; + + + Ddwf.DhopComms(src,ref); + Ddwf.DhopCalc(src,ref,ids); + + Ddwf.DhopComms(src,result); + + int iter=0; + do { + + result=junk; + + Ddwf.DhopCalc(src,result,ids); + + if ( VerifyOnDevice(result, ref) ) { + printf("Node %s Iter %d detected fails\n",GridHostname(),iter); + PrintFails(result,ref,ids); + // std::cout << " Dslash "<Broadcast(0,(void *)&now,sizeof(now)); + } while (now < (start + nsecs) ); + + + Grid_finalize(); +} diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index f37696a8..97bf5143 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -124,6 +124,8 @@ int main (int argc, char ** argv) SchurDiagMooeeOperatorParanoid HermOpEO(Ddwf); SchurDiagMooeeOperatorParanoid HermOpEO_f(Ddwf_f); + // SchurDiagMooeeOperator HermOpEO(Ddwf); + // SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); int nsecs=600; if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ @@ -131,6 +133,10 @@ int main (int argc, char ** argv) GridCmdOptionInt(arg,nsecs); } + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl; + UGrid->Barrier(); + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl; + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "< mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); @@ -148,7 +154,7 @@ int main (int argc, char ** argv) FlightRecorder::ContinueOnFail = 0; FlightRecorder::PrintEntireLog = 0; - FlightRecorder::ChecksumComms = 1; + FlightRecorder::ChecksumComms = 0; FlightRecorder::ChecksumCommsSend=0; if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); @@ -180,7 +186,7 @@ int main (int argc, char ** argv) iter ++; now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); } while (now < (start + nsecs/10) ); - + std::cout << GridLogMessage << "::::::::::::: Starting double 
precision CG" << std::endl; ConjugateGradient CG(1.0e-8,10000); int i=0; diff --git a/tests/Test_meson_field.cc b/tests/Test_meson_field.cc index 17829046..fa428d6a 100644 --- a/tests/Test_meson_field.cc +++ b/tests/Test_meson_field.cc @@ -31,7 +31,7 @@ See the full license in the file "LICENSE" in the top level distribution directo using namespace Grid; const int TSRC = 0; //timeslice where rho is nonzero -const int VDIM = 5; //length of each vector +const int VDIM = 8; //length of each vector typedef typename DomainWallFermionD::ComplexField ComplexField; typedef typename DomainWallFermionD::FermionField FermionField; @@ -55,19 +55,26 @@ int main(int argc, char *argv[]) pRNG.SeedFixedIntegers(seeds); // MesonField lhs and rhs vectors + const int Nem=1; std::vector phi(VDIM,&grid); - std::vector rho(VDIM,&grid); - FermionField rho_tmp(&grid); + std::vector B0(Nem,&grid); + std::vector B1(Nem,&grid); std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; for (unsigned int i = 0; i < VDIM; ++i){ random(pRNG,phi[i]); - random(pRNG,rho_tmp); //ideally only nonzero on t=0 - rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0 + } + for (unsigned int i = 0; i < Nem; ++i){ + random(pRNG,B0[i]); + random(pRNG,B1[i]); } std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; // Gamma matrices used in the contraction std::vector Gmu = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, @@ -78,11 +85,15 @@ int main(int argc, char *argv[]) std::vector> momenta = { {0.,0.,0.}, {1.,0.,0.}, + {-1.,0.,0.}, + {0,1.,0.}, + {0,-1.,0.}, + {0,0,1.}, + {0,0,-1.}, {1.,1.,0.}, {1.,1.,1.}, {2.,0.,0.} }; - std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." 
<< std::endl; std::cout << GridLogMessage << "Computing complex phases" << std::endl; @@ -102,28 +113,29 @@ int main(int argc, char *argv[]) std::cout << GridLogMessage << "Computing complex phases done." << std::endl; Eigen::Tensor Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); - Eigen::Tensor Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); - Eigen::Tensor Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + Eigen::Tensor App(B0.size(),1,Nt,VDIM,VDIM); // timer double start,stop; + ///////////////////////////////////////////////////////////////////////// //execute meson field routine + ///////////////////////////////////////////////////////////////////////// + A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); start = usecond(); A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); stop = usecond(); std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl; - start = usecond(); - /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices) - * to the routine so that all the compnents which are predictably equal to zero are not computed. 
*/ - A2Autils::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp); - stop = usecond(); - std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl; - start = usecond(); - A2Autils::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp); - stop = usecond(); - std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl; + ///////////////////////////////////////////////////////////////////////// + //execute aslash field routine + ///////////////////////////////////////////////////////////////////////// + A2Autils::AslashField(App,&phi[0],&phi[0],B0,B1,Tp); + start = usecond(); + A2Autils::AslashField(App,&phi[0],&phi[0],B0,B1,Tp); + stop = usecond(); + std::cout << GridLogMessage << "Alash(phi,phi) created, execution time " << stop-start << " us" << std::endl; + std::string FileName = "Meson_Fields"; #ifdef HAVE_HDF5 using Default_Reader = Grid::Hdf5Reader; @@ -134,12 +146,11 @@ int main(int argc, char *argv[]) using Default_Writer = Grid::BinaryWriter; FileName.append(".bin"); #endif - - Default_Writer w(FileName); - write(w,"phi_phi",Mpp); - write(w,"phi_rho",Mpr); - write(w,"rho_rho",Mrr); - + { + Default_Writer w(FileName); + write(w,"MesonField",Mpp); + write(w,"AslashField",App); + } // epilogue std::cout << GridLogMessage << "Grid is finalizing now" << std::endl; Grid_finalize(); diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 212b1a35..16ee5a0f 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -39,7 +39,7 @@ int main (int argc, char ** argv) std::cout< HermOp(Dov); + MdagMLinearOperator HermOp(Dov); ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src5,result5); + std::cout << " Solved by Conjugate Gradient (CGNE)" < qmu({1.0,0.0,0.0,0.0}); + Dov.set_qmu(qmu); // Momentum space prop std::cout << " Solving by FFT and Feynman rules" < HermOp(Dov); + MdagMLinearOperator HermOp(Dov); ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src5,result5); 
//////////////////////////////////////////////////////////////////////// diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index e5b6f75b..be50fd31 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -39,7 +39,8 @@ int main (int argc, char ** argv) std::cout< testAlgebra; @@ -148,11 +149,12 @@ void checkSigma(const GparityFlavour::Algebra a, GridSerialRNG &rng) test(m*g, m*testg); std::cout << std::endl; } +#endif int main(int argc, char *argv[]) { Grid_init(&argc,&argv); - +#ifdef ENABLE_GPARITY Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -170,7 +172,7 @@ int main(int argc, char *argv[]) checkSigma(i, sRNG); } std::cout << GridLogMessage << std::endl; - +#endif Grid_finalize(); return EXIT_SUCCESS; diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index c8587435..0f3c8aad 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -35,7 +35,7 @@ using namespace Grid; int main (int argc, char ** argv) { Grid_init(&argc,&argv); - +#ifdef ENABLE_GPARITY Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -216,6 +216,6 @@ int main (int argc, char ** argv) std::cout<oSites(),1,{ - assert(B[v]==A_v[ss]()()().getlane(0)); + // assert(B[v]==A_v[ss]()()().getlane(0)); }); // std::cout << "["< inline void sliceSumCPU(const Grid::Lattice &Data,std int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,Zero()); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,Zero()); // sum across these down to scalars ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the 
same vector to every node diff --git a/tests/core/Test_uvm.cc b/tests/core/Test_uvm.cc new file mode 100644 index 00000000..290aa975 --- /dev/null +++ b/tests/core/Test_uvm.cc @@ -0,0 +1,106 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_memory_manager.cc + + Copyright (C) 2022 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +const int64_t Pages=32; +const int64_t PageWords=4096/sizeof(ComplexD); +const int64_t VecWords=PageWords*Pages; +const int64_t N=10000; + +class Tester { +public: + Vector zero_uvm; + std::vector zero_host; + std::vector > A; + std::vector > B; + uint64_t counter; + + Tester() : + zero_uvm(VecWords,ComplexD(0.0)), + zero_host(VecWords,ComplexD(0.0)), + A(N,zero_uvm), + B(N,zero_host) + { counter = 0; } + + void MemoryTest(int N) + { + for(int epoch = 0;epoch<100000;epoch++){ + + int p = random() %Pages; // Which address/page to hit + int v = random() %N; // Which vec + int w = random() %2; // Write or read + int dev= random() %2; // On device? + // int e=1; + ComplexD zc = counter++; + + if ( w ) { + B[v][p*PageWords] = B[v][p*PageWords] + zc; + if ( dev ) { + ComplexD *A_v=&A[v][0]; + accelerator_for(ss,1,1,{ + A_v[p*PageWords] = A_v[p*PageWords] + zc; + }); + } else { + A[v][p*PageWords] = A[v][p*PageWords] + zc; + } + } else { + if ( dev ) { + ComplexD *A_v=&A[v][0]; + ComplexD ref = B[v][p*PageWords]; + std::cout << "Device compare "< See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ +#include +#include +#include + #include int Ls=8; double M5=1.6; double mq=0.01; -double zolo_lo = 0.1; -double zolo_hi = 2.0; +double zolo_lo = 0.01; +double zolo_hi = 7.0; double mobius_scale=2.0; enum ChromaAction { @@ -55,11 +58,6 @@ enum ChromaAction { void calc_grid (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag); void calc_chroma (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, 
Grid::LatticeFermion &res,int dag); -#include -#include -#include - - namespace Chroma { @@ -81,7 +79,7 @@ public: std::vector x(4); QDP::multi1d cx(4); - std::vector gd= gr.Grid()->GlobalDimensions(); + Grid::Coordinate gd = gr.Grid()->GlobalDimensions(); for (x[0]=0;x[0] x(5); QDP::multi1d cx(4); - std::vector gd= gr.Grid()->GlobalDimensions(); + Grid::Coordinate gd= gr.Grid()->GlobalDimensions(); for (x[0]=0;x[0] x(5); QDP::multi1d cx(4); - std::vector gd= gr.Grid()->GlobalDimensions(); + Grid::Coordinate gd= gr.Grid()->GlobalDimensions(); for (x[0]=0;x[0]OVEXT_CONSTANT_STRATEGY\n"; +"OVEXT_CONSTANT_STRATEGY1.0\n"; + UnprecOvExtFermActArray S_f(cfs,param); + Handle< FermState > fs( S_f.createState(u) ); + Handle< LinearOperatorArray > M(S_f.linOp(fs)); + return M; + } + if ( parms == HwPartFracTanh ) { + if ( Ls%2 == 0 ) { + printf("Ls is not odd\n"); + exit(-1); + } + UnprecOvExtFermActArrayParams param; + param.OverMass=M5; + param.Mass=_mq; + param.RatPolyDeg = Ls; + param.ApproxMin =eps_lo; + param.ApproxMax =eps_hi; + param.b5 =1.0; + param.c5 =1.0; + // param.approximation_type=COEFF_TYPE_ZOLOTAREV; + param.approximation_type=COEFF_TYPE_TANH_UNSCALED; + //param.approximation_type=COEFF_TYPE_TANH; + param.tuning_strategy_xml= + "OVEXT_CONSTANT_STRATEGY1.0\n"; UnprecOvExtFermActArray S_f(cfs,param); Handle< FermState > fs( S_f.createState(u) ); Handle< LinearOperatorArray > M(S_f.linOp(fs)); @@ -316,7 +337,35 @@ public: param.ApproxMin=eps_lo; param.ApproxMax=eps_hi; param.approximation_type=COEFF_TYPE_ZOLOTAREV; - param.RatPolyDeg=Ls; + param.RatPolyDeg=Ls-1; + // The following is why I think Chroma made some directional errors: + param.AuxFermAct= std::string( +"\n" +" UNPRECONDITIONED_WILSON\n" +" -1.8\n" +" 1\n" +" 0\n" +" 1000\n" +" 1.0e-9\n" +" \n" +" SIMPLE_FERMBC\n" +" 1 1 1 1\n" +" \n" +"" +); + param.AuxFermActGrp= std::string(""); + UnprecOvlapContFrac5DFermActArray S_f(fbc,param); + Handle< FermState > fs( S_f.createState(u) ); + Handle< 
LinearOperatorArray > M(S_f.linOp(fs)); + return M; + } + if ( parms == HwContFracTanh ) { + UnprecOvlapContFrac5DFermActParams param; + param.Mass=_mq; // How is M5 set? Wilson mass In AuxFermAct + param.ApproxMin=eps_lo; + param.ApproxMax=eps_hi; + param.approximation_type=COEFF_TYPE_TANH_UNSCALED; + param.RatPolyDeg=Ls-1; // The following is why I think Chroma made some directional errors: param.AuxFermAct= std::string( "\n" @@ -378,7 +427,14 @@ int main (int argc,char **argv ) * Setup QDP *********************************************************/ Chroma::initialize(&argc,&argv); - Chroma::WilsonTypeFermActs4DEnv::registerAll(); + // Chroma::WilsonTypeFermActs4DEnv::registerAll(); + Chroma::WilsonTypeFermActsEnv::registerAll(); + //bool linkageHack(void) + //{ + // bool foo = true; + // Inline Measurements + // InlineAggregateEnv::registerAll(); + // GaugeInitEnv::registerAll(); /******************************************************** * Setup Grid @@ -388,26 +444,34 @@ int main (int argc,char **argv ) Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()), Grid::GridDefaultMpi()); - std::vector gd = UGrid->GlobalDimensions(); + Grid::Coordinate gd = UGrid->GlobalDimensions(); QDP::multi1d nrow(QDP::Nd); for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu]; QDP::Layout::setLattSize(nrow); QDP::Layout::create(); - Grid::GridCartesian * FGrid = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); - Grid::LatticeGaugeField lat(UGrid); - Grid::LatticeFermion src(FGrid); - Grid::LatticeFermion res_chroma(FGrid); - Grid::LatticeFermion res_grid (FGrid); - std::vector ActionList({ HtCayleyTanh, // Plain old DWF. HmCayleyTanh, HwCayleyTanh, HtCayleyZolo, // Plain old DWF. HmCayleyZolo, - HwCayleyZolo + HwCayleyZolo, + HwPartFracZolo, + HwContFracZolo, + HwContFracTanh + }); + std::vector LsList({ + 8,//HtCayleyTanh, // Plain old DWF. + 8,//HmCayleyTanh, + 8,//HwCayleyTanh, + 8,//HtCayleyZolo, // Plain old DWF. 
+ 8,//HmCayleyZolo, + 8,//HwCayleyZolo, + 9,//HwPartFracZolo + 9, //HwContFracZolo + 9 //HwContFracTanh }); std::vector ActionName({ "HtCayleyTanh", @@ -415,10 +479,19 @@ int main (int argc,char **argv ) "HwCayleyTanh", "HtCayleyZolo", "HmCayleyZolo", - "HwCayleyZolo" + "HwCayleyZolo", + "HwPartFracZolo", + "HwContFracZolo", + "HwContFracTanh" }); for(int i=0;i::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); /* Grid::LatticeColourMatrix U(UGrid); @@ -519,7 +593,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF if ( action == HtCayleyTanh ) { - Grid::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5); + Grid::DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5); std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "<::HotConfiguration(RNG4, Umu); +#endif std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index 58614c49..7435bfae 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -54,15 +54,30 @@ int main (int argc, char ** argv) GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + std::vector qmu; + qmu.push_back(ComplexD(0.1,0.0)); + qmu.push_back(ComplexD(0.0,0.0)); + qmu.push_back(ComplexD(0.0,0.0)); + qmu.push_back(ComplexD(0.0,0.01)); + + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + LatticeFermion tmp(FGrid); LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); - + LatticeGaugeField Umu(UGrid); +#if 0 + FieldMetaData header; + std::string 
file("ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); +#else + SU::HotConfiguration(RNG4,Umu); +#endif + std::vector U(4,UGrid); for(int mu=0;mu(Umu,mu); @@ -71,8 +86,15 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5=1.8; DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + Ddwf.qmu = qmu; + Ddwf.M(src,tmp); + std::cout << " |M src|^2 "< HermOp(Ddwf); + HermOp.HermOp(src,tmp); + + std::cout << " "< CG(1.0e-6,10000); CG(HermOp,src,result); diff --git a/tests/sp2n/Test_2as_base.cc b/tests/sp2n/Test_2as_base.cc index 62e86609..3aeccae0 100644 --- a/tests/sp2n/Test_2as_base.cc +++ b/tests/sp2n/Test_2as_base.cc @@ -87,8 +87,8 @@ static void run_generators_checks() { typedef typename Sp_TwoIndex::template iGroupMatrix Matrix; int sum = 0; int sum_im = 0; - Vector ta_fund(this_algebra_dim); - Vector eij(this_irrep_dim); + std::vector ta_fund(this_algebra_dim); + std::vector eij(this_irrep_dim); Matrix tmp_l; Matrix tmp_r; for (int n = 0; n < this_algebra_dim; n++)