diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h index 181df320..21ac66b9 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -74,7 +74,7 @@ public: void operator() (const Field &src, Field &psi){ - psi=Zero(); + // psi=Zero(); RealD cp, ssq,rsq; ssq=norm2(src); rsq=Tolerance*Tolerance*ssq; diff --git a/Grid/algorithms/multigrid/Aggregates.h b/Grid/algorithms/multigrid/Aggregates.h index fb708b16..953e3020 100644 --- a/Grid/algorithms/multigrid/Aggregates.h +++ b/Grid/algorithms/multigrid/Aggregates.h @@ -30,6 +30,8 @@ Author: paboyle /* END LEGAL */ #pragma once +#include + NAMESPACE_BEGIN(Grid); inline RealD AggregatePowerLaw(RealD x) @@ -124,6 +126,53 @@ public: } } + virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase &DiracOp,int nn=nbasis) + { + RealD scale; + + TrivialPrecon simple_fine; + PrecGeneralisedConjugateResidualNonHermitian GCR(0.001,30,DiracOp,simple_fine,12,12); + FineField noise(FineGrid); + FineField src(FineGrid); + FineField guess(FineGrid); + FineField Mn(FineGrid); + + for(int b=0;b "< "< Cheb(lo,hi,orderfilter); Cheb(hermop,noise,Mn); // normalise scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; subspace[b] = Mn; - hermop.Op(Mn,tmp); - std::cout< "< "< "< "< "< "< inline bool operator!=(const devAllocator<_Tp>&, const d // Template typedefs //////////////////////////////////////////////////////////////////////////////// template using hostVector = std::vector >; // Needs autoview -template using Vector = std::vector >; // +template using Vector = std::vector >; // Really want to deprecate template using uvmVector = std::vector >; // auto migrating page template using deviceVector = std::vector >; // device vector +/* template class vecView { protected: @@ -214,6 +215,7 @@ template vecView 
VectorView(Vector &vec,ViewMode _mode) #define autoVecView(v_v,v,mode) \ auto v_v = VectorView(v,mode); \ ViewCloser _autoView##v_v(v_v); +*/ NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index b53e1510..eb8c6d38 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ]; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl; #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl; //#define dprintf(...) +//#define mprintf(...) //////////////////////////////////////////////////////////// // For caching copies of data on device @@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) DeviceBytes -=AccCache.bytes; LRUremove(AccCache); AccCache.AccPtr=(uint64_t) NULL; - dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock 
%ld\n", + mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld", (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); if (AccCache.accLock!=0) return; @@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)NULL; AccCache.state=CpuDirty; // CPU primary now DeviceBytes -=AccCache.bytes; - dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes); } // uint64_t CpuPtr = AccCache.CpuPtr; DeviceEvictions++; @@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n", + mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx", (uint64_t)AccCache.bytes, (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); @@ -210,7 +211,7 @@ void 
MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ - dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); + dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ - dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); + dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -265,7 +266,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n", + dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, @@ -305,7 +306,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; - dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); + dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -318,21 +319,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("CpuDirty entry into device ++accLock= 
%d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d",AccCache.accLock); } else { assert(0); } @@ -341,7 +342,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU - dprintf("AccCache entry removed from LRU \n"); + dprintf("AccCache entry removed from LRU "); LRUremove(AccCache); } @@ -364,10 +365,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) AccCache.accLock--; // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { - dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); + dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); } else { - dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); + dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t 
CpuPtr) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 964a523b..62fcfa7b 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -33,6 +33,8 @@ Author: Peter Boyle /////////////////////////////////// #include +#define NVLINK_GET + NAMESPACE_BEGIN(Grid); extern bool Stencil_force_mpi ; @@ -193,6 +195,11 @@ public: void *recv, int recv_from_rank,int do_recv, int xbytes,int rbytes,int dir); + + // Could do a PollHtoD and have a CommsMerge dependence + void StencilSendToRecvFromPollDtoH (std::vector &list); + void StencilSendToRecvFromPollIRecv(std::vector &list); + double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int do_xmit, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 7dc706df..38b9f9c6 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -30,6 +30,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); + Grid_MPI_Comm CartesianCommunicator::communicator_world; //////////////////////////////////////////// @@ -362,8 +363,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int bytes) { std::vector reqs(0); - unsigned long xcrc = crc32(0L, Z_NULL, 0); - unsigned long rcrc = crc32(0L, Z_NULL, 0); int myrank = _processor; int ierr; @@ -379,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, communicator,MPI_STATUS_IGNORE); assert(ierr==0); - // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); - // printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush } // Basic Halo comms primitive double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, @@ -399,6 +395,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, #ifdef ACCELERATOR_AWARE_MPI +void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector &list) {}; 
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector &list) {}; double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, void *xmit, int dest,int dox, @@ -561,53 +559,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(xbytes); - acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch + CommsRequest_t srq; + + srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); // assert(ierr==0); // off_node_bytes+=xbytes; - CommsRequest_t srq; srq.PacketType = InterNodeXmit; srq.bytes = xbytes; // srq.req = xrq; srq.host_buf = host_xmit; srq.device_buf = xmit; + srq.tag = tag; + srq.dest = dest; + srq.commdir = commdir; list.push_back(srq); -#else - tag= dir+_processor*32; - - host_xmit = this->HostBufferMalloc(xbytes); - const int chunks=1; - for(int n=0;n &list) +{ + int pending = 0; + do { + + pending = 0; + + for(int idx = 0; idx &list) +{ + int pending = 0; + do { + + pending = 0; + + for(int idx = 0; idx &list, void *xmit, @@ -644,69 +694,92 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorIsBoss() ) { - // printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes); - // printed++; - // } - + if ( ! 
( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) { + // Intranode + void *shm = (void *) this->ShmBufferTranslate(from,xmit); + assert(shm!=NULL); + + CommsRequest_t srq; + + srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); + + srq.PacketType = IntraNodeRecv; + srq.bytes = xbytes; + // srq.req = xrq; + srq.host_buf = NULL; + srq.device_buf = xmit; + srq.tag = -1; + srq.dest = dest; + srq.commdir = dir; + list.push_back(srq); + } + } +#else if (dox) { - if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { -#ifdef DEVICE_TO_HOST_CONCURRENT - tag= dir+_processor*32; - // Find the send in the prepared list - int list_idx=-1; - for(int idx = 0; idxShmBufferTranslate(dest,recv); assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); + + CommsRequest_t srq; + + srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); + + srq.PacketType = IntraNodeXmit; + srq.bytes = xbytes; + // srq.req = xrq; + srq.host_buf = NULL; + srq.device_buf = xmit; + srq.tag = -1; + srq.dest = dest; + srq.commdir = dir; + list.push_back(srq); + } } +#endif return off_node_bytes; } void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) { - int nreq=list.size(); + acceleratorCopySynchronise(); // Complete all pending copy transfers D2D - if (nreq==0) return; - std::vector status(nreq); - std::vector MpiRequests(nreq); + std::vector status; + std::vector MpiRequests; + + for(int r=0;r0) { + status.resize(MpiRequests.size()); + int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. + assert(ierr==0); } - int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); - assert(ierr==0); - - for(int r=0;rHostBufferFreeAll(); // Clean up the buffer allocs - this->StencilBarrier(); +#ifndef NVLINK_GET + this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers. 
+#endif } #endif //////////////////////////////////////////// diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 8e6206ef..3dee8f4d 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } -void CartesianCommunicator::CommsComplete(std::vector &list){ assert(0);} +void CartesianCommunicator::CommsComplete(std::vector &list){ assert(list.size()==0);} void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, int dest, @@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, { return 2.0*bytes; } +void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector &list) {}; +void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector &list) {}; double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, void *xmit, int xmit_to_rank,int dox, @@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, void *xmit, diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index 422be8aa..f8099a1d 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t; #ifdef ACCELERATOR_AWARE_MPI typedef MPI_Request CommsRequest_t; #else -enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv }; +/* + * Enable state transitions as each packet flows. 
+ */ +enum PacketType_t { + FaceGather, + InterNodeXmit, + InterNodeRecv, + IntraNodeXmit, + IntraNodeRecv, + InterNodeXmitISend, + InterNodeReceiveHtoD +}; +/* + *Package arguments needed for various actions along packet flow + */ typedef struct { PacketType_t PacketType; void *host_buf; void *device_buf; + int dest; + int tag; + int commdir; unsigned long bytes; + acceleratorEvent_t ev; MpiCommsRequest_t req; } CommsRequest_t; #endif diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 90052051..710792ee 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -68,7 +68,7 @@ template Lattice Cshift(const Lattice &rhs,int dimension if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"< void Cshift_comms(Lattice& ret,const Lattice &rhs,int dimension,int shift) { int sshift[2]; @@ -125,7 +125,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); - +#ifndef ACCELERATOR_AWARE_MPI + static hostVector hsend_buf; hsend_buf.resize(buffer_size); + static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); +#endif + int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); RealD tcopy=0.0; @@ -156,16 +160,29 @@ template void Cshift_comms(Lattice &ret,const Lattice &r // int rank = grid->_processor; int recv_from_rank; int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); grid->Barrier(); +#ifdef ACCELERATOR_AWARE_MPI grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); +#else + // bouncy bouncy + acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes); + grid->SendToRecvFrom((void *)&hsend_buf[0], + xmit_to_rank, + (void *)&hrecv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); +#endif + xbytes+=bytes; grid->Barrier(); tcomms+=usecond(); @@ -226,12 +243,17 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; - + + for(int s=0;s hsend_buf; hsend_buf.resize(buffer_size); + hostVector hrecv_buf; hrecv_buf.resize(buffer_size); +#endif + int bytes = buffer_size*sizeof(scalar_object); ExtractPointerArray pointers(Nsimd); // @@ -283,11 +305,22 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeSendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, (void *)recv_buf_extract_mpi, recv_from_rank, bytes); +#else + // bouncy bouncy + acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes); + grid->SendToRecvFrom((void *)&hsend_buf[0], + xmit_to_rank, + (void *)&hrecv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); +#endif xbytes+=bytes; grid->Barrier(); @@ -311,234 +344,6 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - typedef typename 
vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - GridBase *grid=rhs.Grid(); - Lattice temp(rhs.Grid()); - - int fd = rhs.Grid()->_fdimensions[dimension]; - int rd = rhs.Grid()->_rdimensions[dimension]; - int pd = rhs.Grid()->_processors[dimension]; - int simd_layout = rhs.Grid()->_simd_layout[dimension]; - int comm_dim = rhs.Grid()->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); - static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); - vobj *send_buf; - vobj *recv_buf; - { - grid->ShmBufferFreeAll(); - size_t bytes = buffer_size*sizeof(vobj); - send_buf=(vobj *)grid->ShmBufferMalloc(bytes); - recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); - } - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(vobj); - - tgather-=usecond(); - Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); - tgather+=usecond(); - - // int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); - grid->SendToRecvFrom((void *)&send_buf[0], - xmit_to_rank, - (void *)&recv_buf[0], - recv_from_rank, - bytes); - xbytes+=bytes; - acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - - // grid->Barrier(); - tcomms+=usecond(); - - tscatter-=usecond(); - Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); - tscatter+=usecond(); - } - } - if(Cshift_verbose){ - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int 
cbmask) -{ - GridBase *grid=rhs.Grid(); - const int Nsimd = grid->Nsimd(); - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - - int fd = grid->_fdimensions[dimension]; - int rd = grid->_rdimensions[dimension]; - int ld = grid->_ldimensions[dimension]; - int pd = grid->_processors[dimension]; - int simd_layout = grid->_simd_layout[dimension]; - int comm_dim = grid->_processors[dimension] >1 ; - - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; - // int words = sizeof(vobj)/sizeof(vector_type); - - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); - scalar_object * recv_buf_extract_mpi; - scalar_object * send_buf_extract_mpi; - { - size_t bytes = sizeof(scalar_object)*buffer_size; - grid->ShmBufferFreeAll(); - send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - } - for(int s=0;s pointers(Nsimd); // - ExtractPointerArray rpointers(Nsimd); // received pointers - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - int cb = (cbmask==0x2)? Odd : Even; - int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x>(permute_type+1)); - int ic= (i&inner_bit)? 
1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer - int nbr_ox = (nbr_coor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - int recv_from_rank; - int xmit_to_rank; - - if (nbr_ic) nbr_lane|=inner_bit; - - assert (sx == nbr_ox); - - if(nbr_proc){ - grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); - grid->SendToRecvFrom((void *)send_buf_extract_mpi, - xmit_to_rank, - (void *)recv_buf_extract_mpi, - recv_from_rank, - bytes); - acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); - xbytes+=bytes; - - // grid->Barrier(); - tcomms+=usecond(); - rpointers[i] = &recv_buf_extract[i][0]; - } else { - rpointers[i] = &send_buf_extract[nbr_lane][0]; - } - - } - tscatter-=usecond(); - Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - tscatter+=usecond(); - - } - if(Cshift_verbose){ - std::cout << GridLogPerformance << " Cshift (s) copy "< recv_buf; send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); +#ifndef ACCELERATOR_AWARE_MPI + static hostVector hsend_buf; + static hostVector hrecv_buf; + hsend_buf.resize(buffer_size*2*depth); + hrecv_buf.resize(buffer_size*2*depth); +#endif std::vector fwd_req; std::vector bwd_req; @@ -495,9 +501,17 @@ public: t_gather+=usecond()-t; t=usecond(); +#ifdef ACCELERATOR_AWARE_MPI grid->SendToRecvFromBegin(fwd_req, (void *)&send_buf[d*buffer_size], xmit_to_rank, (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); +#else + acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes); + grid->SendToRecvFromBegin(fwd_req, + (void *)&hsend_buf[d*buffer_size], xmit_to_rank, + (void *)&hrecv_buf[d*buffer_size], 
recv_from_rank, bytes, tag); + acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes); +#endif t_comms+=usecond()-t; } for ( int d=0;d < depth ; d ++ ) { @@ -508,9 +522,17 @@ public: t_gather+= usecond() - t; t=usecond(); +#ifdef ACCELERATOR_AWARE_MPI grid->SendToRecvFromBegin(bwd_req, (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); +#else + acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes); + grid->SendToRecvFromBegin(bwd_req, + (void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank, + (void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); + acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes); +#endif t_comms+=usecond()-t; } diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 605bdcec..1c6571e1 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -484,6 +484,12 @@ public: this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); accelerator_barrier(); +#ifdef NVLINK_GET + #warning "NVLINK_GET" + this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his + // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check + // Or issue barrier AFTER the DMA is running +#endif } }; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 1d0dfb61..09d09afb 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -504,7 +504,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { 
- acceleratorFenceComputeStream(); + // acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -517,7 +517,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { - // dependent on result of merge + // // dependent on result of merge acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;} diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1142891a..2a666a04 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -363,12 +363,16 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // std::cout << "Communicate Begin "<Barrier(); FlightRecorder::StepLog("Communicate begin"); // All GPU kernel tasks must complete // accelerator_barrier(); // All kernels should ALREADY be complete // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. 
for(int i=0;iBarrier(); _grid->StencilSendToRecvFromPrepare(MpiReqs, Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, @@ -376,8 +380,15 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } + // std::cout << "Communicate PollDtoH "<Barrier(); + _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/ + // std::cout << "Communicate CopySynch "<Barrier(); acceleratorCopySynchronise(); + // Starts intranode for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, @@ -395,7 +406,14 @@ public: void CommunicateComplete(std::vector > &reqs) { + // std::cout << "Communicate Complete "<Barrier(); FlightRecorder::StepLog("Start communicate complete"); + // std::cout << "Communicate Complete PollIRecv "<Barrier(); + _grid->StencilSendToRecvFromPollIRecv(MpiReqs); + // std::cout << "Communicate Complete Complete "<Barrier(); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); @@ -483,6 +501,9 @@ public: void HaloGather(const Lattice &source,compressor &compress) { // accelerator_barrier(); + ////////////////////////////////// + // I will overwrite my send buffers + ////////////////////////////////// _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -496,7 +517,12 @@ public: HaloGatherDir(source,compress,point,face_idx); } accelerator_barrier(); // All my local gathers are complete - // _grid->StencilBarrier();// Synch shared memory on a single nodes +#ifdef NVLINK_GET + #warning "NVLINK_GET" + _grid->StencilBarrier(); // He can now get mu local gather, I can get his + // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check + // Or issue barrier AFTER the DMA is running +#endif face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); } @@ -535,6 +561,7 @@ 
public: coalescedWrite(to[j] ,coalescedRead(from [j])); }); acceleratorFenceComputeStream(); + // Also fenced in WilsonKernels } } @@ -663,7 +690,6 @@ public: } } } - std::cout << "BuildSurfaceList size is "< surface_list_host(surface_list_size); int32_t ss=0; @@ -683,6 +709,7 @@ public: } } acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int)); + std::cout << GridLogMessage<<"BuildSurfaceList size is "<wait(); } -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} -inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} + +/////// +// Asynch event interface +/////// +typedef sycl::event acceleratorEvent_t; + +inline void acceleratorEventWait(acceleratorEvent_t ev) +{ + ev.wait(); +} + +inline int acceleratorEventIsComplete(acceleratorEvent_t ev) +{ + return (ev.get_info() == sycl::info::event_command_status::complete); +} + +inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} +inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } + +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { 
theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) @@ -358,8 +375,10 @@ inline int acceleratorIsCommunicable(void *ptr) else return 0; #endif return 1; + } + #endif ////////////////////////////////////////////// @@ -492,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, 
hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream); } -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream); } inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); }; @@ -564,9 +583,9 @@ inline void acceleratorMem(void) accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } @@ -649,15 +668,15 @@ accelerator_inline void acceleratorFence(void) return; } -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) +inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes) { acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); acceleratorCopySynchronise(); } -template void acceleratorPut(T& dev,T&host) +template void acceleratorPut(T& dev,const T&host) { - acceleratorCopyToDevice(&host,&dev,sizeof(T)); + acceleratorCopyToDevice((void *)&host,&dev,sizeof(T)); } template T acceleratorGet(T& dev) { diff 
--git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 6887134d..cdb4fa62 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -73,9 +73,9 @@ Author: paboyle #define thread_critical DO_PRAGMA(omp critical) #ifdef GRID_OMP -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { - uint64_t *ufrom = (uint64_t *)from; + const uint64_t *ufrom = (const uint64_t *)from; uint64_t *uto = (uint64_t *)to; assert(bytes%8==0); uint64_t words=bytes/8; @@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes) }); } #else -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { bcopy(from,to,bytes); } diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 1424667e..feb44645 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv) Grid_default_latt, Grid_default_mpi); - + if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){ + std::cout << GridLogMessage <<" Enabling flight recorder " <=2*1024*1024*1024LL ){ - //std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<> Ls; diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index e400138b..4b50121e 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -492,17 +492,18 @@ public: } FGrid->Barrier(); double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = 500; - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); 
- for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); - double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t ni = 100; + uint64_t no = 50; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); @@ -814,20 +818,21 @@ public: double mf_hi, mf_lo, mf_err; timestat.statistics(t_time); - mf_hi = flops/timestat.min; - mf_lo = flops/timestat.max; + mf_hi = flops/timestat.min*ni; + mf_lo = flops/timestat.max*ni; mf_err= flops/timestat.min * timestat.err/timestat.mean; - mflops = flops/timestat.mean; + mflops = flops/timestat.mean*ni; mflops_all.push_back(mflops); if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops L_list({8,12,16,24}); + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector clover; diff --git a/systems/Aurora/benchmarks/bench16.pbs b/systems/Aurora/benchmarks/bench16.pbs new file mode 100644 index 00000000..b3d3a461 --- /dev/null +++ b/systems/Aurora/benchmarks/bench16.pbs @@ -0,0 +1,74 @@ +#!/bin/bash + +##PBS -q LatticeQCD_aesp_CNDA +#PBS -q debug-scaling +##PBS -q prod +#PBS -l select=16 +#PBS -l walltime=00:20:00 +#PBS -A LatticeQCD_aesp_CNDA + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cp $PBS_NODEFILE nodefile + +export OMP_NUM_THREADS=4 +export MPICH_OFI_NIC_POLICY=GPU + +#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 + +# +# Local vol 16.16.16.32 +# + +LX=16 +LY=16 +LZ=16 +LT=32 + +NX=2 +NY=2 +NZ=4 +NT=1 + +GX=2 +GY=2 +GZ=1 +GT=3 + +PX=$((NX * GX )) +PY=$((NY * GY )) +PZ=$((NZ * GZ )) +PT=$((NT * GT )) + +VX=$((PX * LX )) +VY=$((PY * LY )) +VZ=$((PZ * LZ )) +VT=$((PT * LT )) + +NP=$((PX*PY*PZ*PT)) +VOL=${VX}.${VY}.${VZ}.${VT} +AT=8 +MPI=${PX}.${PY}.${PZ}.${PT} + +CMD="mpiexec -np $NP -ppn 12 -envall \ + ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \ + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap " + +echo VOL $VOL +echo MPI $MPI +echo NPROC $NP +echo $CMD +$CMD + diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh index a622ba3e..f8f0ae96 100755 --- a/systems/Aurora/benchmarks/gpu_tile.sh +++ b/systems/Aurora/benchmarks/gpu_tile.sh @@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 @@ -30,8 +30,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A if [ $PALS_RANKID = "0" ] then - numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" -# numactl -p $NUMAP -N $NUMAP "$@" +# numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" + numactl -p $NUMAP -N $NUMAP "$@" else numactl -p $NUMAP -N $NUMAP "$@" fi diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 6e5512ff..08b77f4f 100644 --- a/systems/Aurora/config-command +++ 
b/systems/Aurora/config-command @@ -1,18 +1,19 @@ #Ahead of time compile for PVC -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/" +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC" #JIT compile #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " -../../configure \ +../configure \ --enable-simd=GPU \ --enable-reduction=grid \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ --enable-debug \ + --prefix $HOME/gpt-install \ --disable-gparity \ --disable-fermion-reps \ --with-lime=$CLIME \ diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt new file mode 100644 index 00000000..7191b4ff --- /dev/null +++ b/systems/WorkArounds.txt @@ -0,0 +1,206 
@@ +The purpose of this file is to collate all non-obvious known magic shell variables +and compiler flags required for either correctness or performance on various systems. + +A repository of work-arounds. + +Contents: +1. Interconnect + MPI +2. Compilation +3. Profiling + +************************ +* 1. INTERCONNECT + MPI +************************ + +-------------------------------------------------------------------- +MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O +-------------------------------------------------------------------- +export OMPI_MCA_io=romio321 + +-------------------------------------- +ROMIO fails with > 2GB per node read (32 bit issue) +-------------------------------------- + +Use later MPICH + +https://github.com/paboyle/Grid/issues/381 + +https://github.com/pmodels/mpich/commit/3a479ab0 + +-------------------------------------------------------------------- +Slingshot: Frontier and Perlmutter libfabric slow down +and physical memory fragmentation +-------------------------------------------------------------------- +export FI_MR_CACHE_MONITOR=disabled +or +export FI_MR_CACHE_MONITOR=kdreg2 + +-------------------------------------------------------------------- +Perlmutter +-------------------------------------------------------------------- + +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_IPC_ENABLED=1 +export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 +export MPICH_GPU_NO_ASYNC_MEMCPY=0 + +-------------------------------------------------------------------- +Frontier/LumiG +-------------------------------------------------------------------- + +Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU + +cat << EOF > select_gpu +#!/bin/bash +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export GPU_MAP=(0 1 2 3 7 6 5 4) +export NUMA_MAP=(3 3 1 1 2 2 0 0) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +export
HIP_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec numactl -m \$NUMA -N \$NUMA \$* +EOF +chmod +x ./select_gpu + +srun ./select_gpu BINARY + + +-------------------------------------------------------------------- +Mellanox performance with A100 GPU (Tursa, Booster, Leonardo) +-------------------------------------------------------------------- +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes + +-------------------------------------------------------------------- +Mellanox + A100 correctness (Tursa, Booster, Leonardo) +-------------------------------------------------------------------- +export UCX_MEMTYPE_CACHE=n + +-------------------------------------------------------------------- +MPICH/Aurora/PVC correctness and performance +-------------------------------------------------------------------- + +https://github.com/pmodels/mpich/issues/7302 + +--enable-cuda-aware-mpi=no +--enable-unified=no + +Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI +Do not use SVM + +Ideally use MPICH with fix to issue 7302: + +https://github.com/pmodels/mpich/pull/7312 + +Ideally: +MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic + +Alternatives: +export MPIR_CVAR_NOLOCAL=1 +export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000 + +-------------------------------------------------------------------- +MPICH/Aurora/PVC correctness and performance +-------------------------------------------------------------------- + +Broken: +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +This gives good performance without requiring +--enable-cuda-aware-mpi=no + +But is an open issue reported by James Osborn +https://github.com/pmodels/mpich/issues/7139 + +Possibly resolved but unclear if in the installed software yet. + +************************ +* 2.
COMPILATION +************************ + +-------------------------------------------------------------------- +G++ compiler breakage / graveyard +-------------------------------------------------------------------- + +9.3.0, 10.3.1, +https://github.com/paboyle/Grid/issues/290 +https://github.com/paboyle/Grid/issues/264 + +Working (-) Broken (X): + +4.9.0 - +4.9.1 - +5.1.0 X +5.2.0 X +5.3.0 X +5.4.0 X +6.1.0 X +6.2.0 X +6.3.0 - +7.1.0 - +8.0.0 (HEAD) - + +https://github.com/paboyle/Grid/issues/100 + +-------------------------------------------------------------------- +AMD GPU nodes : +-------------------------------------------------------------------- + +multiple ROCM versions broken; use 5.3.0 +manifests itself as wrong results in fp32 + +https://github.com/paboyle/Grid/issues/464 + +-------------------------------------------------------------------- +Aurora/PVC +-------------------------------------------------------------------- + +SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB) +SYCL slow link and relocatable code issues (Christoph Lehner) +Opt large register file required for good performance in fp64 + + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC" + +-------------------------------------------------------------------- +Aurora/PVC useful extra options +-------------------------------------------------------------------- + +Host only sanitizer: +-Xarch_host -fsanitize=leak +-Xarch_host -fsanitize=address + +Deterministic MPI reduction: +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export 
MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + + + +************************ +* 3. Visual profile tools +************************ + +-------------------------------------------------------------------- +Frontier/rocprof +-------------------------------------------------------------------- + +-------------------------------------------------------------------- +Aurora/unitrace +-------------------------------------------------------------------- + + +-------------------------------------------------------------------- +Tursa/nsight-sys +-------------------------------------------------------------------- diff --git a/systems/sdcc-genoa/bench.slurm b/systems/sdcc-genoa/bench.slurm new file mode 100644 index 00000000..2c7f6c32 --- /dev/null +++ b/systems/sdcc-genoa/bench.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#for vol in 24.24.24.24 32.32.32.32 48.48.48.96 +for vol in 48.48.48.96 +do +srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr +done +#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr +done + diff --git a/systems/sdcc-genoa/bench2.slurm b/systems/sdcc-genoa/bench2.slurm new file mode 100644 index 00000000..be21c816 --- /dev/null +++ b/systems/sdcc-genoa/bench2.slurm @@ -0,0 +1,36 @@ +#!/bin/bash 
+#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=2 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=2 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +nodes=2 +mpi=1.1.1.2 + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr + +for vol in 64.64.64.128 +do +srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr +done + +done + diff --git a/systems/sdcc-genoa/config-command b/systems/sdcc-genoa/config-command new file mode 100644 index 00000000..d992e1da --- /dev/null +++ b/systems/sdcc-genoa/config-command @@ -0,0 +1,16 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX512 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=clang++ \ +MPICXX=mpicxx \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/sdcc-genoa/sourceme.sh b/systems/sdcc-genoa/sourceme.sh new file mode 100644 index 00000000..4f37888c --- /dev/null +++ b/systems/sdcc-genoa/sourceme.sh @@ -0,0 +1,4 @@ +source $HOME/spack/share/spack/setup-env.sh +spack load llvm@17.0.4 +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH +module load openmpi diff --git a/tests/debug/Test_general_coarse_pvdagm.cc b/tests/debug/Test_general_coarse_pvdagm.cc index 27c5807e..d382ea64 100644 --- a/tests/debug/Test_general_coarse_pvdagm.cc +++ b/tests/debug/Test_general_coarse_pvdagm.cc @@ -154,6 +154,8 
@@ public: // std::cout< PVdagM_t; typedef ShiftedPVdagMLinearOperator ShiftedPVdagM_t; PVdagM_t PVdagM(Ddwf,Dpv); - ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); + // ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355 + // ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246 + // ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183 + // ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145 + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134 + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations + // ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion + // ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters + // ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric + // ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ... with 24 vecs + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ... with 24 vecs and 2^4 blocking + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ... with 16 vecs and 2^4 blocking, sloppier + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking, looser coarse + // ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse + ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // // Run power method on HOA?? 
@@ -269,6 +293,7 @@ int main (int argc, char ** argv) // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp typedef Aggregation Subspace; Subspace AggregatesPD(Coarse5d,FGrid,cb); + /* AggregatesPD.CreateSubspaceChebyshev(RNG5, PVdagM, nbasis, @@ -278,6 +303,10 @@ int main (int argc, char ** argv) 200, 200, 0.0); + */ + AggregatesPD.CreateSubspaceGCR(RNG5, + PVdagM, + nbasis); LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d); LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD); @@ -334,12 +363,13 @@ int main (int argc, char ** argv) /////////////////////////////////////// std::cout< simple; NonHermitianLinearOperator LinOpCoarse(LittleDiracOpPV); - PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0e-8, 100, LinOpCoarse,simple,10,10); - L2PGCR.Level(2); + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10); + L2PGCR.Level(3); c_res=Zero(); L2PGCR(c_src,c_res); @@ -347,11 +377,12 @@ int main (int argc, char ** argv) // Fine grid smoother //////////////////////////////////////// std::cout< simple_fine; // NonHermitianLinearOperator LinOpSmooth(PVdagM); - PrecGeneralisedConjugateResidualNonHermitian SmootherGCR(0.01,10,ShiftedPVdagM,simple_fine,4,4); + PrecGeneralisedConjugateResidualNonHermitian SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16); + SmootherGCR.Level(2); LatticeFermionD f_src(FGrid); LatticeFermionD f_res(FGrid); @@ -364,12 +395,12 @@ int main (int argc, char ** argv) TwoLevelMG TwoLevelPrecon(AggregatesPD, PVdagM, - SmootherGCR, + simple_fine, SmootherGCR, LinOpCoarse, L2PGCR); - PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,8,8); + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16); L1PGCR.Level(1); f_res=Zero();