diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index ffcfe37a..d4f12f86 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -53,10 +53,11 @@ public: // Communicator should know nothing of the physics grid, only processor grid. //////////////////////////////////////////// int _Nprocessors; // How many in all - Coordinate _processors; // Which dimensions get relayed out over processors lanes. int _processor; // linear processor rank - Coordinate _processor_coor; // linear processor coordinate unsigned long _ndimension; + Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes. + Coordinate _processors; // Which dimensions get relayed out over processors lanes. + Coordinate _processor_coor; // linear processor coordinate static Grid_MPI_Comm communicator_world; Grid_MPI_Comm communicator; std::vector communicator_halo; @@ -97,8 +98,9 @@ public: int BossRank(void) ; int ThisRank(void) ; const Coordinate & ThisProcessorCoor(void) ; + const Coordinate & ShmGrid(void) { return _shm_processors; } ; const Coordinate & ProcessorGrid(void) ; - int ProcessorCount(void) ; + int ProcessorCount(void) ; //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid @@ -142,16 +144,16 @@ public: int bytes); double StencilSendToRecvFrom(void *xmit, - int xmit_to_rank, + int xmit_to_rank,int do_xmit, void *recv, - int recv_from_rank, + int recv_from_rank,int do_recv, int bytes,int dir); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int xmit_to_rank, + int xmit_to_rank,int do_xmit, void *recv, - int recv_from_rank, + int recv_from_rank,int do_recv, int bytes,int dir); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 7b3e8847..ecdf1e53 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) // Remap using the shared memory optimising routine // The remap creates a comm which must be freed //////////////////////////////////////////////////// - GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm); + GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors); InitFromMPICommunicator(processors,optimal_comm); SetCommunicator(optimal_comm); /////////////////////////////////////////////////// @@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); Coordinate parent_processor_coor(_ndimension,0); Coordinate parent_processors (_ndimension,1); - + Coordinate shm_processors (_ndimension,1); // Can make 5d grid from 4d etc... 
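Note on the widened interface: the extra integer flags on StencilSendToRecvFrom / StencilSendToRecvFromBegin let the stencil suppress the send and/or receive on legs that cross a Dirichlet boundary; existing call sites keep the old behaviour by passing 1 for both, as the updated Benchmark_comms.cc below does. A minimal usage sketch with the new signature (buffer and rank variable names are illustrative, and the request list is assumed to hold Grid's CommsRequest_t handles):

  std::vector<CommsRequest_t> requests;
  double dbytes =
    Grid.StencilSendToRecvFromBegin(requests,
                                    (void *)&xbuf[mu][0], xmit_to_rank,  1,  // do_xmit
                                    (void *)&rbuf[mu][0], recv_from_rank, 1, // do_recv
                                    bytes, mu);
  Grid.StencilSendToRecvFromComplete(requests, mu);
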
int pad = _ndimension-parent_ndimension; for(int d=0;d list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int dest, + int dest,int dox, void *recv, - int from, + int from,int dor, int bytes,int dir) { int ncomm =communicator_halo.size(); @@ -370,28 +372,32 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); - assert(shm!=NULL); - // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; - acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); + void *shm = (void *) this->ShmBufferTranslate(dest,recv); + assert(shm!=NULL); + // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); + } } - + if ( CommunicatorPolicy == CommunicatorPolicySequential ) { this->StencilSendToRecvFromComplete(list,dir); } diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index beb2cc97..a0f33ca4 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) : CartesianCommunicator(processors) { + _shm_processors = Coordinate(processors.size(),1); srank=0; SetCommunicator(communicator_world); } CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) { + _shm_processors = Coordinate(processors.size(),1); _processors = processors; _ndimension = processors.size(); assert(_ndimension>=1); _processor_coor.resize(_ndimension); @@ -111,18 +113,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest } double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, - int xmit_to_rank, + int xmit_to_rank,int dox, void *recv, - int recv_from_rank, + int recv_from_rank,int dor, int bytes, int dir) { return 2.0*bytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int xmit_to_rank, + int xmit_to_rank,int dox, void *recv, - int recv_from_rank, + int recv_from_rank,int dor, int bytes, int dir) { return 2.0*bytes; diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index f2d20a24..d55fbf3d 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -93,9 +93,10 @@ public: // Create an optimal reordered communicator that makes MPI_Cart_create get it right ////////////////////////////////////////////////////////////////////////////////////// static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD - static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian - static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian - static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian + // Turns MPI_COMM_WORLD into right layout for Cartesian + static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & 
optimal_comm,Coordinate &ShmDims); + static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); + static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims); /////////////////////////////////////////////////// // Provide shared memory facilities off comm world diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 795f3928..fe2f2d89 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2) } return log2size; } -void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { ////////////////////////////////////////////////////////////////////////////// // Look and see if it looks like an HPE 8600 based on hostname conventions @@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M gethostname(name,namelen); int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; - if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm); - else OptimalCommunicatorSharedMemory(processors,optimal_comm); + if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); + else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); } static inline int divides(int a,int b) { @@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD dim=(dim+1) %ndimension; } } -void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { //////////////////////////////////////////////////////////////// // Assert power of two shm_size. 
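The node-local decomposition computed by GetShmDims is now handed back through the new ShmDims/SHM reference argument and cached in the communicator as _shm_processors, so client code can size Dirichlet blocks to whole nodes. A small sketch of querying it after Grid_init (local variable names are illustrative; the commented alternative assumes the ShmGrid() accessor added to CartesianCommunicator above is reachable from a grid object):

  Coordinate mpi = GridDefaultMpi();           // global processor grid
  Coordinate shm(mpi.size(), 1);
  GlobalSharedMemory::GetShmDims(mpi, shm);    // ranks per node in each dimension
  std::cout << "Intra-node (shared-memory) decomposition " << shm << std::endl;

  // Once a communicator/grid exists the same information is cached, e.g.
  //   const Coordinate &shmgrid = UGrid->ShmGrid();
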
@@ -294,7 +294,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo Coordinate HyperCoor(ndimension); GetShmDims(WorldDims,ShmDims); - + SHM = ShmDims; + //////////////////////////////////////////////////////////////// // Establish torus of processes and nodes with sub-blockings //////////////////////////////////////////////////////////////// @@ -341,7 +342,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); assert(ierr==0); } -void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { //////////////////////////////////////////////////////////////// // Identify subblock of ranks on node spreading across dims @@ -353,6 +354,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); GetShmDims(WorldDims,ShmDims); + SHM=ShmDims; + //////////////////////////////////////////////////////////////// // Establish torus of processes and nodes with sub-blockings //////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc index 35663632..198a59d2 100644 --- a/Grid/communicator/SharedMemoryNone.cc +++ b/Grid/communicator/SharedMemoryNone.cc @@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) _ShmSetup=1; } -void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { optimal_comm = WorldComm; + SHM = Coordinate(processors.size(),1); } //////////////////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 80231bb4..affdae10 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -173,7 +173,12 @@ public: GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _M5,const ImplParams &p= ImplParams()); - + + void DirichletBlock(std::vector & block){ + Stencil.DirichletBlock(block); + StencilEven.DirichletBlock(block); + StencilOdd.DirichletBlock(block); + } // Constructors /* WilsonFermion5D(int simd, diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 246bdb36..930957d8 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -131,8 +131,11 @@ class CartesianStencilAccelerator { int _checkerboard; int _npoints; // Move to template param? 
int _osites; + int _dirichlet; StencilVector _directions; StencilVector _distances; + StencilVector _comms_send; + StencilVector _comms_recv; StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; @@ -226,6 +229,8 @@ public: void * recv_buf; Integer to_rank; Integer from_rank; + Integer do_send; + Integer do_recv; Integer bytes; }; struct Merge { @@ -255,7 +260,6 @@ public: void *recv_buf; }; - protected: GridBase * _grid; @@ -299,29 +303,6 @@ public: int u_comm_offset; int _unified_buffer_size; - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// - double commtime; - double mpi3synctime; - double mpi3synctime_g; - double shmmergetime; - double gathertime; - double gathermtime; - double halogtime; - double mergetime; - double decompresstime; - double comms_bytes; - double shm_bytes; - double splicetime; - double nosplicetime; - double calls; - std::vector comm_bytes_thr; - std::vector shm_bytes_thr; - std::vector comm_time_thr; - std::vector comm_enter_thr; - std::vector comm_leave_thr; - //////////////////////////////////////// // Stencil query //////////////////////////////////////// @@ -348,11 +329,12 @@ public: ////////////////////////////////////////// // Comms packet queue for asynch thread // Use OpenMP Tasks for cleaner ??? + // must be called *inside* parallel region ////////////////////////////////////////// + /* void CommunicateThreaded() { #ifdef GRID_OMP - // must be called in parallel region int mythread = omp_get_thread_num(); int nthreads = CartesianCommunicator::nCommThreads; #else @@ -361,65 +343,29 @@ public: #endif if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { - comm_enter_thr[mythread] = usecond(); for (int i = mythread; i < Packets.size(); i += nthreads) { uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += 2*Packets[i].bytes-bytes; // Send + Recv. - } - comm_leave_thr[mythread]= usecond(); - comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } - - void CollateThreads(void) - { - int nthreads = CartesianCommunicator::nCommThreads; - double first=0.0; - double last =0.0; - - for(int t=0;t 0.0) && ( t0 < first ) ) first = t0; // min time seen - - if ( t1 > last ) last = t1; // max time seen - - } - commtime+= last-first; - } + */ //////////////////////////////////////////////////////////////////////// // Non blocking send and receive. Necessarily parallel. //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); - commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comms_bytes+=bytes; - shm_bytes +=2*Packets[i].bytes-bytes; + _grid->StencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].bytes,i); } } @@ -428,7 +374,6 @@ public: for(int i=0;iStencilSendToRecvFromComplete(reqs[i],i); } - commtime+=usecond(); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. 
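For reference, the shape of the dox/dor gating that the Communicator_mpi3.cc hunk above introduces, reduced to a sketch: the real code also handles the intra-node shared-memory fast path, Stencil_force_mpi, and the sequential communicator policy, so this free function is only an outline under those simplifications.

  #include <mpi.h>
  #include <vector>

  // Sketch only: suppress either half of the exchange on Dirichlet-truncated legs.
  // Messages are tagged with the sender's rank, so the receive matches on tag == from.
  double StencilSendToRecvFromBegin_sketch(std::vector<MPI_Request> &list,
                                           void *xmit, int dest, int dox,
                                           void *recv, int from, int dor,
                                           int bytes, int my_rank, MPI_Comm comm)
  {
    double off_node_bytes = 0.0;
    if (dor) {                                     // post receive unless this leg is cut
      MPI_Request rrq;
      MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &rrq);
      list.push_back(rrq);
      off_node_bytes += bytes;
    }
    if (dox) {                                     // post send unless this leg is cut
      MPI_Request xrq;
      MPI_Isend(xmit, bytes, MPI_CHAR, dest, my_rank, comm, &xrq);
      list.push_back(xrq);
      off_node_bytes += bytes;
    }
    return off_node_bytes;
  }
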
@@ -436,28 +381,27 @@ public: void Communicate(void) { if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ - thread_region { - // must be called in parallel region - int mythread = thread_num(); - int maxthreads= thread_max(); - int nthreads = CartesianCommunicator::nCommThreads; - assert(nthreads <= maxthreads); - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); - uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += Packets[i].bytes - bytes; - comm_time_thr[mythread] += usecond() - start; - } - } - } - } else { // Concurrent and non-threaded asynch calls to MPI + ///////////////////////////////////////////////////////// + // several way threaded on different communicators. + // Cannot combine with Dirichlet operators + // This scheme is needed on Intel Omnipath for best performance + // Deprecate once there are very few omnipath clusters + ///////////////////////////////////////////////////////// + int nthreads = CartesianCommunicator::nCommThreads; + int old = GridThread::GetThreads(); + GridThread::SetThreads(nthreads); + thread_for(i,Packets.size(),{ + _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].bytes,i); + }); + GridThread::SetThreads(old); + } else { + ///////////////////////////////////////////////////////// + // Concurrent and non-threaded asynch calls to MPI + ///////////////////////////////////////////////////////// std::vector > reqs; this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); @@ -499,31 +443,23 @@ public: sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); if ( sshift[0] == sshift[1] ) { if (splice_dim) { - splicetime-=usecond(); - auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx); + auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx,point); is_same_node = is_same_node && tmp; - splicetime+=usecond(); } else { - nosplicetime-=usecond(); - auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx); + auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx,point); is_same_node = is_same_node && tmp; - nosplicetime+=usecond(); } } else { if(splice_dim){ - splicetime-=usecond(); // if checkerboard is unfavourable take two passes // both with block stride loop iteration - auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx); - auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx); + auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx,point); + auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx,point); is_same_node = is_same_node && tmp1 && tmp2; - splicetime+=usecond(); } else { - nosplicetime-=usecond(); - auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx); - auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx); + auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx,point); + auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx,point); is_same_node = is_same_node && tmp1 && tmp2; - nosplicetime+=usecond(); } } } @@ -533,13 +469,10 @@ public: template void HaloGather(const Lattice &source,compressor &compress) { - 
mpi3synctime_g-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime_g+=usecond(); // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); - halogtime-=usecond(); u_comm_offset=0; @@ -553,7 +486,6 @@ public: assert(u_comm_offset==_unified_buffer_size); accelerator_barrier(); - halogtime+=usecond(); } ///////////////////////// @@ -568,7 +500,6 @@ public: Packets.resize(0); CopyReceiveBuffers.resize(0); CachedTransfers.resize(0); - calls++; } void AddCopy(void *from,void * to, Integer bytes) { @@ -622,12 +553,17 @@ public: CachedTransfers.push_back(obj); return 0; } - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + void AddPacket(void *xmit,void * rcv, + Integer to, Integer do_send, + Integer from, Integer do_recv, + Integer bytes){ Packet p; p.send_buf = xmit; p.recv_buf = rcv; p.to_rank = to; p.from_rank= from; + p.do_send = do_send; + p.do_recv = do_recv; p.bytes = bytes; Packets.push_back(p); } @@ -651,19 +587,13 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); } template void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { - - mergetime-=usecond(); for(int i=0;i &dirichlet_block) + { + this->_dirichlet = 1; + for(int ii=0;ii_npoints;ii++){ + int dimension = this->_directions[ii]; + int displacement = this->_distances[ii]; + int shift = displacement; + int gd = _grid->_gdimensions[dimension]; + int fd = _grid->_fdimensions[dimension]; + int pd = _grid->_processors [dimension]; + int ld = gd/pd; + int pc = _grid->_processor_coor[dimension]; + /////////////////////////////////////////// + // Figure out dirichlet send and receive + // on this leg of stencil. 
+ /////////////////////////////////////////// + int comm_dim = _grid->_processors[dimension] >1 ; + int block = dirichlet_block[dimension]; + this->_comms_send[ii] = comm_dim; + this->_comms_recv[ii] = comm_dim; + if ( block ) { + assert(abs(displacement) < ld ); + + if( displacement > 0 ) { + // High side, low side + // | <--B--->| + // | | | + // noR + // noS + if ( (ld*(pc+1) ) % block == 0 ) this->_comms_recv[ii] = 0; + if ( ( ld*pc ) % block == 0 ) this->_comms_send[ii] = 0; + } else { + // High side, low side + // | <--B--->| + // | | | + // noS + // noR + if ( (ld*(pc+1) ) % block == 0 ) this->_comms_send[ii] = 0; + if ( ( ld*pc ) % block == 0 ) this->_comms_recv[ii] = 0; + } + } + } + } CartesianStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances, Parameters p) - : shm_bytes_thr(npoints), - comm_bytes_thr(npoints), - comm_enter_thr(npoints), - comm_leave_thr(npoints), - comm_time_thr(npoints) { + this->_dirichlet = 0; face_table_computed=0; _grid = grid; this->parameters=p; @@ -745,6 +711,8 @@ public: this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels this->_directions = StencilVector(directions); this->_distances = StencilVector(distances); + this->_comms_send.resize(npoints); + this->_comms_recv.resize(npoints); this->same_node.resize(npoints); _unified_buffer_size=0; @@ -763,24 +731,27 @@ public: int displacement = distances[i]; int shift = displacement; + int gd = _grid->_gdimensions[dimension]; int fd = _grid->_fdimensions[dimension]; + int pd = _grid->_processors [dimension]; + int ld = gd/pd; int rd = _grid->_rdimensions[dimension]; + int pc = _grid->_processor_coor[dimension]; this->_permute_type[point]=_grid->PermuteType(dimension); this->_checkerboard = checkerboard; - ////////////////////////// - // the permute type - ////////////////////////// int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); int rotate_dim = _grid->_simd_layout[dimension]>2; + this->_comms_send[ii] = comm_dim; + this->_comms_recv[ii] = comm_dim; + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported int sshift[2]; - ////////////////////////// // Underlying approach. 
For each local site build // up a table containing the npoint "neighbours" and whether they @@ -881,6 +852,7 @@ public: GridBase *grid=_grid; const int Nsimd = grid->Nsimd(); + int comms_recv = this->_comms_recv[point]; int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -937,7 +909,9 @@ public: if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { wraparound = 1; } - if (!offnode) { + + // Wrap locally dirichlet support case OR node local + if ( (offnode==0) || (comms_recv==0) ) { int permute_slice=0; CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); @@ -1054,11 +1028,14 @@ public: } template - int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; + int comms_send = this->_comms_send[point] ; + int comms_recv = this->_comms_recv[point] ; + assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1124,10 +1101,10 @@ public: //////////////////////////////////////////////////////// // Gather locally //////////////////////////////////////////////////////// - gathertime-=usecond(); assert(send_buf!=NULL); - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; - gathertime+=usecond(); + if ( comms_send ) + Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); + face_idx++; int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now @@ -1138,12 +1115,12 @@ public: /////////////////////////////////////////////////////////// AddPacket((void *)&send_buf[u_comm_offset], (void *)&recv_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, + xmit_to_rank, comms_send, + recv_from_rank, comms_recv, bytes); } - if ( compress.DecompressionStep() ) { + if ( compress.DecompressionStep() && comms_recv ) { AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], words,Decompressions); @@ -1155,11 +1132,15 @@ public: } template - int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx,int point) { const int Nsimd = _grid->Nsimd(); const int maxl =2;// max layout in a direction + + int comms_send = this->_comms_send[point] ; + int comms_recv = this->_comms_recv[point] ; + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; @@ -1224,12 +1205,11 @@ public: &face_table[face_idx][0], face_table[face_idx].size()*sizeof(face_table_host[0])); } - gathermtime-=usecond(); - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + if ( comms_send ) + Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; - gathermtime+=usecond(); //spointers[0] -- low //spointers[1] -- high @@ -1260,7 +1240,10 @@ public: int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + 
AddPacket((void *)sp,(void *)rp, + xmit_to_rank,comms_send, + recv_from_rank,comms_recv, + bytes); } } else { @@ -1270,7 +1253,9 @@ public: } } - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + if ( comms_recv ) { + AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + } u_comm_offset +=buffer_size; } diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index ccffb564..21e048f4 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -217,9 +217,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu][0], - recv_from_rank, + recv_from_rank,1, bytes,mu); comm_proc = mpi_layout[mu]-1; @@ -228,9 +228,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu+4][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu+4][0], - recv_from_rank, + recv_from_rank,1, bytes,mu+4); } @@ -309,9 +309,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu][0], - recv_from_rank, + recv_from_rank,1, bytes,mu); Grid.StencilSendToRecvFromComplete(requests,mu); requests.resize(0); @@ -322,9 +322,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu+4][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu+4][0], - recv_from_rank, + recv_from_rank,1, bytes,mu+4); Grid.StencilSendToRecvFromComplete(requests,mu+4); requests.resize(0); @@ -411,8 +411,8 @@ int main (int argc, char ** argv) Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } int tid = omp_get_thread_num(); - tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, bytes,tid); + tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, + (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); thread_critical { dbytes+=tbytes; } } diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 7fa37fb6..6896bddf 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -32,18 +32,112 @@ using namespace std; using namespace Grid; -template -struct scal { - d internal; +//////////////////////// +/// Move to domains //// +//////////////////////// + +struct DomainDecomposition +{ + Coordinate Block; + + DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);}; + + template + void ProjectDomain(Field &f,Integer domain) + { + GridBase *grid = f.Grid(); + int dims = grid->Nd(); + int isDWF= (dims==Nd+1); + assert((dims==Nd)||(dims==Nd+1)); + + Field zz(grid); zz = Zero(); + LatticeInteger coor(grid); + LatticeInteger domaincoor(grid); + LatticeInteger mask(grid); mask = Integer(1); + LatticeInteger zi(grid); zi = Integer(0); + for(int d=0;d +struct DirichletFilter: public MomentumFilterBase +{ + Coordinate Block; + + DirichletFilter(const Coordinate &_Block): Block(_Block) {} + // Edge detect using domain projectors + void applyFilter (MomentaField &U) const override + { + DomainDecomposition Domains(Block); + GridBase *grid = U.Grid(); + LatticeInteger coor(grid); + LatticeInteger face(grid); + LatticeInteger one(grid); one = 1; + LatticeInteger zero(grid); zero = 0; + LatticeInteger omega(grid); + LatticeInteger omegabar(grid); + LatticeInteger 
tmp(grid); + + omega=one; Domains.ProjectDomain(omega,0); + omegabar=one; Domains.ProjectDomain(omegabar,1); + + LatticeInteger nface(grid); nface=Zero(); + + MomentaField projected(grid); projected=Zero(); + typedef decltype(PeekIndex(U,0)) MomentaLinkField; + MomentaLinkField Umu(grid); + MomentaLinkField zz(grid); zz=Zero(); + + int dims = grid->Nd(); + Coordinate Global=grid->GlobalDimensions(); + assert(dims==Nd); + + for(int mu=0;mu(U,mu); + + // Upper face + tmp = Cshift(omegabar,mu,1); + tmp = tmp + omega; + face = where(tmp == Integer(2),one,zero ); + + tmp = Cshift(omega,mu,1); + tmp = tmp + omegabar; + face = where(tmp == Integer(2),one,face ); + + Umu = where(face,zz,Umu); + + PokeIndex(U, Umu, mu); + } + } + } +}; + + + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT +}; + +void Benchmark(int Ls, std::vector Dirichlet); int main (int argc, char ** argv) { @@ -52,24 +146,48 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); - Coordinate latt4 = GridDefaultLatt(); int Ls=16; - for(int i=0;i> Ls; } - + } + std::vector Dirichlet(5,0); + Benchmark(Ls,Dirichlet); + Coordinate latt4 = GridDefaultLatt(); + Coordinate mpi = GridDefaultMpi(); + Coordinate shm; + GlobalSharedMemory::GetShmDims(mpi,shm); + /* + Dirichlet = std::vector({0, + latt4[0]/mpi[0] * shm[0], + latt4[1]/mpi[1] * shm[1], + latt4[2]/mpi[2] * shm[2], + latt4[3]/mpi[3] * shm[3]}); + */ + Dirichlet = std::vector({0, + latt4[0]/mpi[0] , + latt4[1]/mpi[1] , + latt4[2]/mpi[2] , + latt4[3]/mpi[3] }); + + std::cout << " Dirichlet block "<< Dirichlet<< std::endl; + Benchmark(Ls,Dirichlet); + Grid_finalize(); + exit(0); +} +void Benchmark(int Ls, std::vector Dirichlet) +{ + Coordinate latt4 = GridDefaultLatt(); GridLogLayout(); long unsigned int single_site_flops = 8*Nc*(7+16*Nc); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - std::cout << GridLogMessage << "Making s innermost grids"<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; -#if 0 - Umu=1.0; - for(int mu=0;mu(Umu,mu); - // if (mu !=2 ) ttmp = 0; - // ttmp = ttmp* pow(10.0,mu); - PokeIndex(Umu,ttmp,mu); - } - std::cout << GridLogMessage << "Forced to diagonal " << std::endl; -#endif + //////////////////////////////////// + // Apply BCs + //////////////////////////////////// + std::cout << GridLogMessage << "Applying BCs " << std::endl; + Coordinate Block(4); + for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; + + std::cout << GridLogMessage << "Dirichlet Block " << Block<< std::endl; + DirichletFilter Filter(Block); + Filter.applyFilter(Umu); + //////////////////////////////////// // Naive wilson implementation //////////////////////////////////// @@ -191,11 +296,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); - Dw.ZeroCounters(); Dw.Dhop(src,result,0); std::cout<1.0e-4) ) { - /* - std::cout << "RESULT\n " << result<Barrier(); exit(-1); } assert (norm2(err)< 1.0e-4 ); - Dw.Report(); } if (1) @@ -294,13 +390,14 @@ int main (int argc, char ** argv) std::cout<1.0e-4)){ -/* - std::cout<< "DAG RESULT\n " 
< 1.0e-4 ) { + std::cout << "Error vector is\n" <Barrier(); Dw.DhopEO(src_o,r_e,DaggerNo); double t0=usecond(); @@ -352,7 +448,6 @@ int main (int argc, char ** argv) std::cout<1.0e-4)){ - /* - std::cout<< "Deo RESULT\n " < develop) uncommited changes +Current Grid git commit hash=da06d15f73184ceb15d66d4e7e702b02fed7b940: (HEAD -> feature/dirichlet, develop) uncommited changes Grid : Message : ================================================ Grid : Message : MPI is initialised and logging filters activated @@ -124,122 +136,102 @@ Grid : Message : ================================================ Grid : Message : Requested 2147483648 byte stencil comms buffers Grid : Message : MemoryManager Cache 34004218675 bytes Grid : Message : MemoryManager::Init() setting up -Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory Grid : Message : MemoryManager::Init() Using cudaMalloc -Grid : Message : 1.198523 s : Grid Layout -Grid : Message : 1.198530 s : Global lattice size : 64 64 64 64 -Grid : Message : 1.198534 s : OpenMP threads : 4 -Grid : Message : 1.198535 s : MPI tasks : 2 2 2 2 -Grid : Message : 1.397615 s : Making s innermost grids -Grid : Message : 1.441828 s : Initialising 4d RNG -Grid : Message : 1.547973 s : Intialising parallel RNG with unique string 'The 4D RNG' -Grid : Message : 1.547998 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 -Grid : Message : 1.954777 s : Initialising 5d RNG -Grid : Message : 3.633825 s : Intialising parallel RNG with unique string 'The 5D RNG' -Grid : Message : 3.633869 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a -Grid : Message : 12.162710 s : Initialised RNGs -Grid : Message : 15.882520 s : Drawing gauge field -Grid : Message : 15.816362 s : Random gauge initialised -Grid : Message : 17.279671 s : Setting up Cshift based reference -Grid : Message : 26.331426 s : ***************************************************************** -Grid : Message : 26.331452 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm -Grid : Message : 26.331454 s : ***************************************************************** -Grid : Message : 26.331456 s : ***************************************************************** -Grid : Message : 26.331458 s : * Benchmarking DomainWallFermionR::Dhop -Grid : Message : 26.331459 s : * Vectorising space-time by 8 -Grid : Message : 26.331463 s : * VComplexF size is 64 B -Grid : Message : 26.331465 s : * SINGLE precision -Grid : Message : 26.331467 s : * Using Overlapped Comms/Compute -Grid : Message : 26.331468 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 26.331469 s : ***************************************************************** -Grid : Message : 28.413717 s : Called warmup -Grid : Message : 56.418423 s : Called Dw 3000 times in 2.80047e+07 us -Grid : Message : 56.418476 s : mflop/s = 3.79581e+07 -Grid : Message : 56.418479 s : mflop/s per rank = 2.37238e+06 -Grid : Message : 56.418481 s : mflop/s per node = 9.48953e+06 -Grid : Message : 56.418483 s : RF GiB/s (base 2) = 77130 -Grid : Message : 56.418485 s : mem GiB/s (base 2) = 48206.3 -Grid : Message : 56.422076 s : norm diff 1.03481e-13 -Grid : Message : 56.456894 s : #### Dhop calls report -Grid : Message : 56.456899 s : WilsonFermion5D Number of DhopEO Calls : 6002 -Grid : 
Message : 56.456903 s : WilsonFermion5D TotalTime /Calls : 4710.93 us -Grid : Message : 56.456905 s : WilsonFermion5D CommTime /Calls : 3196.15 us -Grid : Message : 56.456908 s : WilsonFermion5D FaceTime /Calls : 494.392 us -Grid : Message : 56.456910 s : WilsonFermion5D ComputeTime1/Calls : 44.4107 us -Grid : Message : 56.456912 s : WilsonFermion5D ComputeTime2/Calls : 1037.75 us -Grid : Message : 56.456921 s : Average mflops/s per call : 3.55691e+09 -Grid : Message : 56.456925 s : Average mflops/s per call per rank : 2.22307e+08 -Grid : Message : 56.456928 s : Average mflops/s per call per node : 8.89228e+08 -Grid : Message : 56.456930 s : Average mflops/s per call (full) : 3.82915e+07 -Grid : Message : 56.456933 s : Average mflops/s per call per rank (full): 2.39322e+06 -Grid : Message : 56.456952 s : Average mflops/s per call per node (full): 9.57287e+06 -Grid : Message : 56.456954 s : WilsonFermion5D Stencil -Grid : Message : 56.457016 s : Stencil calls 3001 -Grid : Message : 56.457022 s : Stencil halogtime 0 -Grid : Message : 56.457024 s : Stencil gathertime 55.9154 -Grid : Message : 56.457026 s : Stencil gathermtime 20.1073 -Grid : Message : 56.457028 s : Stencil mergetime 18.5585 -Grid : Message : 56.457030 s : Stencil decompresstime 0.0639787 -Grid : Message : 56.457032 s : Stencil comms_bytes 4.02653e+08 -Grid : Message : 56.457034 s : Stencil commtime 6379.93 -Grid : Message : 56.457036 s : Stencil 63.1124 GB/s per rank -Grid : Message : 56.457038 s : Stencil 252.45 GB/s per node -Grid : Message : 56.457040 s : WilsonFermion5D StencilEven -Grid : Message : 56.457048 s : WilsonFermion5D StencilOdd -Grid : Message : 56.457062 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 56.457065 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 56.457066 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 79.259261 s : Compare to naive wilson implementation Dag to verify correctness -Grid : Message : 79.259287 s : Called DwDag -Grid : Message : 79.259288 s : norm dag result 12.0421 -Grid : Message : 79.271740 s : norm dag ref 12.0421 -Grid : Message : 79.287759 s : norm dag diff 7.63236e-14 -Grid : Message : 79.328100 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec -Grid : Message : 79.955951 s : src_e0.499997 -Grid : Message : 80.633620 s : src_o0.500003 -Grid : Message : 80.164163 s : ********************************************************* -Grid : Message : 80.164168 s : * Benchmarking DomainWallFermionF::DhopEO -Grid : Message : 80.164170 s : * Vectorising space-time by 8 -Grid : Message : 80.164172 s : * SINGLE precision -Grid : Message : 80.164174 s : * Using Overlapped Comms/Compute -Grid : Message : 80.164177 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 80.164178 s : ********************************************************* -Grid : Message : 93.797635 s : Deo mflop/s = 3.93231e+07 -Grid : Message : 93.797670 s : Deo mflop/s per rank 2.45769e+06 -Grid : Message : 93.797672 s : Deo mflop/s per node 9.83077e+06 -Grid : Message : 93.797674 s : #### Dhop calls report -Grid : Message : 93.797675 s : WilsonFermion5D Number of DhopEO Calls : 3001 -Grid : Message : 93.797677 s : WilsonFermion5D TotalTime /Calls : 4542.83 us -Grid : Message : 93.797679 s : WilsonFermion5D CommTime /Calls : 2978.97 us -Grid : Message : 93.797681 s : WilsonFermion5D FaceTime /Calls : 602.287 us -Grid : Message : 93.797683 s : WilsonFermion5D ComputeTime1/Calls : 67.1416 us -Grid : Message : 93.797685 s : WilsonFermion5D ComputeTime2/Calls : 1004.07 us -Grid : Message : 
93.797713 s : Average mflops/s per call : 3.30731e+09 -Grid : Message : 93.797717 s : Average mflops/s per call per rank : 2.06707e+08 -Grid : Message : 93.797719 s : Average mflops/s per call per node : 8.26827e+08 -Grid : Message : 93.797721 s : Average mflops/s per call (full) : 3.97084e+07 -Grid : Message : 93.797727 s : Average mflops/s per call per rank (full): 2.48178e+06 -Grid : Message : 93.797732 s : Average mflops/s per call per node (full): 9.92711e+06 -Grid : Message : 93.797735 s : WilsonFermion5D Stencil -Grid : Message : 93.797746 s : WilsonFermion5D StencilEven -Grid : Message : 93.797758 s : WilsonFermion5D StencilOdd -Grid : Message : 93.797769 s : Stencil calls 3001 -Grid : Message : 93.797773 s : Stencil halogtime 0 -Grid : Message : 93.797776 s : Stencil gathertime 56.7458 -Grid : Message : 93.797780 s : Stencil gathermtime 22.6504 -Grid : Message : 93.797782 s : Stencil mergetime 21.1913 -Grid : Message : 93.797786 s : Stencil decompresstime 0.0556481 -Grid : Message : 93.797788 s : Stencil comms_bytes 2.01327e+08 -Grid : Message : 93.797791 s : Stencil commtime 2989.33 -Grid : Message : 93.797795 s : Stencil 67.3484 GB/s per rank -Grid : Message : 93.797798 s : Stencil 269.394 GB/s per node -Grid : Message : 93.797801 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 93.797803 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 93.797805 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 93.873429 s : r_e6.02111 -Grid : Message : 93.879931 s : r_o6.02102 -Grid : Message : 93.885912 s : res12.0421 -Grid : Message : 94.876555 s : norm diff 0 -Grid : Message : 95.485643 s : norm diff even 0 -Grid : Message : 95.581236 s : norm diff odd 0 +Grid : Message : 1.875883 s : Grid Layout +Grid : Message : 1.875893 s : Global lattice size : 64 64 64 64 +Grid : Message : 1.875897 s : OpenMP threads : 4 +Grid : Message : 1.875898 s : MPI tasks : 2 2 2 2 +Grid : Message : 1.993571 s : Initialising 4d RNG +Grid : Message : 2.881990 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 2.882370 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 2.495044 s : Initialising 5d RNG +Grid : Message : 4.120900 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 4.121350 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 15.268010 s : Drawing gauge field +Grid : Message : 16.234025 s : Random gauge initialised +Grid : Message : 16.234057 s : Applying BCs +Grid : Message : 16.365565 s : Setting up Cshift based reference +Grid : Message : 44.512418 s : ***************************************************************** +Grid : Message : 44.512448 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 44.512450 s : ***************************************************************** +Grid : Message : 44.512451 s : ***************************************************************** +Grid : Message : 44.512452 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 44.512453 s : * Vectorising space-time by 8 +Grid : Message : 44.512454 s : * VComplexF size is 64 B +Grid : Message : 44.512456 s : * SINGLE precision +Grid : Message : 44.512459 s : * Using Overlapped Comms/Compute +Grid : Message : 44.512460 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 44.512461 s : ***************************************************************** +Grid : Message : 46.389070 s : Called warmup +Grid : Message : 
49.211265 s : Called Dw 300 times in 2.82203e+06 us +Grid : Message : 49.211295 s : mflop/s = 3.76681e+07 +Grid : Message : 49.211297 s : mflop/s per rank = 2.35425e+06 +Grid : Message : 49.211299 s : mflop/s per node = 9.41702e+06 +Grid : Message : 49.211301 s : RF GiB/s (base 2) = 76540.6 +Grid : Message : 49.211308 s : mem GiB/s (base 2) = 47837.9 +Grid : Message : 49.214868 s : norm diff 1.06409e-13 +Grid : Message : 92.647781 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 92.647816 s : Called DwDag +Grid : Message : 92.647817 s : norm dag result 12.0421 +Grid : Message : 92.801806 s : norm dag ref 12.0421 +Grid : Message : 92.817724 s : norm dag diff 7.21921e-14 +Grid : Message : 92.858973 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 93.210378 s : src_e0.499997 +Grid : Message : 93.583286 s : src_o0.500003 +Grid : Message : 93.682468 s : ********************************************************* +Grid : Message : 93.682471 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 93.682472 s : * Vectorising space-time by 8 +Grid : Message : 93.682473 s : * SINGLE precision +Grid : Message : 93.682475 s : * Using Overlapped Comms/Compute +Grid : Message : 93.682476 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 93.682477 s : ********************************************************* +Grid : Message : 95.162342 s : Deo mflop/s = 3.92487e+07 +Grid : Message : 95.162387 s : Deo mflop/s per rank 2.45305e+06 +Grid : Message : 95.162389 s : Deo mflop/s per node 9.81219e+06 +Grid : Message : 95.232801 s : r_e6.02111 +Grid : Message : 95.240061 s : r_o6.02102 +Grid : Message : 95.245975 s : res12.0421 +Grid : Message : 95.833402 s : norm diff 0 +Grid : Message : 96.573829 s : norm diff even 0 +Grid : Message : 96.868272 s : norm diff odd 0 + Dirichlet block [0 64 64 32 32] +Grid : Message : 97.756909 s : Grid Layout +Grid : Message : 97.756911 s : Global lattice size : 64 64 64 64 +Grid : Message : 97.756921 s : OpenMP threads : 4 +Grid : Message : 97.756922 s : MPI tasks : 2 2 2 2 +Grid : Message : 97.897085 s : Initialising 4d RNG +Grid : Message : 97.965061 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 97.965097 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 98.367431 s : Initialising 5d RNG +Grid : Message : 99.752745 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 99.752790 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 111.290148 s : Drawing gauge field +Grid : Message : 112.349289 s : Random gauge initialised +Grid : Message : 112.349320 s : Applying BCs +Grid : Message : 113.948740 s : Setting up Cshift based reference +Grid : Message : 140.320415 s : ***************************************************************** +Grid : Message : 140.320443 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 140.320444 s : ***************************************************************** +Grid : Message : 140.320445 s : ***************************************************************** +Grid : Message : 140.320446 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 140.320447 s : * Vectorising space-time by 8 +Grid : Message : 140.320448 s : * VComplexF size is 64 B +Grid : Message : 140.320450 s : * SINGLE precision +Grid : Message : 140.320451 s : * Using Overlapped Comms/Compute +Grid : Message : 140.320452 s : * 
Using GENERIC Nc WilsonKernels +Grid : Message : 140.320453 s : ***************************************************************** +Grid : Message : 142.296150 s : Called warmup +Grid : Message : 144.397678 s : Called Dw 300 times in 2.36719e+06 us +Grid : Message : 144.397700 s : mflop/s = 4.49058e+07 +Grid : Message : 144.397702 s : mflop/s per rank = 2.80661e+06 +Grid : Message : 144.397704 s : mflop/s per node = 1.12265e+07 +Grid : Message : 144.397706 s : RF GiB/s (base 2) = 91247.6 +Grid : Message : 144.397708 s : mem GiB/s (base 2) = 57029.7 +Grid : Message : 144.401269 s : norm diff 9.78944e-14 +Grid : Message : 186.885460 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 186.885492 s : Called DwDag +Grid : Message : 186.885493 s : norm dag result 10.4157 +Grid : Message : 186.897154 s : norm dag ref 11.2266 +Grid : Message : 186.912538 s : norm dag diff 0.484633 diff --git a/systems/Tursa/dwf4.slurm b/systems/Tursa/dwf4.slurm index 65191398..5940ac05 100644 --- a/systems/Tursa/dwf4.slurm +++ b/systems/Tursa/dwf4.slurm @@ -1,14 +1,13 @@ #!/bin/bash #SBATCH -J dslash -#SBATCH -A tc002 -#SBATCH -t 2:20:00 -#SBATCH --nodelist=tu-c0r0n[00,03,06,09] +#SBATCH -A dp207 #SBATCH --exclusive #SBATCH --nodes=4 #SBATCH --ntasks=16 +#SBATCH --qos=standard #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=8 -#SBATCH --time=12:00:00 +#SBATCH --time=0:05:00 #SBATCH --partition=gpu #SBATCH --gres=gpu:4 #SBATCH --output=%x.%j.out
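
Putting the pieces together, the intended usage is roughly the following sketch. It is not code from the patch: the element type of the Dirichlet vector and the exact call site of DirichletBlock inside Benchmark() are not shown in the hunks above, and the grid, gauge-field and mass setup is elided.

  int Ls = 16;
  Coordinate latt4 = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();

  // Entry 0 is the fifth (s) direction: no Dirichlet wall there.
  // Blocks equal to the per-rank local volume cut every inter-rank leg.
  std::vector<int> Dirichlet(5, 0);
  for (int d = 0; d < 4; d++) Dirichlet[d+1] = latt4[d] / mpi[d];

  // ... make UGrid/UrbGrid/FGrid/FrbGrid, draw Umu, choose mass and M5 ...
  DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
  Dw.DirichletBlock(Dirichlet);   // forwards to Stencil, StencilEven, StencilOdd
  Dw.Dhop(src, result, 0);        // legs crossing a block boundary no longer communicate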