diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 3a7bbc44..c1025b59 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid); #endif enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ; +enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 }; class GridBLAS { public: @@ -97,7 +98,21 @@ public: gridblasInit=1; } } - + +#ifdef GRID_CUDA + cublasComputeType_t toDataType(GridBLASPrecision_t p) { + switch (p) { + case GridBLAS_PRECISION_16F: + return CUBLAS_COMPUTE_32F_FAST_16F; + case GridBLAS_PRECISION_16BF: + return CUBLAS_COMPUTE_32F_FAST_16BF; + case GridBLAS_PRECISION_TF32: + return CUBLAS_COMPUTE_32F_FAST_TF32; + default: + assert(0); + } + } +#endif // Force construct once GridBLAS() { Init(); }; ~GridBLAS() { }; @@ -138,8 +153,10 @@ public: deviceVector &Amk, // pointer list to matrices deviceVector &Bkn, ComplexD beta, - deviceVector &Cmn) + deviceVector &Cmn, + GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT) { + assert(precision == GridBLAS_PRECISION_DEFAULT); gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, m,n,k, alpha, @@ -201,8 +218,10 @@ public: deviceVector &Amk, // pointer list to matrices deviceVector &Bkn, ComplexD beta, - deviceVector &Cmn) + deviceVector &Cmn, + GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT) { + assert(precision == GridBLAS_PRECISION_DEFAULT); RealD t2=usecond(); int32_t batchCount = Amk.size(); assert(Bkn.size()==batchCount); @@ -448,7 +467,8 @@ public: deviceVector &Amk, // pointer list to matrices deviceVector &Bkn, ComplexF beta, - deviceVector &Cmn) + deviceVector &Cmn, + GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT) { RealD t2=usecond(); int32_t batchCount = Amk.size(); @@ -473,6 +493,7 @@ public: assert(Bkn.size()==batchCount); assert(Cmn.size()==batchCount); #ifdef GRID_HIP + assert(precision == GridBLAS_PRECISION_DEFAULT); hipblasOperation_t hOpA; hipblasOperation_t hOpB; if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; @@ -503,50 +524,67 @@ public: if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; - auto err = cublasCgemmBatched(gridblasHandle, - hOpA, - hOpB, - m,n,k, - (cuComplex *) &alpha_p[0], - (cuComplex **)&Amk[0], lda, - (cuComplex **)&Bkn[0], ldb, - (cuComplex *) &beta_p[0], - (cuComplex **)&Cmn[0], ldc, - batchCount); + cublasStatus_t err; + if (precision == GridBLAS_PRECISION_DEFAULT) { + err = cublasCgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (cuComplex *) &alpha_p[0], + (cuComplex **)&Amk[0], lda, + (cuComplex **)&Bkn[0], ldb, + (cuComplex *) &beta_p[0], + (cuComplex **)&Cmn[0], ldc, + batchCount); + } else { + cublasComputeType_t compute_precision = toDataType(precision); + err = cublasGemmBatchedEx(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (void *) &alpha_p[0], + (void **)&Amk[0], CUDA_C_32F, lda, + (void **)&Bkn[0], CUDA_C_32F, ldb, + (void *) &beta_p[0], + (void **)&Cmn[0], CUDA_C_32F, ldc, + batchCount, compute_precision, CUBLAS_GEMM_DEFAULT); + } assert(err==CUBLAS_STATUS_SUCCESS); #endif #ifdef GRID_SYCL - int64_t m64=m; - int64_t n64=n; - int64_t k64=k; - int64_t lda64=lda; - int64_t ldb64=ldb; - int64_t ldc64=ldc; - int64_t batchCount64=batchCount; - - oneapi::mkl::transpose iOpA; - oneapi::mkl::transpose iOpB; - - if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; - if ( 
OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; - if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; - if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; - if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; - if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; - - oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, - &iOpA, - &iOpB, - &m64,&n64,&k64, - (ComplexF *) &alpha_p[0], - (const ComplexF **)&Amk[0], (const int64_t *)&lda64, - (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, - (ComplexF *) &beta_p[0], - (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, - (int64_t)1,&batchCount64,std::vector()); + assert(precision == GridBLAS_PRECISION_DEFAULT); + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t lda64=lda; + int64_t ldb64=ldb; + int64_t ldc64=ldc; + int64_t batchCount64=batchCount; + + oneapi::mkl::transpose iOpA; + oneapi::mkl::transpose iOpB; + + if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; + if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; + if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; + if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; + if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; + if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; + + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + &iOpA, + &iOpB, + &m64,&n64,&k64, + (ComplexF *) &alpha_p[0], + (const ComplexF **)&Amk[0], (const int64_t *)&lda64, + (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, + (ComplexF *) &beta_p[0], + (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, + (int64_t)1,&batchCount64,std::vector()); synchronise(); #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + assert(precision == GridBLAS_PRECISION_DEFAULT); // Need a default/reference implementation; use Eigen if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { @@ -946,6 +984,336 @@ public: RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; } + /* + Inverse and Determinant + + - CPU version uses Eigen + - GPU version uses LAPACK-compatible getrf / getri + + Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner. + Overhead to go through getrf / getri for CPU version too large. + Current interface therefore only guarantees the inverse and determinant + functions on all platforms but not the getrf / getri ones. 
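     As an illustration, a hedged usage sketch (not code from this patch; the element
     types and the synchronise() call are assumptions) of the guaranteed entry points:

       GridBLAS blas;
       int64_t n = 12, batch = 64;
       deviceVector<ComplexD*> Ann(batch);  // device pointers to n x n input matrices
       deviceVector<ComplexD*> Cnn(batch);  // device pointers to n x n output matrices
       // ... point Ann[b] / Cnn[b] at device storage for each batch entry b ...
       blas.inverseBatched(n, Ann, Cnn);    // Cnn[b] <- inverse; Ann[b] is overwritten with its LU factors
       blas.synchronise();                  // ensure results are visible before host-side use

     determinantBatched(n, Ann, C) follows the same pattern, overwriting Ann and writing one
     scalar determinant through each C[b].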
+ */ +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + + void inverseBatched(int64_t n, + deviceVector &Ann, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + assert(batchCount == Cnn.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + Eigen::Map eCnn(Cnn[p],n,n); + eCnn = eAnn.inverse(); + }); + } + + void inverseBatched(int64_t n, + deviceVector &Ann, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + assert(batchCount == Cnn.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + Eigen::Map eCnn(Cnn[p],n,n); + eCnn = eAnn.inverse(); + }); + } + + void determinantBatched(int64_t n, + deviceVector &Ann, + deviceVector &C) { + + int64_t batchCount = Ann.size(); + assert(batchCount == C.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + *C[p] = eAnn.determinant(); + }); + } + + void determinantBatched(int64_t n, + deviceVector &Ann, + deviceVector &C) { + + int64_t batchCount = Ann.size(); + assert(batchCount == C.size()); + thread_for(p,batchCount, { + Eigen::Map eAnn(Ann[p],n,n); + *C[p] = eAnn.determinant(); + }); + } + +#else + +#ifdef GRID_SYCL + template + void getrfBatchedSYCL(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info) { + + int64_t batchCount = Ann.size(); + + static deviceVector scratchpad; + int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount); + if (sp_size > scratchpad.size()) + scratchpad.resize(sp_size); + + static deviceVector _ipiv; + if (batchCount > _ipiv.size()) + _ipiv.resize(batchCount); + int64_t** p_ipiv = &_ipiv[0]; + int64_t* pipiv = &ipiv[0]; + + accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; }); + + oneapi::mkl::lapack::getrf_batch(*gridblasHandle, + &n, &n, + (T **)&Ann[0], + &n, + (int64_t**)&_ipiv[0], + (int64_t)1, &batchCount, + (T*)&scratchpad[0], (int64_t)scratchpad.size(), + std::vector()); + synchronise(); + } +#endif + + void getrfBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount*n); + assert(info.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasZgetrfBatched(gridblasHandle,(int)n, + (hipblasDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + auto err = cublasZgetrfBatched(gridblasHandle, (int)n, + (cuDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + getrfBatchedSYCL(n, Ann, ipiv, info); +#endif + } + + void getrfBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount*n); + assert(info.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasCgetrfBatched(gridblasHandle,(int)n, + (hipblasComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + auto err = cublasCgetrfBatched(gridblasHandle, (int)n, + (cuComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + getrfBatchedSYCL(n, Ann, ipiv, info); +#endif + } + +#ifdef GRID_SYCL + template + void getriBatchedSYCL(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector 
&info, + deviceVector &Cnn) { + + int64_t batchCount = Ann.size(); + + static deviceVector scratchpad; + int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size(*gridblasHandle, &n, &n, (int64_t)1, &batchCount); + if (sp_size > scratchpad.size()) + scratchpad.resize(sp_size); + + static deviceVector _ipiv; + if (batchCount > _ipiv.size()) + _ipiv.resize(batchCount); + int64_t** p_ipiv = &_ipiv[0]; + int64_t* pipiv = &ipiv[0]; + + accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; }); + + oneapi::mkl::lapack::getri_batch(*gridblasHandle, + &n, + (T **)&Ann[0], + &n, + (int64_t**)p_ipiv, + (int64_t)1, &batchCount, + (T *)&scratchpad[0], (int64_t)scratchpad.size(), + std::vector()); + + synchronise(); + + T** pA = &Ann[0]; + T** pC = &Cnn[0]; + accelerator_for(i, batchCount*n*n, 1, { + auto j = i / batchCount; + auto k = i % batchCount; + pC[k][j] = pA[k][j]; + }); + } + +#endif + + void getriBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info, + deviceVector &Cnn) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount*n); + assert(info.size()==batchCount); + assert(Cnn.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasZgetriBatched(gridblasHandle,(int)n, + (hipblasDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (hipblasDoubleComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + auto err = cublasZgetriBatched(gridblasHandle, (int)n, + (cuDoubleComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (cuDoubleComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + getriBatchedSYCL(n, Ann, ipiv, info, Cnn); +#endif + } + + void getriBatched(int64_t n, + deviceVector &Ann, + deviceVector &ipiv, + deviceVector &info, + deviceVector &Cnn) + { + int64_t batchCount = Ann.size(); + assert(ipiv.size()==batchCount*n); + assert(info.size()==batchCount); + assert(Cnn.size()==batchCount); + +#ifdef GRID_HIP + auto err = hipblasCgetriBatched(gridblasHandle,(int)n, + (hipblasComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (hipblasComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + auto err = cublasCgetriBatched(gridblasHandle, (int)n, + (cuComplex **)&Ann[0], (int)n, + (int*) &ipiv[0], + (cuComplex **)&Cnn[0], (int)n, + (int*) &info[0], + (int)batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + getriBatchedSYCL(n, Ann, ipiv, info, Cnn); +#endif + } + + template + void inverseBatched(int64_t n, + deviceVector &Ann, // this will be overwritten with LU decomposition + deviceVector &Cnn // this will be overwritten with the inverse + ) { + + int64_t batchCount = Ann.size(); + RealD t0 = usecond(); + deviceVector ipiv(batchCount*n); + deviceVector info(batchCount); + + //RealD t1 = usecond(); + getrfBatched(n, Ann, ipiv, info); + // test info for non-invertibility? set to nan if yes? 
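  // A hedged note on the question above (illustrative, not part of this patch): getrf flags a
  // singular matrix b by writing a positive pivot index into info[b], so copying info back with
  // acceleratorCopyFromDevice and checking that every entry is zero -- or poisoning Cnn[b] with
  // NaNs when it is not -- would implement that test, at the cost of an extra device-to-host
  // transfer per call.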
+ getriBatched(n, Ann, ipiv, info, Cnn); + //synchronise(); + //RealD t2 = usecond(); + //std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl; + } + + template + void determinantBatched(int64_t n, + deviceVector &Ann, // this will be overwritten with LU decomposition + deviceVector &C // this will be overwritten with determinant + ) { + + int64_t batchCount = Ann.size(); + //RealD t0 = usecond(); + deviceVector ipiv(batchCount*n); + deviceVector info(batchCount); + + dtype** pAnn = (dtype**)&Ann[0]; + dtype** pC = (dtype**)&C[0]; +#if defined(GRID_CUDA) || defined(GRID_HIP) + int* pipiv = (int*)&ipiv[0]; +#else + int64_t* pipiv = (int64_t*)&ipiv[0]; +#endif + + //RealD t1 = usecond(); + getrfBatched(n, Ann, ipiv, info); + //RealD t2 = usecond(); + accelerator_for(i,batchCount,1,{ + dtype det = 1.0; + for (int64_t j=0;j double benchmark(int M, int N, int K, int BATCH) { diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 552d7f19..0f03aba5 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -32,6 +32,10 @@ NAMESPACE_BEGIN(Grid); Grid_MPI_Comm CartesianCommunicator::communicator_world; +#ifdef GRID_CHECKSUM_COMMS +extern void * Grid_backtrace_buffer[_NBACKTRACE]; +uint64_t checksum_index = 1; +#endif //////////////////////////////////////////// // First initialise of comms system @@ -568,6 +572,11 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vectorHostBufferMalloc(xbytes); CommsRequest_t srq; +#ifdef GRID_CHECKSUM_COMMS + uint64_t xbytes_data = xbytes - 8; + srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch + assert(xbytes % 8 == 0); + // flip one bit so that a zero buffer is not consistent + uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag); + *(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum; +#else srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch +#endif // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); // assert(ierr==0); @@ -635,7 +654,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vectorHostBufferFreeAll(); // Clean up the buffer allocs diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 469ec3bb..6ef950d5 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -43,10 +43,6 @@ Author: Christoph Lehner #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS #else -#ifdef HAVE_NUMAIF_H - #warning " Using NUMAIF " -#include -#endif #endif #include #endif diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index fdb98cd4..b8099c27 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -202,7 +202,7 @@ template void Scatter_plane_simple (Lattice &rhs,deviceVector< { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); - autoView( rhs_v, rhs, AcceleratorWrite); + autoView( rhs_v, rhs, AcceleratorWriteDiscard); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); }); @@ -228,7 +228,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; - autoView( rhs_v , rhs, AcceleratorWrite); + autoView( rhs_v , rhs, 
AcceleratorWriteDiscard); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; int b = nn/e1; @@ -302,7 +302,7 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs { auto table = MapCshiftTable(); autoView(rhs_v , rhs, AcceleratorRead); - autoView(lhs_v , lhs, AcceleratorWrite); + autoView(lhs_v , lhs, AcceleratorWriteDiscard); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index fc0cc2eb..a66a0420 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -29,8 +29,12 @@ Author: paboyle #ifndef _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_ - NAMESPACE_BEGIN(Grid); + +#ifdef GRID_CHECKSUM_COMMS +extern uint64_t checksum_index; +#endif + const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { @@ -126,8 +130,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); #ifndef ACCELERATOR_AWARE_MPI - static hostVector hsend_buf; hsend_buf.resize(buffer_size); - static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); + int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj); + static hostVector hsend_buf; hsend_buf.resize(buffer_size+pad); + static hostVector hrecv_buf; hrecv_buf.resize(buffer_size+pad); #endif int cb= (cbmask==0x2)? Odd : Even; @@ -180,20 +185,39 @@ template void Cshift_comms(Lattice &ret,const Lattice &r #else // bouncy bouncy acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes); + +#ifdef GRID_CHECKSUM_COMMS + assert(bytes % 8 == 0); + checksum_index++; + uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index); + *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum; + bytes += 8; +#endif + grid->SendToRecvFrom((void *)&hsend_buf[0], xmit_to_rank, (void *)&hrecv_buf[0], recv_from_rank, bytes); + +#ifdef GRID_CHECKSUM_COMMS + bytes -= 8; acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); + uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes); + uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index); std::cout << GridLogComms<< " Cshift: " <<" dim"<ThisRank() <<" Coor "<ThisProcessorCoor() - <<" send "< void Cshift_comms_simd(Lattice &ret,const Lattice hsend_buf; hsend_buf.resize(buffer_size); - hostVector hrecv_buf; hrecv_buf.resize(buffer_size); +#ifdef GRID_CHECKSUM_COMMS + buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj); +#endif + + static hostVector hsend_buf; hsend_buf.resize(buffer_size); + static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); + +#ifdef GRID_CHECKSUM_COMMS + buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj); +#endif #endif int bytes = buffer_size*sizeof(scalar_object); @@ -328,21 +360,37 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeSendToRecvFrom((void *)&hsend_buf[0], xmit_to_rank, (void *)&hrecv_buf[0], recv_from_rank, bytes); +#ifdef GRID_CHECKSUM_COMMS + bytes -= 8; acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); + uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes); + uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index); std::cout << GridLogComms<< " Cshift_comms_simd: " <<" dim"<ThisRank() <<" Coor "<ThisProcessorCoor() - <<" send "< Word svm_xor(Word *vec,uint64_t L) theGridAccelerator->wait(); return ret; } 
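The checksum_gpu kernel added below XOR-reduces the message buffer after rotating each 64-bit
word by (index % 61) bits, so the result depends on word order as well as content; the comms
paths then XOR in a value derived from checksum_index (and the tag) so that a stale or all-zero
buffer still fails the comparison. A host-side sketch of the same reduction, under that reading
of the kernel (the names rotl64/checksum_cpu are illustrative assumptions, not part of the patch),
would be:

  static inline uint64_t rotl64(uint64_t x, unsigned r) {  // rotate left; guard r==0 to avoid a 64-bit shift
    return r ? (x << r) | (x >> (64 - r)) : x;
  }
  inline uint64_t checksum_cpu(const uint64_t *vec, uint64_t L) {
    uint64_t sum = 0;
    for (uint64_t i = 0; i < L; i++)
      sum ^= rotl64(vec[i], (unsigned)(i % 61));  // position-dependent rotation: reordered words change the sum
    return sum;
  }

Such a host-side mirror is mainly useful for cross-checking the GRID_CHECKSUM_COMMS path when
debugging a mismatch reported on the Cshift or stencil receive side.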
+template Word checksum_gpu(Word *vec,uint64_t L) +{ + Word identity; identity=0; + Word ret = 0; + { + sycl::buffer abuff(&ret, {1}); + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); + cgh.parallel_for(sycl::range<1>{L}, + Reduction, + [=] (sycl::id<1> index, auto &sum) { + auto l = index % 61; + sum ^= vec[index]<>(64-l); + }); + }); + } + theGridAccelerator->wait(); + return ret; +} NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index 064c10e6..1df4e6e0 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -106,6 +106,47 @@ public: } }; + +#ifdef GRID_LOG_VIEWS +// Little autoscope assister +template +class ViewCloser +{ + View v; // Take a copy of view and call view close when I go out of scope automatically + const char* filename; int line, mode; +public: + ViewCloser(View &_v, const char* _filename, int _line, int _mode) : + v(_v), filename(_filename), line(_line), mode(_mode) { + + switch (mode){ + case AcceleratorRead: + case AcceleratorWrite: + case CpuRead: + case CpuWrite: + ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0])); + break; + } + + }; + ~ViewCloser() { + + switch (mode) { + case AcceleratorWriteDiscard: + case AcceleratorWrite: + case CpuWrite: + ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0])); + break; + } + + v.ViewClose(); + } +}; + +#define autoView(l_v,l,mode) \ + auto l_v = l.View(mode); \ + ViewCloser _autoView##l_v(l_v,__FILE__,__LINE__,mode); + +#else // Little autoscope assister template class ViewCloser @@ -119,6 +160,7 @@ class ViewCloser #define autoView(l_v,l,mode) \ auto l_v = l.View(mode); \ ViewCloser _autoView##l_v(l_v); +#endif ///////////////////////////////////////////////////////////////////////////////////////// // Lattice expression types used by ET to assemble the AST diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h index f8d1f11f..de832b7d 100644 --- a/Grid/qcd/action/fermion/ZMobiusFermion.h +++ b/Grid/qcd/action/fermion/ZMobiusFermion.h @@ -57,7 +57,7 @@ public: { // RealD eps = 1.0; - std::cout< zgamma(this->Ls); for(int s=0;sLs;s++){ zgamma[s] = gamma[s]; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 26e80838..54a76b07 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -535,7 +535,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField { autoView(U_v , U,AcceleratorRead); autoView(in_v , in,AcceleratorRead); - autoView(out_v,out,AcceleratorWrite); + autoView(out_v,out,AcceleratorWriteDiscard); autoView(st_v , st,AcceleratorRead); KERNEL_CALL_ID(GenericDhopSite); } diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h index b25bd2f9..6f6fd016 100644 --- a/Grid/qcd/smearing/GaugeConfiguration.h +++ b/Grid/qcd/smearing/GaugeConfiguration.h @@ -118,7 +118,7 @@ protected: GaugeK); // derivative of SmearBase return SigmaK; } - +public: /*! 
@brief Returns smeared configuration at level 'Level' */ const GaugeField &get_smeared_conf(int Level) const { diff --git a/Grid/qcd/smearing/GaugeConfigurationMasked.h b/Grid/qcd/smearing/GaugeConfigurationMasked.h index 78846263..3ee00d6c 100644 --- a/Grid/qcd/smearing/GaugeConfigurationMasked.h +++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h @@ -819,7 +819,7 @@ public: } // if smearingLevels = 0 do nothing } -private: +public: //==================================================================== // Override base clas here to mask it virtual void fill_smearedSet(GaugeField &U) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index cf38db27..25d194b5 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -192,6 +192,11 @@ public: void ViewClose(void) { } +#ifdef GRID_LOG_VIEWS + size_t size() { return 0; }; + uint64_t & operator[](size_t i) { static uint64_t v=0; return v; }; +#endif + }; //////////////////////////////////////// diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 139e7957..466ce071 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -372,4 +372,53 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) } } +#ifdef GRID_LOG_VIEWS + +bool ViewLogger::Enabled = false; +std::vector ViewLogger::LogVector; + +void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); } +void ViewLogger::End() { Enabled = false; } + +void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes) +{ + if (!Enabled) + return; + + size_t i = LogVector.size(); + LogVector.resize(i + 1); + auto & n = LogVector[i]; + + n.filename = filename; + n.line = line; + n.index = index; + + if (bytes < sizeof(uint64_t)) { + + n.head = n.tail = 0; + + } else { + + switch (mode) { + case AcceleratorRead: + case AcceleratorWrite: + case AcceleratorWriteDiscard: + acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t)); + acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t)); + break; + + case CpuRead: + case CpuWrite: + //case CpuWriteDiscard: + n.head = *(uint64_t*)data; + n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t)); + break; + } + } +} + +#endif + + + NAMESPACE_END(Grid); diff --git a/Grid/util/FlightRecorder.h b/Grid/util/FlightRecorder.h index 7c1cd171..7cad5a61 100644 --- a/Grid/util/FlightRecorder.h +++ b/Grid/util/FlightRecorder.h @@ -42,5 +42,22 @@ class FlightRecorder { static void xmitLog(void *,uint64_t bytes); static void recvLog(void *,uint64_t bytes,int rank); }; +#ifdef GRID_LOG_VIEWS +class ViewLogger { + struct Entry_t { + const char* filename; + int line; + int index; + uint64_t head, tail; + }; + +public: + static bool Enabled; + static std::vector LogVector; + static void Begin(); + static void End(); + static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes); +}; +#endif NAMESPACE_END(Grid); diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 18bdd14f..90f48b7f 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -416,12 +416,28 @@ void Grid_init(int *argc,char ***argv) } else { FILE *fp; std::ostringstream fname; + + int rank = CartesianCommunicator::RankWorld(); + int radix=64; + char* root = getenv("GRID_STDOUT_ROOT"); + if (root) { + fname << root ; + mkdir(fname.str().c_str(), S_IRWXU ); + fname << "/"; + } + fname << (rank/radix)*radix ; + mkdir(fname.str().c_str(), S_IRWXU ); + fname << "/"; fname<<"Grid.stdout."; fname<
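For reference, a hedged sketch of how the GRID_LOG_VIEWS hooks above might be driven from an
existing Grid program (the SuspectRoutine call and the printing are illustrative assumptions,
not part of this patch); each logged entry records the file/line of the autoView, +1 on open /
-1 on close, and the first and last 64-bit words of the view's data at that moment, which can
be compared rank-by-rank to localise a divergence:

#ifdef GRID_LOG_VIEWS
  ViewLogger::Begin();                      // start recording autoView open/close events
  SuspectRoutine(U, in, out);               // hypothetical region under investigation
  ViewLogger::End();
  for (auto &e : ViewLogger::LogVector) {   // e.index: +1 = view opened, -1 = view closed
    std::cout << GridLogMessage << e.filename << ":" << e.line
              << " idx " << e.index
              << " head 0x" << std::hex << e.head
              << " tail 0x" << e.tail << std::dec << std::endl;
  }
#endif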