diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index dc972537..329d1d46 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -191,7 +191,7 @@ public: Lattice pgbuf(&pencil_g); autoView(pgbuf_v , pgbuf, CpuWrite); - std::cout << "CPU view" << std::endl; + //std::cout << "CPU view" << std::endl; typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; @@ -215,7 +215,7 @@ public: else if ( sign == forward ) div = 1.0; else assert(0); - std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; + //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; FFTW_plan p; { FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; @@ -229,7 +229,7 @@ public: } // Barrel shift and collect global pencil - std::cout << GridLogPerformance<<"Making pencil" << std::endl; + //std::cout << GridLogPerformance<<"Making pencil" << std::endl; Coordinate lcoor(Nd), gcoor(Nd); result = source; int pc = processor_coor[dim]; @@ -251,7 +251,7 @@ public: } } - std::cout <::fftw_destroy_plan(p); #endif diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index f4245319..3a7bbc44 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -208,8 +208,8 @@ public: assert(Bkn.size()==batchCount); assert(Cmn.size()==batchCount); - assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose - assert(OpB!=GridBLAS_OP_T); + //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose + //assert(OpB!=GridBLAS_OP_T); int lda = m; // m x k column major int ldb = k; // k x n column major @@ -367,28 +367,67 @@ public: Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + else + eCmn = alpha * eAmk.adjoint() * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + else + eCmn = alpha * eAmk * eBkn.adjoint() ; + }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + else + eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ; + } ); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; } ); } else { assert(0); @@ -414,8 +453,8 @@ public: RealD t2=usecond(); int32_t batchCount = Amk.size(); - assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose - assert(OpB!=GridBLAS_OP_T); + //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose + //assert(OpB!=GridBLAS_OP_T); int lda = m; // m x k column major int ldb = k; // k x n column major @@ -514,28 +553,70 @@ public: Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; + else + eCmn = alpha * eAmk.adjoint() * eBkn ; + }); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],k,n); + Eigen::Map eCmn(Cmn[p],m,n); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; + else + eCmn = alpha * eAmk * eBkn.adjoint() ; + }); + } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],m,k); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + else + eCmn = alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; + else + eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ; + } ); + } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { + thread_for (p, batchCount, { + Eigen::Map eAmk(Amk[p],k,m); + Eigen::Map eBkn(Bkn[p],n,k); + Eigen::Map eCmn(Cmn[p],m,n); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; } ); } else { assert(0); @@ -661,29 +742,41 @@ public: Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + else + eCmn = alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; - } ); + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; + }); } else { assert(0); } @@ -809,28 +902,40 @@ public: Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn ; + else + eCmn = alpha * eAmk * eBkn ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],k,n); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; + else + eCmn = alpha * eAmk.transpose() * eBkn ; }); } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],m,k); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; + else + eCmn = alpha * eAmk * eBkn.transpose() ; }); } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { thread_for (p, batchCount, { Eigen::Map eAmk(Amk[p],k,m); Eigen::Map eBkn(Bkn[p],n,k); Eigen::Map eCmn(Cmn[p],m,n); - eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + if (std::abs(beta) != 0.0) + eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; + else + eCmn = alpha * eAmk.transpose() * eBkn.transpose() ; }); } else { assert(0); diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 1db5bc59..62fcfa7b 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -149,7 +149,8 @@ public: sizeof(obj),d*100+p); } - CommsComplete(list); + if (!list.empty()) // avoid triggering assert in comms == none + CommsComplete(list); for(int p=1;p<_processors[d];p++){ accum = accum + column[p]; } diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index dc22aee0..9b82658a 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -542,7 +542,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI - printf("Host buffer allocate for GPU non-aware MPI\n"); + // printf("Host buffer allocate for GPU non-aware MPI\n"); #if 0 HostCommBuf= acceleratorAllocHost(bytes); #else