From 84cab5e6e7c1cd23efe4d42d2d34fd1b31ac66fa Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Sat, 1 Feb 2025 16:37:21 +0100
Subject: [PATCH 1/5] no comms and log cleanup

---
 Grid/algorithms/FFT.h                 | 12 ++++++------
 Grid/communicator/Communicator_base.h |  3 ++-
 Grid/util/Lexicographic.h             |  2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h
index dc972537..329d1d46 100644
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -191,7 +191,7 @@ public:
 
     Lattice<sobj> pgbuf(&pencil_g);
     autoView(pgbuf_v , pgbuf, CpuWrite);
-    std::cout << "CPU view" << std::endl;
+    //std::cout << "CPU view" << std::endl;
 
     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
     typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -215,7 +215,7 @@ public:
     else if ( sign == forward ) div = 1.0;
     else assert(0);
 
-    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
     FFTW_plan p;
     {
       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +229,7 @@ public:
     }
 
     // Barrel shift and collect global pencil
-    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
     Coordinate lcoor(Nd), gcoor(Nd);
     result = source;
     int pc = processor_coor[dim];
@@ -251,7 +251,7 @@ public:
       }
     }
 
-    std::cout <<GridLogPerformance<<"Destroying plan" << std::endl;
+    //std::cout <<GridLogPerformance<<"Destroying plan" << std::endl;
     FFTW<scalar>::fftw_destroy_plan(p);
 #endif
diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index c72fcc79..964a523b 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -147,7 +147,8 @@ public:
 			  sizeof(obj),d*100+p);
 
     }
-    CommsComplete(list);
+    if (!list.empty()) // avoid triggering assert in comms == none
+      CommsComplete(list);
     for(int p=1;p<_processors[d];p++){
       accum = accum + column[p];
     }
diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index 422e42ee..8636333d 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -50,7 +50,7 @@ namespace Grid{
       int64_t index64;
       IndexFromCoorReversed(coor,index64,dims);
       if ( index64>=2*1024*1024*1024LL ){
-	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+	//std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
       }
       assert(index64<2*1024*1024*1024LL);
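The Communicator_base.h hunk guards the completion call because a build with comms == none never posts any transfers, and CommsComplete asserts when handed an empty request list. A minimal self-contained sketch of the pattern (the assert body and the caller name are illustrative assumptions, not code from the patch):

    #include <cassert>
    #include <vector>

    struct CommsRequest_t { };                    // stand-in for Grid's request type

    void CommsComplete(std::vector<CommsRequest_t> &list) {
      assert(list.size() > 0);                    // assumed: trips on empty lists under comms == none
      // ... wait on each outstanding request ...
    }

    void ReduceColumn(std::vector<CommsRequest_t> &list) {  // hypothetical caller
      if (!list.empty())                          // single rank, nothing posted: skip completion
        CommsComplete(list);
    }
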
From: Peter Boyle
Date: Tue, 11 Mar 2025 13:49:53 +0000
Subject: [PATCH 2/5] Should work on Aurora now

---
 Grid/allocator/AlignedAllocator.h      | 2 +-
 Grid/allocator/MemoryManagerCache.cc   | 3 +++
 Grid/communicator/Communicator_mpi3.cc | 3 ---
 Grid/cshift/Cshift_mpi.h               | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index a32d69db..316f201c 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -69,7 +69,7 @@ public:
   }
 
   // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { assert(0);};
+  void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc
index eb8c6d38..09afbcf7 100644
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -234,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
   assert(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 38b9f9c6..8de29669 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -759,9 +759,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
   if (MpiRequests.size()>0) {
     status.resize(MpiRequests.size());
     int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]);
 // Sends are guaranteed in order. No harm in not completing.
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 710792ee..4b000035 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  static std::vector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static std::vector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
@@ -250,8 +250,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #ifndef ACCELERATOR_AWARE_MPI
-  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+  std::vector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  std::vector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int bytes = buffer_size*sizeof(scalar_object);
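The Cshift hunks above sit on the staging path used when MPI cannot accept device pointers: without ACCELERATOR_AWARE_MPI each shift copies device to host, communicates through the host buffers, and copies back. A sketch of that round trip under stated assumptions (buffer and function names are illustrative; the (from, to, bytes) argument order of the accelerator copy helpers is an assumption, not taken from the patch):

    #include <mpi.h>
    #include <cstddef>

    // Assumed helpers with (from, to, bytes) ordering; declarations only.
    void acceleratorCopyFromDevice(void *from, void *to, std::size_t bytes);
    void acceleratorCopyToDevice(void *from, void *to, std::size_t bytes);

    void staged_shift(void *d_send, void *d_recv, void *h_send, void *h_recv,
                      int bytes, int xmit_to_rank, int recv_from_rank, MPI_Comm comm)
    {
      acceleratorCopyFromDevice(d_send, h_send, bytes);   // stage out of the device
      MPI_Sendrecv(h_send, bytes, MPI_CHAR, xmit_to_rank, 0,
                   h_recv, bytes, MPI_CHAR, recv_from_rank, 0,
                   comm, MPI_STATUS_IGNORE);              // host-pointer-only MPI
      acceleratorCopyToDevice(h_recv, d_recv, bytes);     // stage back onto the device
    }
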
From 25ab9325e7f319735533bf40ebaa6ba695d3fd52 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 11 Mar 2025 15:02:32 +0000
Subject: [PATCH 3/5] Use hostVector but remove construct resize

---
 Grid/cshift/Cshift_mpi.h                   | 9 ++++-----
 Grid/qcd/action/fermion/WilsonCompressor.h | 1 -
 Grid/stencil/Stencil.h                     | 1 -
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 4b000035..6f2e2699 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static std::vector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static std::vector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
@@ -244,14 +244,13 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #ifndef ACCELERATOR_AWARE_MPI
-  std::vector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  std::vector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
+  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
   int bytes = buffer_size*sizeof(scalar_object);
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index 1c6571e1..22f2d8e3 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -485,7 +485,6 @@ public:
   assert(this->u_comm_offset==this->_unified_buffer_size);
   accelerator_barrier();
 #ifdef NVLINK_GET
-#warning "NVLINK_GET"
   this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
   // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
   // Or issue barrier AFTER the DMA is running
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 2a666a04..3613cdbb 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -518,7 +518,6 @@ public:
   }
   accelerator_barrier(); // All my local gathers are complete
 #ifdef NVLINK_GET
-#warning "NVLINK_GET"
   _grid->StencilBarrier(); // He can now get mu local gather, I can get his
   // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
   // Or issue barrier AFTER the DMA is running
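Patch 3 can return to hostVector because of the AlignedAllocator change in Patch 2: construct() no longer asserts, so resize() on the pinned-memory vector no longer dies, and as a no-op it also avoids the single-threaded element-construction loop the FIXME comment mentions. The idea reduced to a minimal sketch (simplified by assumption: Grid's real allocator returns pinned pages so host/device copies run at full bandwidth, which plain std::vector storage does not guarantee):

    #include <cstdlib>
    #include <vector>

    template<typename T>
    struct PinnedAllocator {                 // illustrative stand-in for Grid's allocator
      using value_type = T;
      T *allocate(std::size_t n) { return static_cast<T*>(std::malloc(n*sizeof(T))); } // real one: pinned alloc
      void deallocate(T *p, std::size_t) { std::free(p); }                             // real one: pinned free
      template<class U, class... A> void construct(U*, A&&...) { } // no-op: leave elements uninitialized
      template<class U> void destroy(U*) { }
    };

    // resize() now allocates but never runs a per-element construction loop.
    template<typename T> using hostVector = std::vector<T, PinnedAllocator<T>>;
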
From e9177e4af3acd572666d7f66a944cf0539045d7d Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Thu, 13 Mar 2025 08:48:23 +0000
Subject: [PATCH 4/5] Blas compatibility

---
 Grid/algorithms/blas/BatchedBlas.h | 147 ++++++++++++++++++++++++-----
 1 file changed, 126 insertions(+), 21 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index f4245319..3a7bbc44 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -208,8 +208,8 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -367,28 +367,67 @@ public:
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn ;
+    });
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk * eBkn.adjoint() ;
+    });
+  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+    });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+    } );
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     } );
   } else {
     assert(0);
   }
@@ -414,8 +453,8 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    //assert(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -514,28 +553,70 @@ public:
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn ;
+    });
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk * eBkn.adjoint() ;
+    });
+  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+      else
+        eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+    } );
+  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+    thread_for (p, batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     } );
   } else {
     assert(0);
   }
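Every branch above now special-cases beta == 0, matching standard BLAS GEMM semantics: when beta is zero the output must be written without reading C, since C may be uninitialized on entry and 0 * NaN would still poison the result. The rule in isolation, as a self-contained Eigen sketch (the function name is illustrative):

    #include <Eigen/Dense>
    #include <complex>

    // C = alpha*A*B + beta*C with BLAS-style handling of beta == 0.
    void gemm_accum(std::complex<double> alpha, std::complex<double> beta,
                    Eigen::Map<Eigen::MatrixXcd> A,
                    Eigen::Map<Eigen::MatrixXcd> B,
                    Eigen::Map<Eigen::MatrixXcd> C)
    {
      if (std::abs(beta) != 0.0)
        C = beta * C + alpha * A * B;   // C holds valid data: accumulate into it
      else
        C = alpha * A * B;              // never read possibly-uninitialized C
    }
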
@@ -661,29 +742,41 @@ public:
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-    } );
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+    });
   } else {
     assert(0);
   }
@@ -809,28 +902,40 @@ public:
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      else
+        eCmn = alpha * eAmk * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn ;
     });
   } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk * eBkn.transpose() ;
     });
   } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
     thread_for (p, batchCount, {
       Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
       Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
       Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      if (std::abs(beta) != 0.0)
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      else
+        eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
     });
   } else {
     assert(0);
   }
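With the OpA/OpB asserts commented out, the complex kernels now accept GridBLAS_OP_T as well: a plain transpose with no conjugation (Eigen's .transpose()), distinct from GridBLAS_OP_C (Eigen's .adjoint()). The two only differ for complex element types; the difference on a 1x1 matrix, as a quick check:

    #include <Eigen/Dense>
    #include <complex>
    #include <iostream>

    int main() {
      Eigen::MatrixXcd A(1,1);
      A(0,0) = std::complex<double>(1.0, 2.0);
      std::cout << A.transpose()(0,0) << std::endl;  // OP_T: (1,2)   no conjugation
      std::cout << A.adjoint()(0,0)   << std::endl;  // OP_C: (1,-2)  conjugate transpose
      return 0;
    }
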
From fe66c7ca304b2726f8d4a2f66dfc006b544190f8 Mon Sep 17 00:00:00 2001
From: Christoph Lehner
Date: Thu, 13 Mar 2025 12:49:36 +0000
Subject: [PATCH 5/5] verbosity

---
 Grid/communicator/SharedMemoryMPI.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index dc22aee0..9b82658a 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -542,7 +542,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  printf("Host buffer allocate for GPU non-aware MPI\n");
+  //  printf("Host buffer allocate for GPU non-aware MPI\n");
 #if 0
   HostCommBuf= acceleratorAllocHost(bytes);
 #else