fast cpu basisRotate and other small cleanups

2025-08-24 23:17:10 +01:00 · 2020-07-30 07:08:54 -04:00
parent 0e88bf4bff
commit 197612bc7a
3 changed files with 43 additions and 30 deletions
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -136,34 +136,39 @@ void MemoryManager::Init(void)
      Ncache[SharedSmall]=Nc;
    }
  }
-  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
-#ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
-#endif

+  // Only the root rank prints these messages. Init() is called before the communicator
+  // is initialized, so the output has to be restricted to rank 0 manually here.
+  if ( CartesianCommunicator::RankWorld() == 0 ) {
+    std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
+#ifdef ALLOCATION_CACHE
+    std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+#endif
+    
 #ifdef GRID_UVM
-  std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
 #ifdef GRID_CUDA
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_HIP
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_SYCL
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
 #endif
 #else
-  std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
 #ifdef GRID_CUDA
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
 #endif
 #ifdef GRID_HIP
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
 #endif
 #ifdef GRID_SYCL
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 #endif
 #endif
+  }
 }

 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -44,22 +44,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {

-// Fugaku Tofu: enable by default
-/*
-#if defined (A64FX) || defined (A64FXFIXEDSIZE)
-#ifndef TOFU
-#define TOFU
-#pragma message ("TOFU network / MPI_THREAD_SERIALIZED")
-#endif
-#endif
-*/
-
 #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
    nCommThreads=1;
    // wrong results here too
-    //MPI_Init(argc,argv);
-
-    // comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
+    // For now: comms-overlap gives wrong results in Benchmark_wilson, even on single-node MPI runs;
    // other comms schemes are ok
    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
 #else
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -54,13 +54,34 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  typedef decltype(basis[0].View(AcceleratorRead)) View;

  Vector<View> basis_v; basis_v.reserve(basis.size());
+  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
  GridBase* grid = basis[0].Grid();
      
  for(int k=0;k<basis.size();k++){
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

-
+#ifdef GRID_OMP
+  int max_threads = thread_max();
+  Vector < vobj > Bt(Nm * max_threads);
+  thread_region
+    {
+      vobj* B = &Bt[Nm * thread_num()];
+      thread_for_in_region(ss, grid->oSites(),{
+	  for(int j=j0; j<j1; ++j) B[j]=0.;
+      
+	  for(int j=j0; j<j1; ++j){
+	    for(int k=k0; k<k1; ++k){
+	      B[j] +=Qt(j,k) * basis_v[k][ss];
+	    }
+	  }
+	  for(int j=j0; j<j1; ++j){
+	    basis_v[j][ss] = B[j];
+	  }
+	});
+    }
+#else
  View *basis_vp = &basis_v[0];

  int nrot = j1-j0;
@@ -70,14 +91,12 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead

-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
-
  Vector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];

  // GPU readable copy of matrix
-  Vector<double> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
+  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
@@ -118,6 +137,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
+#endif

  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }