fast cpu basisRotate and other small cleanups

2025-07-01 22:17:08 +01:00 · 2020-07-30 07:08:54 -04:00
parent 0e88bf4bff
commit 197612bc7a
3 changed files with 43 additions and 30 deletions
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@ -136,34 +136,39 @@ void MemoryManager::Init(void)
      Ncache[SharedSmall]=Nc;
    }
  }
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
  // only root node delivers messages, this is called before communicator is initialized,
  // so need a manual restriction
  if ( CartesianCommunicator::RankWorld() == 0 ) {
    std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
    std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
 #ifdef GRID_UVM
-  std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
 #ifdef GRID_CUDA
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_HIP
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_SYCL
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
 #endif
 #else
-  std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
 #ifdef GRID_CUDA
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
 #endif
 #ifdef GRID_HIP
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
 #endif
 #ifdef GRID_SYCL
-  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
+    std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 #endif
 #endif
  }
 }
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -44,22 +44,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
 // Fugaku Tofu: enable by default
 /*
 #if defined (A64FX) || defined (A64FXFIXEDSIZE)
 #ifndef TOFU
 #define TOFU
 #pragma message ("TOFU network / MPI_THREAD_SERIALIZED")
 #endif
 #endif
 */
 #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
    nCommThreads=1;
    // wrong results here too
-    //MPI_Init(argc,argv);
+    // For now: single comms thread with MPI_THREAD_SERIALIZED (see note below)
    // comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
    // other comms schemes are ok
    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
 #else
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@ -54,13 +54,34 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  Vector<View> basis_v; basis_v.reserve(basis.size());
  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
  GridBase* grid = basis[0].Grid();
  for(int k=0;k<basis.size();k++){
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }
-
+#ifdef GRID_OMP
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
    {
      vobj* B = &Bt[Nm * thread_num()];
      thread_for_in_region(ss, grid->oSites(),{
 	  for(int j=j0; j<j1; ++j) B[j]=0.;
 	  for(int j=j0; j<j1; ++j){
 	    for(int k=k0; k<k1; ++k){
 	      B[j] +=Qt(j,k) * basis_v[k][ss];
 	    }
 	  }
 	  for(int j=j0; j<j1; ++j){
 	    basis_v[j][ss] = B[j];
 	  }
 	});
    }
 #else
  View *basis_vp = &basis_v[0];
  int nrot = j1-j0;
@ -70,14 +91,12 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
  Vector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];
  // GPU readable copy of matrix
-  Vector<double> Qt_jv(Nm*Nm);
+  Vector<Coeff_t> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
+  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
@ -118,6 +137,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
 #endif
  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }