Improved frontier cshift

2025-06-17 15:27:06 +01:00 · 2023-10-13 17:46:07 +03:00
parent 6d0c2de399
commit c9c4576237
3 changed files with 122 additions and 63 deletions
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-extern Vector<std::pair<int,int> > Cshift_table; 
+extern std::vector<std::pair<int,int> > Cshift_table; 
+extern commVector<std::pair<int,int> > Cshift_table_device; 

+inline std::pair<int,int> *MapCshiftTable(void)
+{
+  // GPU version
+#ifdef ACCELERATOR_CSHIFT    
+  uint64_t sz=Cshift_table.size();
+  if (Cshift_table_device.size()!=sz )    {
+    Cshift_table_device.resize(sz);
+  }
+  acceleratorCopyToDevice((void *)&Cshift_table[0],
+			  (void *)&Cshift_table_device[0],
+			  sizeof(Cshift_table[0])*sz);
+
+  return &Cshift_table_device[0];
+#else 
+  return &Cshift_table[0];
+#endif
+  // CPU version use identify map
+}
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
  }
  {
    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT    
+    auto table = MapCshiftTable();
+#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  
  {
    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
@ -297,30 +316,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }

-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-
-template <typename T>
-T iDivUp(T a, T b) // Round a / b to nearest higher integer value
-{ return (a % b != 0) ? (a / b + 1) : (a / b); }
-
-template <typename T>
-__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
-{
-    int idx = blockIdx.x*blockDim.x + threadIdx.x;
-    if (idx >= e1*e2) return;
-
-    int n, b, o;
-
-    n = idx / e2;
-    b = idx % e2;
-    o = n*stride + b;
-
-    vector[2*idx + 0] = lo + o;
-    vector[2*idx + 1] = ro + o;
-}
-
-#endif
-
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@ -345,20 +340,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;

  if(cbmask == 0x3 ){
-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-    ent = e1*e2;
-    dim3 blockSize(acceleratorThreads());
-    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
-    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
-    accelerator_barrier();
-#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
-#endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -372,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }

  {
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
@ -409,19 +396,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;

  if ( cbmask == 0x3 ) {
-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-    ent = e1*e2;
-    dim3 blockSize(acceleratorThreads());
-    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
-    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
-    accelerator_barrier();
-#else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
-#endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
@ -432,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }

  {
-    auto table = &Cshift_table[0];
+    auto table = MapCshiftTable();
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);