I have made the Cshift work successfully with OpenMP threading in

every routine. Collapse(2) is now working under clang-omp++.
2025-11-25 08:59:32 +00:00 · 2015-05-13 00:31:00 +01:00
parent 6cec662ac5
commit 48f425d31c
13 changed files with 166 additions and 1006 deletions
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -24,21 +24,21 @@
 #include <stdio.h>
 #include <signal.h>

+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#define MIN(x,y) ((x)>(y)?(y):(x))
+#endif
+
 #include <Grid_config.h>

 ////////////////////////////////////////////////////////////
 // Tunable header includes
 ////////////////////////////////////////////////////////////

-#ifdef HAVE_OPENMP
-#define OMP
-#include <omp.h>
-#endif

 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
-
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
--- a/lib/Grid_stencil.h
+++ b/lib/Grid_stencil.h
@@ -250,7 +250,11 @@ namespace Grid {
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 	  int words = sizeof(cobj)/sizeof(vector_type);

-	  /* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
+	  /*
+	   * possibly slow to allocate
+	   * Doesn't matter in this test, but may want to preallocate in the 
+	   * dirac operators
+	   */
 	  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) ); 
 	  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 	  int bytes = buffer_size*sizeof(scalar_object);
--- a/lib/Grid_threads.h
+++ b/lib/Grid_threads.h
@@ -1,13 +1,17 @@
 #ifndef GRID_THREADS_H
 #define GRID_THREADS_H

-#ifdef HAVE_OPENMP
+#ifdef _OPENMP
+#define GRID_OMP
+#endif
+
+#ifdef GRID_OMP
 #include <omp.h>
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
-#define PARALLEL_NESTED_LOOP(n) _Pragma("omp parallel for collapse(" #n ")")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #else
 #define PARALLEL_FOR_LOOP 
-#define PARALLEL_NESTED_LOOP(n) 
+#define PARALLEL_NESTED_LOOP2
 #endif

 namespace Grid {
@@ -20,7 +24,7 @@ class GridThread {
  static int _threads;

  static void SetThreads(int thr) { 
-#ifdef HAVE_OPENMP
+#ifdef GRID_OMP
    _threads = MIN(thr,omp_get_max_threads()) ;
    omp_set_num_threads(_threads);
 #else 
@@ -28,7 +32,7 @@ class GridThread {
 #endif
  };
  static void SetMaxThreads(void) { 
-#ifdef HAVE_OPENMP
+#ifdef GRID_OMP
    _threads = omp_get_max_threads();
    omp_set_num_threads(_threads);
 #else 
@@ -58,7 +62,7 @@ class GridThread {
  };

  static int  ThreadBarrier(void) {
-#ifdef HAVE_OPENMP
+#ifdef GRID_OMP
 #pragma omp barrier
    return omp_get_thread_num();
 #else
--- a/lib/cshift/Grid_cshift_common.h
+++ b/lib/cshift/Grid_cshift_common.h
@@ -26,16 +26,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  }

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int bo  = 0;                                      // offset in buffer
  
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-      int o = n*rhs._grid->_slice_stride[dimension];
+      int o  = n*rhs._grid->_slice_stride[dimension];
+      int bo = n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb &cbmask ) {
-	buffer[bo]=compress(rhs._odata[so+o+b]);
-	bo++;
+	buffer[bo+b]=compress(rhs._odata[so+o+b]);
      }
    }
  }
@@ -55,9 +54,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
  }

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int bo  = 0;                                      // offset in buffer
    
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

@@ -104,15 +102,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
  }

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int bo  = 0;                                      // offset in buffer
    
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-      int o=n*rhs._grid->_slice_stride[dimension];
+      int o   =n*rhs._grid->_slice_stride[dimension];
+      int bo  =n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb & cbmask ) {
-	rhs._odata[so+o+b]=buffer[bo++];
+	rhs._odata[so+o+b]=buffer[bo+b];
      }
    }
  }
@@ -131,7 +129,7 @@ PARALLEL_NESTED_LOOP(2)

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
    
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

@@ -160,7 +158,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
  
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
      
@@ -185,7 +183,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
  
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
      int o =n*rhs._grid->_slice_stride[dimension];
--- a/lib/cshift/Grid_cshift_mpi.h
+++ b/lib/cshift/Grid_cshift_mpi.h
@@ -1,10 +1,6 @@
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_

-#ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif

 namespace Grid { 

--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -100,22 +100,21 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
  vHalfSpinColourVector  chi;    
  vSpinColourVector result;
  vHalfSpinColourVector Uchi;
-  vHalfSpinColourVector *chi_p;
  int offset,local,perm, ptype;

 PARALLEL_FOR_LOOP
  for(int sss=0;sss<grid->oSites();sss++){

    int ss = sss;
-    int ssu= sss;
-    //int ss = Stencil._LebesgueReorder[sss];
+    int ssu= ss;
+    //    int ss = Stencil._LebesgueReorder[sss];

    // Xp
    offset = Stencil._offsets [Xp][ss];
    local  = Stencil._is_local[Xp][ss];
    perm   = Stencil._permute[Xp][ss];
    ptype  = Stencil._permute_type[Xp];
-    chi_p  = &comm_buf[offset];
+
    if ( local && perm ) 
    {
      spProjXp(tmp,in._odata[offset]);