I have made the Cshift work successfully with OpenMP threading in

every routine. Collapse(2) is now working under clang-omp++.
2025-07-22 23:47:08 +01:00 · 2015-05-13 00:31:00 +01:00
parent 6cec662ac5
commit 48f425d31c
13 changed files with 166 additions and 1006 deletions
--- a/Makefile.in
+++ b/Makefile.in
@@ -205,12 +205,9 @@ AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
-CC = @CC@
-CCDEPMODE = @CCDEPMODE@
-CFLAGS = @CFLAGS@
-CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
+CXXCPP = @CXXCPP@
 CXXDEPMODE = @CXXDEPMODE@
 CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
@@ -234,7 +231,7 @@ LTLIBOBJS = @LTLIBOBJS@
 MAKEINFO = @MAKEINFO@
 MKDIR_P = @MKDIR_P@
 OBJEXT = @OBJEXT@
-OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OPENMP_CXXFLAGS = @OPENMP_CXXFLAGS@
 PACKAGE = @PACKAGE@
 PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
 PACKAGE_NAME = @PACKAGE_NAME@
@@ -252,7 +249,6 @@ abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
 abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
-ac_ct_CC = @ac_ct_CC@
 ac_ct_CXX = @ac_ct_CXX@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
--- a/23
+++ b/23
@@ -2,10 +2,7 @@
 *** Hacks and bug fixes to clean up and Audits
 ================================================================
 * Base class to share common code between vRealF, VComplexF etc... 
-
-* Performance check on Guido's reimplementation strategy
-
-* Bug in SeedFixedIntegers gives same output on each site. -- Think I fixed but NOT checked for sure
+  - Performance check on Guido's reimplementation strategy

 * FIXME audit
 * const audit
@@ -20,15 +17,16 @@
 *** New Functionality
 ================================================================

-* Implement where to take template scheme.
+* Implement where within expression template scheme.

 * - BinaryWriter, TextWriter etc...
  - use protocol buffers? replace xmlReader/Writer ec..
  - Binary use htonll, htonl

-* Expression template engine:
+* Bug in SeedFixedIntegers gave same output on each site. -- Think I fixed but NOT checked for sure
+  Implement and use lattice IO to verify this.

-   -- Audit
+* Expression template engine: -- DONE
   -- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>

 * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
@@ -56,9 +54,10 @@
 * TaProj
 * FFTnD ?

-* Parallel MPI2 IO
-  Plaquette checks into nersc reader.
-
+* Parallel io improvements
+  - optional parallel MPI2 IO
+  - move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
+  
 * rb4d support for 5th dimension in Mobius.

 * Check for missing functionality                    - partially audited against QDP++ layout
@@ -70,13 +69,15 @@
   // copyMask.
   // localMaxAbs
   // Fourier transform equivalent.
-Actions
+
+Actions -- coherent framework for implementing actions and their forces.
 * Fermion
  - Wilson
  - Clover
  - DomainWall
  - Mobius
  - z-Mobius
+
 * Gauge
  - Wilson, symanzik, iwasaki

--- a/aclocal.m4
+++ b/aclocal.m4
@@ -765,70 +765,6 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])

-# Copyright (C) 1999-2014 Free Software Foundation, Inc.
-#
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# _AM_PROG_CC_C_O
-# ---------------
-# Like AC_PROG_CC_C_O, but changed for automake.  We rewrite AC_PROG_CC
-# to automatically call this.
-AC_DEFUN([_AM_PROG_CC_C_O],
-[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-AC_REQUIRE_AUX_FILE([compile])dnl
-AC_LANG_PUSH([C])dnl
-AC_CACHE_CHECK(
-  [whether $CC understands -c and -o together],
-  [am_cv_prog_cc_c_o],
-  [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])])
-  # Make sure it works both with $CC and with simple cc.
-  # Following AC_PROG_CC_C_O, we do the test twice because some
-  # compilers refuse to overwrite an existing .o file with -o,
-  # though they will create one.
-  am_cv_prog_cc_c_o=yes
-  for am_i in 1 2; do
-    if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \
-         && test -f conftest2.$ac_objext; then
-      : OK
-    else
-      am_cv_prog_cc_c_o=no
-      break
-    fi
-  done
-  rm -f core conftest*
-  unset am_i])
-if test "$am_cv_prog_cc_c_o" != yes; then
-   # Losing compiler, so override with the script.
-   # FIXME: It is wrong to rewrite CC.
-   # But if we don't then we get into trouble of one sort or another.
-   # A longer-term fix would be to have automake use am__CC in this case,
-   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
-   CC="$am_aux_dir/compile $CC"
-fi
-AC_LANG_POP([C])])
-
-# For backward compatibility.
-AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
-
-# Copyright (C) 2001-2014 Free Software Foundation, Inc.
-#
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# AM_RUN_LOG(COMMAND)
-# -------------------
-# Run COMMAND, save the exit status in ac_status, and log it.
-# (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
-AC_DEFUN([AM_RUN_LOG],
-[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
-   ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
-   ac_status=$?
-   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
-   (exit $ac_status); }])
-
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-

 # Copyright (C) 1996-2014 Free Software Foundation, Inc.
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -77,7 +77,7 @@ int main (int argc, char ** argv)
  WilsonMatrix Dw(Umu,mass);
  
  std::cout << "Calling Dw"<<std::endl;
-  int ncall=10000;
+  int ncall=1000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.multiply(src,result);
--- a/1004
+++ b/1004
--- a/1
+++ b/1
@@ -1,3 +1,4 @@
+CXX=clang-omp++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  --enable-comms=mpi
 CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx"  --enable-comms=mpi
 CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -g -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx"  --enable-comms=mpi
 CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -g  -std=c++11" LDFLAGS= LIBS=-lmpi  --enable-comms=fake
--- a/configure.ac
+++ b/configure.ac
@@ -6,6 +6,7 @@ AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Grid_config.h])

 # Checks for programs.
+AC_LANG(C++)
 AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -24,21 +24,21 @@
 #include <stdio.h>
 #include <signal.h>

+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#define MIN(x,y) ((x)>(y)?(y):(x))
+#endif
+
 #include <Grid_config.h>

 ////////////////////////////////////////////////////////////
 // Tunable header includes
 ////////////////////////////////////////////////////////////

-#ifdef HAVE_OPENMP
-#define OMP
-#include <omp.h>
-#endif

 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
-
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
--- a/lib/Grid_stencil.h
+++ b/lib/Grid_stencil.h
@@ -250,7 +250,11 @@ namespace Grid {
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 	  int words = sizeof(cobj)/sizeof(vector_type);

-	  /* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
+	  /*
+	   * possibly slow to allocate
+	   * Doesn't matter in this test, but may want to preallocate in the 
+	   * dirac operators
+	   */
 	  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) ); 
 	  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 	  int bytes = buffer_size*sizeof(scalar_object);
--- a/lib/Grid_threads.h
+++ b/lib/Grid_threads.h
@@ -1,13 +1,17 @@
 #ifndef GRID_THREADS_H
 #define GRID_THREADS_H

-#ifdef HAVE_OPENMP
+#ifdef _OPENMP
+#define GRID_OMP
+#endif
+
+#ifdef GRID_OMP
 #include <omp.h>
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
-#define PARALLEL_NESTED_LOOP(n) _Pragma("omp parallel for collapse(" #n ")")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #else
 #define PARALLEL_FOR_LOOP 
-#define PARALLEL_NESTED_LOOP(n) 
+#define PARALLEL_NESTED_LOOP2
 #endif

 namespace Grid {
@@ -20,7 +24,7 @@ class GridThread {
  static int _threads;

  static void SetThreads(int thr) { 
-#ifdef HAVE_OPENMP
+#ifdef GRID_OMP
    _threads = MIN(thr,omp_get_max_threads()) ;
    omp_set_num_threads(_threads);
 #else 
@@ -28,7 +32,7 @@ class GridThread {
 #endif
  };
  static void SetMaxThreads(void) { 
-#ifdef HAVE_OPENMP
+#ifdef GRID_OMP
    _threads = omp_get_max_threads();
    omp_set_num_threads(_threads);
 #else 
@@ -58,7 +62,7 @@ class GridThread {
  };

  static int  ThreadBarrier(void) {
-#ifdef HAVE_OPENMP
+#ifdef GRID_OMP
 #pragma omp barrier
    return omp_get_thread_num();
 #else
--- a/lib/cshift/Grid_cshift_common.h
+++ b/lib/cshift/Grid_cshift_common.h
@@ -26,16 +26,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  }

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int bo  = 0;                                      // offset in buffer
  
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-      int o = n*rhs._grid->_slice_stride[dimension];
+      int o  = n*rhs._grid->_slice_stride[dimension];
+      int bo = n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb &cbmask ) {
-	buffer[bo]=compress(rhs._odata[so+o+b]);
-	bo++;
+	buffer[bo+b]=compress(rhs._odata[so+o+b]);
      }
    }
  }
@@ -55,9 +54,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
  }

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int bo  = 0;                                      // offset in buffer
    
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

@@ -104,15 +102,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
  }

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int bo  = 0;                                      // offset in buffer
    
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-      int o=n*rhs._grid->_slice_stride[dimension];
+      int o   =n*rhs._grid->_slice_stride[dimension];
+      int bo  =n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb & cbmask ) {
-	rhs._odata[so+o+b]=buffer[bo++];
+	rhs._odata[so+o+b]=buffer[bo+b];
      }
    }
  }
@@ -131,7 +129,7 @@ PARALLEL_NESTED_LOOP(2)

  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
    
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

@@ -160,7 +158,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
  
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
      
@@ -185,7 +183,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
  
-PARALLEL_NESTED_LOOP(2)
+PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
      int o =n*rhs._grid->_slice_stride[dimension];
--- a/lib/cshift/Grid_cshift_mpi.h
+++ b/lib/cshift/Grid_cshift_mpi.h
@@ -1,10 +1,6 @@
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_

-#ifndef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif

 namespace Grid { 

--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -100,22 +100,21 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
  vHalfSpinColourVector  chi;    
  vSpinColourVector result;
  vHalfSpinColourVector Uchi;
-  vHalfSpinColourVector *chi_p;
  int offset,local,perm, ptype;

 PARALLEL_FOR_LOOP
  for(int sss=0;sss<grid->oSites();sss++){

    int ss = sss;
-    int ssu= sss;
-    //int ss = Stencil._LebesgueReorder[sss];
+    int ssu= ss;
+    //    int ss = Stencil._LebesgueReorder[sss];

    // Xp
    offset = Stencil._offsets [Xp][ss];
    local  = Stencil._is_local[Xp][ss];
    perm   = Stencil._permute[Xp][ss];
    ptype  = Stencil._permute_type[Xp];
-    chi_p  = &comm_buf[offset];
+
    if ( local && perm ) 
    {
      spProjXp(tmp,in._odata[offset]);