1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

I have made the Cshift work successfully with open mp threading in

every routine. Collapse(2) is now working under clang-omp++.
This commit is contained in:
Peter Boyle 2015-05-13 00:31:00 +01:00
parent 52174da232
commit b4a570477c
13 changed files with 166 additions and 1006 deletions

View File

@ -205,12 +205,9 @@ AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CXX = @CXX@
CXXCPP = @CXXCPP@
CXXDEPMODE = @CXXDEPMODE@
CXXFLAGS = @CXXFLAGS@
CYGPATH_W = @CYGPATH_W@
@ -234,7 +231,7 @@ LTLIBOBJS = @LTLIBOBJS@
MAKEINFO = @MAKEINFO@
MKDIR_P = @MKDIR_P@
OBJEXT = @OBJEXT@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OPENMP_CXXFLAGS = @OPENMP_CXXFLAGS@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
@ -252,7 +249,6 @@ abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_CC = @ac_ct_CC@
ac_ct_CXX = @ac_ct_CXX@
am__include = @am__include@
am__leading_dot = @am__leading_dot@

21
TODO
View File

@ -2,10 +2,7 @@
*** Hacks and bug fixes to clean up and Audits
================================================================
* Base class to share common code between vRealF, VComplexF etc...
* Performance check on Guido's reimplementation strategy
* Bug in SeedFixedIntegers gives same output on each site. -- Think I fixed but NOT checked for sure
- Performance check on Guido's reimplementation strategy
* FIXME audit
* const audit
@ -20,15 +17,16 @@
*** New Functionality
================================================================
* Implement where to take template scheme.
* Implement where within expression template scheme.
* - BinaryWriter, TextWriter etc...
- use protocol buffers? replace xmlReader/Writer ec..
- Binary use htonll, htonl
* Expression template engine:
* Bug in SeedFixedIntegers gave same output on each site. -- Think I fixed but NOT checked for sure
Implement and use lattice IO to verify this.
-- Audit
* Expression template engine: -- DONE
-- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>
* CovariantShift support -----Use a class to store gauge field? (parallel transport?)
@ -56,8 +54,9 @@
* TaProj
* FFTnD ?
* Parallel MPI2 IO
Plaquette checks into nersc reader.
* Parallel io improvements
- optional parallel MPI2 IO
- move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
* rb4d support for 5th dimension in Mobius.
@ -70,13 +69,15 @@
// copyMask.
// localMaxAbs
// Fourier transform equivalent.
Actions
Actions -- coherent framework for implementing actions and their forces.
* Fermion
- Wilson
- Clover
- DomainWall
- Mobius
- z-Mobius
* Gauge
- Wilson, symanzik, iwasaki

64
aclocal.m4 vendored
View File

@ -765,70 +765,6 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# _AM_PROG_CC_C_O
# ---------------
# Like AC_PROG_CC_C_O, but changed for automake. We rewrite AC_PROG_CC
# to automatically call this.
AC_DEFUN([_AM_PROG_CC_C_O],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
AC_REQUIRE_AUX_FILE([compile])dnl
AC_LANG_PUSH([C])dnl
AC_CACHE_CHECK(
[whether $CC understands -c and -o together],
[am_cv_prog_cc_c_o],
[AC_LANG_CONFTEST([AC_LANG_PROGRAM([])])
# Make sure it works both with $CC and with simple cc.
# Following AC_PROG_CC_C_O, we do the test twice because some
# compilers refuse to overwrite an existing .o file with -o,
# though they will create one.
am_cv_prog_cc_c_o=yes
for am_i in 1 2; do
if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \
&& test -f conftest2.$ac_objext; then
: OK
else
am_cv_prog_cc_c_o=no
break
fi
done
rm -f core conftest*
unset am_i])
if test "$am_cv_prog_cc_c_o" != yes; then
# Losing compiler, so override with the script.
# FIXME: It is wrong to rewrite CC.
# But if we don't then we get into trouble of one sort or another.
# A longer-term fix would be to have automake use am__CC in this case,
# and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
CC="$am_aux_dir/compile $CC"
fi
AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# AM_RUN_LOG(COMMAND)
# -------------------
# Run COMMAND, save the exit status in ac_status, and log it.
# (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
AC_DEFUN([AM_RUN_LOG],
[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
(exit $ac_status); }])
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.

View File

@ -77,7 +77,7 @@ int main (int argc, char ** argv)
WilsonMatrix Dw(Umu,mass);
std::cout << "Calling Dw"<<std::endl;
int ncall=10000;
int ncall=1000;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.multiply(src,result);

1004
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,4 @@
CXX=clang-omp++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" --enable-comms=mpi
CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx" --enable-comms=mpi
CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -g -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx" --enable-comms=mpi
CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -g -std=c++11" LDFLAGS= LIBS=-lmpi --enable-comms=fake

View File

@ -6,6 +6,7 @@ AC_CONFIG_SRCDIR([lib/Grid.h])
AC_CONFIG_HEADERS([lib/Grid_config.h])
# Checks for programs.
AC_LANG(C++)
AC_PROG_CXX
AC_OPENMP
AC_PROG_RANLIB

View File

@ -24,21 +24,21 @@
#include <stdio.h>
#include <signal.h>
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
#include <Grid_config.h>
////////////////////////////////////////////////////////////
// Tunable header includes
////////////////////////////////////////////////////////////
#ifdef HAVE_OPENMP
#define OMP
#include <omp.h>
#endif
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif

View File

@ -250,7 +250,11 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(cobj)/sizeof(vector_type);
/* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
/*
* possibly slow to allocate
* Doesn't matter in this test, but may want to preallocate in the
* dirac operators
*/
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);

View File

@ -1,13 +1,17 @@
#ifndef GRID_THREADS_H
#define GRID_THREADS_H
#ifdef HAVE_OPENMP
#ifdef _OPENMP
#define GRID_OMP
#endif
#ifdef GRID_OMP
#include <omp.h>
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
#define PARALLEL_NESTED_LOOP(n) _Pragma("omp parallel for collapse(" #n ")")
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#else
#define PARALLEL_FOR_LOOP
#define PARALLEL_NESTED_LOOP(n)
#define PARALLEL_NESTED_LOOP2
#endif
namespace Grid {
@ -20,7 +24,7 @@ class GridThread {
static int _threads;
static void SetThreads(int thr) {
#ifdef HAVE_OPENMP
#ifdef GRID_OMP
_threads = MIN(thr,omp_get_max_threads()) ;
omp_set_num_threads(_threads);
#else
@ -28,7 +32,7 @@ class GridThread {
#endif
};
static void SetMaxThreads(void) {
#ifdef HAVE_OPENMP
#ifdef GRID_OMP
_threads = omp_get_max_threads();
omp_set_num_threads(_threads);
#else
@ -58,7 +62,7 @@ class GridThread {
};
static int ThreadBarrier(void) {
#ifdef HAVE_OPENMP
#ifdef GRID_OMP
#pragma omp barrier
return omp_get_thread_num();
#else

View File

@ -26,16 +26,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int bo = 0; // offset in buffer
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=compress(rhs._odata[so+o+b]);
bo++;
buffer[bo+b]=compress(rhs._odata[so+o+b]);
}
}
}
@ -55,9 +54,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int bo = 0; // offset in buffer
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
@ -104,15 +102,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int bo = 0; // offset in buffer
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o=n*rhs._grid->_slice_stride[dimension];
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
rhs._odata[so+o+b]=buffer[bo+b];
}
}
}
@ -131,7 +129,7 @@ PARALLEL_NESTED_LOOP(2)
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
@ -160,7 +158,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
@ -185,7 +183,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o =n*rhs._grid->_slice_stride[dimension];

View File

@ -1,10 +1,6 @@
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
namespace Grid {

View File

@ -100,22 +100,21 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
vHalfSpinColourVector chi;
vSpinColourVector result;
vHalfSpinColourVector Uchi;
vHalfSpinColourVector *chi_p;
int offset,local,perm, ptype;
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
int ss = sss;
int ssu= sss;
//int ss = Stencil._LebesgueReorder[sss];
int ssu= ss;
// int ss = Stencil._LebesgueReorder[sss];
// Xp
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[Xp];
chi_p = &comm_buf[offset];
if ( local && perm )
{
spProjXp(tmp,in._odata[offset]);