mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
I have made the Cshift work successfully with OpenMP threading in
every routine. Collapse(2) is now working under clang-omp++.
This commit is contained in:
parent
6cec662ac5
commit
48f425d31c
@ -205,12 +205,9 @@ AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CXX = @CXX@
|
||||
CXXCPP = @CXXCPP@
|
||||
CXXDEPMODE = @CXXDEPMODE@
|
||||
CXXFLAGS = @CXXFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
@ -234,7 +231,7 @@ LTLIBOBJS = @LTLIBOBJS@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
OBJEXT = @OBJEXT@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OPENMP_CXXFLAGS = @OPENMP_CXXFLAGS@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
@ -252,7 +249,6 @@ abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_CXX = @ac_ct_CXX@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
|
23
TODO
23
TODO
@ -2,10 +2,7 @@
|
||||
*** Hacks and bug fixes to clean up and Audits
|
||||
================================================================
|
||||
* Base class to share common code between vRealF, VComplexF etc...
|
||||
|
||||
* Performance check on Guido's reimplementation strategy
|
||||
|
||||
* Bug in SeedFixedIntegers gives same output on each site. -- Think I fixed but NOT checked for sure
|
||||
- Performance check on Guido's reimplementation strategy
|
||||
|
||||
* FIXME audit
|
||||
* const audit
|
||||
@ -20,15 +17,16 @@
|
||||
*** New Functionality
|
||||
================================================================
|
||||
|
||||
* Implement where to take template scheme.
|
||||
* Implement where within expression template scheme.
|
||||
|
||||
* - BinaryWriter, TextWriter etc...
|
||||
- use protocol buffers? replace xmlReader/Writer etc.
|
||||
- Binary use htonll, htonl
|
||||
|
||||
* Expression template engine:
|
||||
* Bug in SeedFixedIntegers gave same output on each site. -- Think I fixed but NOT checked for sure
|
||||
Implement and use lattice IO to verify this.
|
||||
|
||||
-- Audit
|
||||
* Expression template engine: -- DONE
|
||||
-- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>
|
||||
|
||||
* CovariantShift support -----Use a class to store gauge field? (parallel transport?)
|
||||
@ -56,9 +54,10 @@
|
||||
* TaProj
|
||||
* FFTnD ?
|
||||
|
||||
* Parallel MPI2 IO
|
||||
Plaquette checks into nersc reader.
|
||||
|
||||
* Parallel io improvements
|
||||
- optional parallel MPI2 IO
|
||||
- move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
|
||||
|
||||
* rb4d support for 5th dimension in Mobius.
|
||||
|
||||
* Check for missing functionality - partially audited against QDP++ layout
|
||||
@ -70,13 +69,15 @@
|
||||
// copyMask.
|
||||
// localMaxAbs
|
||||
// Fourier transform equivalent.
|
||||
Actions
|
||||
|
||||
Actions -- coherent framework for implementing actions and their forces.
|
||||
* Fermion
|
||||
- Wilson
|
||||
- Clover
|
||||
- DomainWall
|
||||
- Mobius
|
||||
- z-Mobius
|
||||
|
||||
* Gauge
|
||||
- Wilson, Symanzik, Iwasaki
|
||||
|
||||
|
64
aclocal.m4
vendored
64
aclocal.m4
vendored
@ -765,70 +765,6 @@ AC_DEFUN([_AM_SET_OPTIONS],
|
||||
AC_DEFUN([_AM_IF_OPTION],
|
||||
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
||||
|
||||
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# _AM_PROG_CC_C_O
|
||||
# ---------------
|
||||
# Like AC_PROG_CC_C_O, but changed for automake. We rewrite AC_PROG_CC
|
||||
# to automatically call this.
|
||||
AC_DEFUN([_AM_PROG_CC_C_O],
|
||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
||||
AC_REQUIRE_AUX_FILE([compile])dnl
|
||||
AC_LANG_PUSH([C])dnl
|
||||
AC_CACHE_CHECK(
|
||||
[whether $CC understands -c and -o together],
|
||||
[am_cv_prog_cc_c_o],
|
||||
[AC_LANG_CONFTEST([AC_LANG_PROGRAM([])])
|
||||
# Make sure it works both with $CC and with simple cc.
|
||||
# Following AC_PROG_CC_C_O, we do the test twice because some
|
||||
# compilers refuse to overwrite an existing .o file with -o,
|
||||
# though they will create one.
|
||||
am_cv_prog_cc_c_o=yes
|
||||
for am_i in 1 2; do
|
||||
if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \
|
||||
&& test -f conftest2.$ac_objext; then
|
||||
: OK
|
||||
else
|
||||
am_cv_prog_cc_c_o=no
|
||||
break
|
||||
fi
|
||||
done
|
||||
rm -f core conftest*
|
||||
unset am_i])
|
||||
if test "$am_cv_prog_cc_c_o" != yes; then
|
||||
# Losing compiler, so override with the script.
|
||||
# FIXME: It is wrong to rewrite CC.
|
||||
# But if we don't then we get into trouble of one sort or another.
|
||||
# A longer-term fix would be to have automake use am__CC in this case,
|
||||
# and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
|
||||
CC="$am_aux_dir/compile $CC"
|
||||
fi
|
||||
AC_LANG_POP([C])])
|
||||
|
||||
# For backward compatibility.
|
||||
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
||||
|
||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# AM_RUN_LOG(COMMAND)
|
||||
# -------------------
|
||||
# Run COMMAND, save the exit status in ac_status, and log it.
|
||||
# (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
|
||||
AC_DEFUN([AM_RUN_LOG],
|
||||
[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
|
||||
($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
|
||||
ac_status=$?
|
||||
echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
|
||||
(exit $ac_status); }])
|
||||
|
||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
|
||||
|
@ -77,7 +77,7 @@ int main (int argc, char ** argv)
|
||||
WilsonMatrix Dw(Umu,mass);
|
||||
|
||||
std::cout << "Calling Dw"<<std::endl;
|
||||
int ncall=10000;
|
||||
int ncall=1000;
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.multiply(src,result);
|
||||
|
@ -1,3 +1,4 @@
|
||||
CXX=clang-omp++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" --enable-comms=mpi
|
||||
CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx" --enable-comms=mpi
|
||||
CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -g -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx" --enable-comms=mpi
|
||||
CXX=clang++ ./configure --enable-simd=AVX CXXFLAGS="-mavx -g -std=c++11" LDFLAGS= LIBS=-lmpi --enable-comms=fake
|
||||
|
@ -6,6 +6,7 @@ AC_CONFIG_SRCDIR([lib/Grid.h])
|
||||
AC_CONFIG_HEADERS([lib/Grid_config.h])
|
||||
|
||||
# Checks for programs.
|
||||
AC_LANG(C++)
|
||||
AC_PROG_CXX
|
||||
AC_OPENMP
|
||||
AC_PROG_RANLIB
|
||||
|
10
lib/Grid.h
10
lib/Grid.h
@ -24,21 +24,21 @@
|
||||
#include <stdio.h>
|
||||
#include <signal.h>
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
#include <Grid_config.h>
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Tunable header includes
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef HAVE_OPENMP
|
||||
#define OMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_MALLOC_MALLOC_H
|
||||
#include <malloc/malloc.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_MALLOC_H
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
@ -250,7 +250,11 @@ namespace Grid {
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
int words = sizeof(cobj)/sizeof(vector_type);
|
||||
|
||||
/* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
|
||||
/*
|
||||
* possibly slow to allocate
|
||||
* Doesn't matter in this test, but may want to preallocate in the
|
||||
* dirac operators
|
||||
*/
|
||||
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
@ -1,13 +1,17 @@
|
||||
#ifndef GRID_THREADS_H
|
||||
#define GRID_THREADS_H
|
||||
|
||||
#ifdef HAVE_OPENMP
|
||||
#ifdef _OPENMP
|
||||
#define GRID_OMP
|
||||
#endif
|
||||
|
||||
#ifdef GRID_OMP
|
||||
#include <omp.h>
|
||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
|
||||
#define PARALLEL_NESTED_LOOP(n) _Pragma("omp parallel for collapse(" #n ")")
|
||||
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
||||
#else
|
||||
#define PARALLEL_FOR_LOOP
|
||||
#define PARALLEL_NESTED_LOOP(n)
|
||||
#define PARALLEL_NESTED_LOOP2
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
@ -20,7 +24,7 @@ class GridThread {
|
||||
static int _threads;
|
||||
|
||||
static void SetThreads(int thr) {
|
||||
#ifdef HAVE_OPENMP
|
||||
#ifdef GRID_OMP
|
||||
_threads = MIN(thr,omp_get_max_threads()) ;
|
||||
omp_set_num_threads(_threads);
|
||||
#else
|
||||
@ -28,7 +32,7 @@ class GridThread {
|
||||
#endif
|
||||
};
|
||||
static void SetMaxThreads(void) {
|
||||
#ifdef HAVE_OPENMP
|
||||
#ifdef GRID_OMP
|
||||
_threads = omp_get_max_threads();
|
||||
omp_set_num_threads(_threads);
|
||||
#else
|
||||
@ -58,7 +62,7 @@ class GridThread {
|
||||
};
|
||||
|
||||
static int ThreadBarrier(void) {
|
||||
#ifdef HAVE_OPENMP
|
||||
#ifdef GRID_OMP
|
||||
#pragma omp barrier
|
||||
return omp_get_thread_num();
|
||||
#else
|
||||
|
@ -26,16 +26,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
PARALLEL_NESTED_LOOP(2)
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int bo = n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[bo]=compress(rhs._odata[so+o+b]);
|
||||
bo++;
|
||||
buffer[bo+b]=compress(rhs._odata[so+o+b]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -55,9 +54,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
PARALLEL_NESTED_LOOP(2)
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
@ -104,15 +102,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
PARALLEL_NESTED_LOOP(2)
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
int o=n*rhs._grid->_slice_stride[dimension];
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
int bo =n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
rhs._odata[so+o+b]=buffer[bo+b];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -131,7 +129,7 @@ PARALLEL_NESTED_LOOP(2)
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
PARALLEL_NESTED_LOOP(2)
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
@ -160,7 +158,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
PARALLEL_NESTED_LOOP(2)
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
@ -185,7 +183,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
PARALLEL_NESTED_LOOP(2)
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
|
@ -1,10 +1,6 @@
|
||||
#ifndef _GRID_CSHIFT_MPI_H_
|
||||
#define _GRID_CSHIFT_MPI_H_
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
@ -100,22 +100,21 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
vHalfSpinColourVector chi;
|
||||
vSpinColourVector result;
|
||||
vHalfSpinColourVector Uchi;
|
||||
vHalfSpinColourVector *chi_p;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<grid->oSites();sss++){
|
||||
|
||||
int ss = sss;
|
||||
int ssu= sss;
|
||||
//int ss = Stencil._LebesgueReorder[sss];
|
||||
int ssu= ss;
|
||||
// int ss = Stencil._LebesgueReorder[sss];
|
||||
|
||||
// Xp
|
||||
offset = Stencil._offsets [Xp][ss];
|
||||
local = Stencil._is_local[Xp][ss];
|
||||
perm = Stencil._permute[Xp][ss];
|
||||
ptype = Stencil._permute_type[Xp];
|
||||
chi_p = &comm_buf[offset];
|
||||
|
||||
if ( local && perm )
|
||||
{
|
||||
spProjXp(tmp,in._odata[offset]);
|
||||
|
Loading…
Reference in New Issue
Block a user