1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

I have made the Cshift work successfully with OpenMP threading in

every routine. Collapse(2) is now working under clang-omp++.
This commit is contained in:
Peter Boyle
2015-05-13 00:31:00 +01:00
parent 6cec662ac5
commit 48f425d31c
13 changed files with 166 additions and 1006 deletions

View File

@ -24,21 +24,21 @@
#include <stdio.h>
#include <signal.h>
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
#include <Grid_config.h>
////////////////////////////////////////////////////////////
// Tunable header includes
////////////////////////////////////////////////////////////
#ifdef HAVE_OPENMP
#define OMP
#include <omp.h>
#endif
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif

View File

@ -250,7 +250,11 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(cobj)/sizeof(vector_type);
/* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
/*
* possibly slow to allocate
* Doesn't matter in this test, but may want to preallocate in the
* dirac operators
*/
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);

View File

@ -1,13 +1,17 @@
#ifndef GRID_THREADS_H
#define GRID_THREADS_H
#ifdef HAVE_OPENMP
#ifdef _OPENMP
#define GRID_OMP
#endif
#ifdef GRID_OMP
#include <omp.h>
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
#define PARALLEL_NESTED_LOOP(n) _Pragma("omp parallel for collapse(" #n ")")
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#else
#define PARALLEL_FOR_LOOP
#define PARALLEL_NESTED_LOOP(n)
#define PARALLEL_NESTED_LOOP2
#endif
namespace Grid {
@ -20,7 +24,7 @@ class GridThread {
static int _threads;
static void SetThreads(int thr) {
#ifdef HAVE_OPENMP
#ifdef GRID_OMP
_threads = MIN(thr,omp_get_max_threads()) ;
omp_set_num_threads(_threads);
#else
@ -28,7 +32,7 @@ class GridThread {
#endif
};
static void SetMaxThreads(void) {
#ifdef HAVE_OPENMP
#ifdef GRID_OMP
_threads = omp_get_max_threads();
omp_set_num_threads(_threads);
#else
@ -58,7 +62,7 @@ class GridThread {
};
static int ThreadBarrier(void) {
#ifdef HAVE_OPENMP
#ifdef GRID_OMP
#pragma omp barrier
return omp_get_thread_num();
#else

View File

@ -26,16 +26,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int bo = 0; // offset in buffer
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension];
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=compress(rhs._odata[so+o+b]);
bo++;
buffer[bo+b]=compress(rhs._odata[so+o+b]);
}
}
}
@ -55,9 +54,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int bo = 0; // offset in buffer
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
@ -104,15 +102,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int bo = 0; // offset in buffer
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o=n*rhs._grid->_slice_stride[dimension];
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
rhs._odata[so+o+b]=buffer[bo+b];
}
}
}
@ -131,7 +129,7 @@ PARALLEL_NESTED_LOOP(2)
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
@ -160,7 +158,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
@ -185,7 +183,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP(2)
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o =n*rhs._grid->_slice_stride[dimension];

View File

@ -1,10 +1,6 @@
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
namespace Grid {

View File

@ -100,22 +100,21 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
vHalfSpinColourVector chi;
vSpinColourVector result;
vHalfSpinColourVector Uchi;
vHalfSpinColourVector *chi_p;
int offset,local,perm, ptype;
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
int ss = sss;
int ssu= sss;
//int ss = Stencil._LebesgueReorder[sss];
int ssu= ss;
// int ss = Stencil._LebesgueReorder[sss];
// Xp
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[Xp];
chi_p = &comm_buf[offset];
if ( local && perm )
{
spProjXp(tmp,in._odata[offset]);