mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

OMP collapse changes to make NVCC happy

paboyle 2018-01-28 01:21:53 +00:00
parent b6ebf35af5
commit e657f9a344
4 changed files with 25 additions and 26 deletions
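The macro being retired built its OpenMP pragma by stringising the collapse depth: `_Pragma("omp parallel for collapse(" #n ")")`. After expansion that operand is three adjacent string literals, whereas ISO C/C++ defines _Pragma for a single string literal only; literal concatenation happens in a later translation phase, so whether the spliced form is accepted comes down to compiler tolerance, which NVCC's front end evidently does not extend. Side by side, condensed from the macro diff at the bottom of this commit:

    // Old: collapse depth spliced in by stringisation.
    // thread_loop_collapse(2, ...) leaves
    //   _Pragma("omp parallel for collapse(" "2" ")")
    // i.e. three adjacent literals where _Pragma wants exactly one.
    #define thread_loop_collapse( n, range , ... ) \
      _Pragma("omp parallel for collapse(" #n ")") for range { __VA_ARGS__ };

    // New: one macro per depth, each carrying a single-literal pragma.
    #define thread_loop_collapse2( range , ... ) \
      _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ };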

File 1 of 4

@@ -48,7 +48,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
   int stride=rhs.Grid()->_slice_stride[dimension];
   if ( cbmask == 0x3 ) {
-    thread_loop_collapse( 2, (int n=0;n<e1;n++) ,
+    thread_loop_collapse2( (int n=0;n<e1;n++) ,
     for(int b=0;b<e2;b++){
       int o = n*stride;
       int bo = n*e2;
@@ -92,7 +92,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
   int n1=rhs.Grid()->_slice_stride[dimension];
   if ( cbmask ==0x3){
-    thread_loop_collapse( 2, (int n=0;n<e1;n++), {
+    thread_loop_collapse2( (int n=0;n<e1;n++), {
     for(int b=0;b<e2;b++){
       int o = n*n1;
@@ -108,7 +108,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
     // Case of SIMD split AND checker dim cannot currently be hit, except in
     // Test_cshift_red_black code.
     std::cout << " Dense packed buffer WARNING " <<std::endl;
-    thread_loop_collapse( 2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
     for(int b=0;b<e2;b++){
       int o=n*n1;
@@ -142,7 +142,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int stride=rhs.Grid()->_slice_stride[dimension];
   if ( cbmask ==0x3 ) {
-    thread_loop_collapse( 2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
     for(int b=0;b<e2;b++){
       int o =n*rhs.Grid()->_slice_stride[dimension];
       int bo =n*rhs.Grid()->_slice_block[dimension];
@@ -184,7 +184,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
   int e2=rhs.Grid()->_slice_block[dimension];
   if(cbmask ==0x3 ) {
-    thread_loop_collapse(2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
     for(int b=0;b<e2;b++){
       int o = n*rhs.Grid()->_slice_stride[dimension];
       int offset = b+n*rhs.Grid()->_slice_block[dimension];
@@ -228,7 +228,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
   if(cbmask == 0x3 ){
-    thread_loop_collapse( 2,(int n=0;n<e1;n++),{
+    thread_loop_collapse2((int n=0;n<e1;n++),{
     for(int b=0;b<e2;b++){
       int o =n*stride+b;
       //lhs[lo+o]=rhs[ro+o];
@@ -236,7 +236,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
     }
   });
   } else {
-    thread_loop_collapse(2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
     for(int b=0;b<e2;b++){
       int o =n*stride+b;
       int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
@@ -266,7 +266,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs.Grid()->_slice_block [dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
-  thread_loop_collapse(2, (int n=0;n<e1;n++),{
+  thread_loop_collapse2( (int n=0;n<e1;n++),{
   for(int b=0;b<e2;b++){
     int o =n*stride;
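Every hunk in this file is the same mechanical substitution: the variadic `thread_loop_collapse( 2, ... )` call becomes the fixed-arity `thread_loop_collapse2( ... )`, with the loop nest untouched. A standalone sketch of the collapse(2) pattern these call sites expand to under OpenMP, with flat std::vector buffers standing in for Grid's Lattice and commVector types (gather_plane_sketch is a hypothetical name):

    #include <vector>

    // Copy e1 slices of e2 contiguous elements from rhs into a packed buffer.
    // The two loops form a rectangular, perfectly nested pair, which is what
    // collapse(2) requires.
    void gather_plane_sketch(std::vector<double> &buffer,
                             const std::vector<double> &rhs,
                             int e1, int e2, int stride) {
    #pragma omp parallel for collapse(2)
      for (int n = 0; n < e1; n++) {
        for (int b = 0; b < e2; b++) {
          int o  = n * stride; // offset of slice n in the source
          int bo = n * e2;     // offset of slice n in the packed buffer
          buffer[bo + b] = rhs[o + b];
        }
      }
    }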

File 2 of 4

@@ -51,12 +51,11 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   int block =FullGrid->_slice_block [Orthog];
   int nblock=FullGrid->_slice_nblock[Orthog];
   int ostride=FullGrid->_ostride[Orthog];
-#pragma omp parallel
+  thread_region
   {
     std::vector<vobj> s_x(Nblock);
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
+    thread_loop_collapse2( (int n=0;n<nblock;n++),{
     for(int b=0;b<block;b++){
       int o = n*stride + b;
@@ -72,7 +71,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
       }
       R[o+i*ostride]=dot;
     }
-  }}
+  }});
   }
 };
@@ -101,12 +100,11 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   int block =FullGrid->_slice_block [Orthog];
   int nblock=FullGrid->_slice_nblock[Orthog];
   int ostride=FullGrid->_ostride[Orthog];
-#pragma omp parallel
+  thread_region
   {
     std::vector<vobj> s_x(Nblock);
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
+    thread_loop_collapse2( (int n=0;n<nblock;n++),{
     for(int b=0;b<block;b++){
       int o = n*stride + b;
@@ -122,7 +120,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
       }
       R[o+i*ostride]=dot;
     }
-  }}
+  }});
   }
 };
@@ -159,14 +157,12 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
   typedef typename vobj::vector_typeD vector_typeD;
-#pragma omp parallel
-  {
+  thread_region {
     std::vector<vobj> Left(Nblock);
     std::vector<vobj> Right(Nblock);
     Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
+    thread_loop_collapse2((int n=0;n<nblock;n++),{
     for(int b=0;b<block;b++){
       int o = n*stride + b;
@@ -182,9 +178,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
       auto rtmp = TensorRemove(tmp);
       mat_thread(i,j) += Reduce(rtmp);
     }}
-  }}
-#pragma omp critical
-  {
+  }});
+  thread_critical {
     mat += mat_thread;
   }
 }
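These hunks move sliceMaddMatrix, sliceMulMatrix and sliceInnerProductMatrix from raw OpenMP pragmas onto the same macro layer. The reduction in sliceInnerProductMatrix keeps its shape: each thread fills a private accumulator inside the parallel region, then merges it under a critical section. A standalone sketch of that pattern in plain OpenMP, with a double accumulator in place of the Eigen matrix and hypothetical loop bounds:

    #include <vector>

    // Per-thread accumulation over a collapse(2) iteration space, merged
    // under a critical section (the role thread_critical plays above).
    double block_sum_sketch(const std::vector<double> &data,
                            int nblock, int block) {
      double total = 0.0;
    #pragma omp parallel
      {
        double partial = 0.0;          // per-thread accumulator (mat_thread's role)
    #pragma omp for collapse(2)
        for (int n = 0; n < nblock; n++) {
          for (int b = 0; b < block; b++) {
            partial += data[n * block + b];
          }
        }
    #pragma omp critical
        { total += partial; }          // the mat += mat_thread merge
      }
      return total;
    }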

File 3 of 4

@@ -358,7 +358,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
   tensor_reduced at; at=av;
-  thread_loop_collapse(2, (int n=0;n<e1;n++),{
+  thread_loop_collapse2( (int n=0;n<e1;n++),{
   for(int b=0;b<e2;b++){
     int ss= so+n*stride+b;
     R[ss] = at*X[ss]+Y[ss];
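The same one-line substitution in sliceMaddVector; the body is an axpy over the sites of one slice. A flat-array sketch under the same stand-in assumptions as above (names hypothetical, `at` reduced to a plain scalar):

    #include <vector>

    // Hypothetical flat-array version of the R = a*X + Y slice update.
    void slice_maddv_sketch(std::vector<double> &R, double at,
                            const std::vector<double> &X,
                            const std::vector<double> &Y,
                            int so, int e1, int e2, int stride) {
    #pragma omp parallel for collapse(2)
      for (int n = 0; n < e1; n++) {
        for (int b = 0; b < e2; b++) {
          int ss = so + n * stride + b;  // site index inside the slice
          R[ss] = at * X[ss] + Y[ss];
        }
      }
    }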

File 4 of 4

@@ -52,7 +52,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_OMP
 #define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
 #define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; };
-#define thread_loop_collapse( n, range , ... ) _Pragma("omp parallel for collapse(" #n ")") for range { __VA_ARGS__ };
+#define thread_loop_collapse2( range , ... ) _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ };
+#define thread_loop_collapse3( range , ... ) _Pragma("omp parallel for collapse(3)") for range { __VA_ARGS__ };
+#define thread_loop_collapse4( range , ... ) _Pragma("omp parallel for collapse(4)") for range { __VA_ARGS__ };
 #define thread_region _Pragma("omp parallel")
 #define thread_critical _Pragma("omp critical")
 #define thread_num(a) omp_get_thread_num()
@@ -60,7 +62,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
 #define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; };
-#define thread_loop_collapse( n, range , ... ) for range { __VA_ARGS__ ; };
+#define thread_loop_collapse2( range , ... ) for range { __VA_ARGS__ ; };
+#define thread_loop_collapse3( range , ... ) for range { __VA_ARGS__ ; };
+#define thread_loop_collapse4( range , ... ) for range { __VA_ARGS__ ; };
 #define thread_region
 #define thread_critical
 #define thread_num(a) (0)
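A usage sketch of the new fixed-arity macro, assuming the call-site shape seen in the first file's hunks: the loop header is the first argument, the (perfectly nested) body the rest; `buffer`, `rhs`, `e1`, `e2` and `stride` are hypothetical here:

    thread_loop_collapse2( (int n=0;n<e1;n++), {
      for(int b=0;b<e2;b++){
        buffer[n*e2+b] = rhs[n*stride+b];
      }
    });
    // Under GRID_OMP this expands to a single-literal pragma plus the nest:
    //   _Pragma("omp parallel for collapse(2)")
    //   for (int n=0;n<e1;n++) { { for(int b=0;b<e2;b++){ ... } } };
    // and, without GRID_OMP, to the bare loop nest.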