mirror of https://github.com/paboyle/Grid.git
	OMP collapse changes to make NVCC happy
@@ -48,7 +48,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
 
   int stride=rhs.Grid()->_slice_stride[dimension];
   if ( cbmask == 0x3 ) { 
-    thread_loop_collapse( 2, (int n=0;n<e1;n++) , 
+    thread_loop_collapse2( (int n=0;n<e1;n++) , 
       for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
@@ -92,7 +92,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
   int n1=rhs.Grid()->_slice_stride[dimension];
 
   if ( cbmask ==0x3){
-    thread_loop_collapse( 2, (int n=0;n<e1;n++), {
+    thread_loop_collapse2( (int n=0;n<e1;n++), {
       for(int b=0;b<e2;b++){
 
 	int o      =   n*n1;
@@ -108,7 +108,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
     // Case of SIMD split AND checker dim cannot currently be hit, except in 
     // Test_cshift_red_black code.
     std::cout << " Dense packed buffer WARNING " <<std::endl;
-    thread_loop_collapse( 2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
       for(int b=0;b<e2;b++){
 
 	int o=n*n1;
@@ -142,7 +142,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int stride=rhs.Grid()->_slice_stride[dimension];
   
   if ( cbmask ==0x3 ) {
-    thread_loop_collapse( 2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
       for(int b=0;b<e2;b++){
 	int o   =n*rhs.Grid()->_slice_stride[dimension];
 	int bo  =n*rhs.Grid()->_slice_block[dimension];
@@ -184,7 +184,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
   int e2=rhs.Grid()->_slice_block[dimension];
 
   if(cbmask ==0x3 ) {
-    thread_loop_collapse(2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
       for(int b=0;b<e2;b++){
 	int o      = n*rhs.Grid()->_slice_stride[dimension];
 	int offset = b+n*rhs.Grid()->_slice_block[dimension];
@@ -228,7 +228,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
   if(cbmask == 0x3 ){
-    thread_loop_collapse( 2,(int n=0;n<e1;n++),{
+    thread_loop_collapse2((int n=0;n<e1;n++),{
       for(int b=0;b<e2;b++){
         int o =n*stride+b;
   	//lhs[lo+o]=rhs[ro+o];
@@ -236,7 +236,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
       }
     });
   } else { 
-    thread_loop_collapse(2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
        int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
@@ -266,7 +266,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs.Grid()->_slice_block [dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
 
-  thread_loop_collapse(2, (int n=0;n<e1;n++),{
+  thread_loop_collapse2( (int n=0;n<e1;n++),{
     for(int b=0;b<e2;b++){
 
       int o  =n*stride;
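The hunks above all apply the same one-line rewrite at each call site. As a hedged, self-contained sketch of the pattern they share (not Grid source: e1, e2, stride, so, off and the flat arrays are stand-ins for the quantities in the hunks), a collapsed gather of e1 blocks of e2 contiguous sites into a flat buffer looks like this:

// Minimal sketch of the collapsed gather loop shape used at the call sites above.
#include <cstdio>
#include <vector>

#define thread_loop_collapse2( range , ... ) _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ };

int main(void)
{
  const int e1 = 4, e2 = 8, stride = 32;    // stand-ins for nblock, block, slice stride
  const int so = 0, off = 0;                // stand-ins for source / destination offsets
  std::vector<double> rhs(e1*stride + e2, 2.0);
  std::vector<double> buffer(e1*e2);

  thread_loop_collapse2( (int n=0;n<e1;n++),
    for(int b=0;b<e2;b++){
      int o  = n*stride;                    // start of the n-th block in the source
      int bo = n*e2;                        // start of the n-th block in the buffer
      buffer[off+bo+b] = rhs[so+o+b];
    });

  std::printf("buffer[0]=%f buffer[%d]=%f\n", buffer[0], e1*e2-1, buffer[e1*e2-1]);
  return 0;
}

The two loops stay perfectly nested after macro expansion, which is what collapse(2) requires.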
@@ -51,12 +51,11 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   int block =FullGrid->_slice_block [Orthog];
   int nblock=FullGrid->_slice_nblock[Orthog];
   int ostride=FullGrid->_ostride[Orthog];
-#pragma omp parallel 
+  thread_region
   {
     std::vector<vobj> s_x(Nblock);
 
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
+    thread_loop_collapse2( (int n=0;n<nblock;n++),{
       for(int b=0;b<block;b++){
 	int o  = n*stride + b;
 
@@ -72,7 +71,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
 	  }
 	  R[o+i*ostride]=dot;
 	}
-      }}
+      }});
   }
 };
 
@@ -101,12 +100,11 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   int block =FullGrid->_slice_block [Orthog];
   int nblock=FullGrid->_slice_nblock[Orthog];
   int ostride=FullGrid->_ostride[Orthog];
-#pragma omp parallel 
+  thread_region
   {
     std::vector<vobj> s_x(Nblock);
 
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
+    thread_loop_collapse2( (int n=0;n<nblock;n++),{
       for(int b=0;b<block;b++){
 	int o  = n*stride + b;
 
@@ -122,7 +120,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
 	  }
 	  R[o+i*ostride]=dot;
 	}
-      }}
+    }});
   }
 
 };
@@ -159,14 +157,12 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 
   typedef typename vobj::vector_typeD vector_typeD;
 
-#pragma omp parallel 
-  {
+  thread_region {
     std::vector<vobj> Left(Nblock);
     std::vector<vobj> Right(Nblock);
     Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
+    thread_loop_collapse2((int n=0;n<nblock;n++),{
       for(int b=0;b<block;b++){
 
 	int o  = n*stride + b;
@@ -182,9 +178,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 	    auto rtmp = TensorRemove(tmp);
 	    mat_thread(i,j) += Reduce(rtmp);
 	  }}
-      }}
-#pragma omp critical
-    {
+    }});
+    thread_critical {
       mat += mat_thread;
     }  
   }
@@ -358,7 +358,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 
     tensor_reduced at; at=av;
 
-    thread_loop_collapse(2, (int n=0;n<e1;n++),{
+    thread_loop_collapse2( (int n=0;n<e1;n++),{
      for(int b=0;b<e2;b++){
 	int ss= so+n*stride+b;
 	R[ss] = at*X[ss]+Y[ss];
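The sliceInnerProductMatrix hunk above replaces the raw #pragma omp parallel / #pragma omp critical pair with the thread_region and thread_critical macros while keeping a per-thread accumulator that is merged at the end. A minimal sketch of that region/accumulate/merge structure is below; it is not Grid source: the macros are copied from the GRID_OMP branch of Threads.h, the loop uses the work-sharing thread_loop_in_region form defined in the same header, and sum, total and x are invented stand-ins for mat_thread, mat and the lattice data.

// Per-thread partial sums merged under a critical section.
#include <cstdio>
#include <vector>

#define thread_region                        _Pragma("omp parallel")
#define thread_critical                      _Pragma("omp critical")
#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; };

int main(void)
{
  const int nblock = 4, block = 256;
  std::vector<double> x(nblock*block, 1.0);
  double total = 0.0;

  thread_region
  {
    double sum = 0.0;                        // per-thread partial, like mat_thread

    thread_loop_in_region( (int n=0;n<nblock;n++),
      for(int b=0;b<block;b++){
        sum += x[n*block+b];
      });

    thread_critical                          // serialise the merge, like mat += mat_thread
    {
      total += sum;
    }
  }

  std::printf("total = %f\n", total);        // expect nblock*block = 1024
  return 0;
}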
@@ -52,7 +52,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_OMP
 #define thread_loop( range , ... )           _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
 #define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)")          for range  { __VA_ARGS__ ; };
-#define thread_loop_collapse( n, range , ... )  _Pragma("omp parallel for collapse(" #n ")")      for range  { __VA_ARGS__ };
+#define thread_loop_collapse2( range , ... )  _Pragma("omp parallel for collapse(2)")     for range  { __VA_ARGS__ };
+#define thread_loop_collapse3( range , ... )  _Pragma("omp parallel for collapse(3)")     for range  { __VA_ARGS__ };
+#define thread_loop_collapse4( range , ... )  _Pragma("omp parallel for collapse(4)")     for range  { __VA_ARGS__ };
 #define thread_region                         _Pragma("omp parallel")
 #define thread_critical                       _Pragma("omp critical")
 #define thread_num(a) omp_get_thread_num()
@@ -60,7 +62,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #define thread_loop( range , ... )            for range { __VA_ARGS__ ; };
 #define thread_loop_in_region( range , ... )  for range { __VA_ARGS__ ; };
-#define thread_loop_collapse( n, range , ... )   for range { __VA_ARGS__ ; };
+#define thread_loop_collapse2( range , ... )  for range { __VA_ARGS__ ; };
+#define thread_loop_collapse3( range , ... )  for range { __VA_ARGS__ ; };
+#define thread_loop_collapse4( range , ... )  for range { __VA_ARGS__ ; };
 #define thread_region                           
 #define thread_critical                         
 #define thread_num(a) (0)
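For reference, the removed macro built its pragma string by stringising the collapse depth with #n inside _Pragma, while the replacements bake the depth into the macro name; per the commit message, the fixed-depth form is the one NVCC accepts. A hedged, self-contained sketch of what a thread_loop_collapse2 call expands to, with the serial fallback selected exactly as in Threads.h and an invented loop body, is:

#include <cstdio>

#define GRID_OMP   // comment out to exercise the serial fallback

// Removed:  #define thread_loop_collapse( n, range , ... )  _Pragma("omp parallel for collapse(" #n ")")  for range  { __VA_ARGS__ };
#ifdef GRID_OMP
#define thread_loop_collapse2( range , ... )  _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ };
#else
#define thread_loop_collapse2( range , ... )  for range { __VA_ARGS__ ; };
#endif

int main(void)
{
  const int e1 = 2, e2 = 3;
  int out[e1*e2];

  // Expands to:  #pragma omp parallel for collapse(2)
  //              for (int n=0;n<e1;n++) { for (int b=0;b<e2;b++) { ... } };
  // The braces leave the two loops perfectly nested, as collapse(2) requires.
  thread_loop_collapse2( (int n=0;n<e1;n++),
    for(int b=0;b<e2;b++){
      out[n*e2+b] = n*e2 + b;
    });

  std::printf("out[%d] = %d\n", e1*e2-1, out[e1*e2-1]);
  return 0;
}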