mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 12:04:33 +00:00 
			
		
		
		
	Partial optimisation; comms in the x-direction for the red-black dslash will be slow because the checkerboard-skipping, block-strided
loops are not threadable. Will need to write a kernel for these instead and drive them with a lookup table to make the loop sufficiently simple to thread.
This commit is contained in:
		| @@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator< | |||||||
|    |    | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int bo=0; |  | ||||||
|     //PARALLEL_NESTED_LOOP21 |   if ( cbmask == 0x3 ) {  | ||||||
|   for(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|     for(int b=0;b<e2;b++){ |     for(int n=0;n<e1;n++){ | ||||||
|       int o  = n*rhs._grid->_slice_stride[dimension]; |       for(int b=0;b<e2;b++){ | ||||||
|       //      int bo = n*rhs._grid->_slice_block[dimension]; | 	int o  = n*rhs._grid->_slice_stride[dimension]; | ||||||
|       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | 	int bo = n*rhs._grid->_slice_block[dimension]; | ||||||
|       if ( ocb &cbmask ) { | 	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); | ||||||
| 	buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |   } else {  | ||||||
|  |      int bo=0; | ||||||
|  |      for(int n=0;n<e1;n++){ | ||||||
|  |        for(int b=0;b<e2;b++){ | ||||||
|  | 	 int o  = n*rhs._grid->_slice_stride[dimension]; | ||||||
|  | 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||||
|  | 	 if ( ocb &cbmask ) { | ||||||
|  | 	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); | ||||||
|  | 	 } | ||||||
|  |        } | ||||||
|  |      } | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_ | |||||||
|  |  | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   //PARALLEL_NESTED_LOOP2 |    | ||||||
|   for(int n=0;n<e1;n++){ |   if ( cbmask ==0x3){ | ||||||
|     for(int b=0;b<e2;b++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  |  | ||||||
|       int o=n*rhs._grid->_slice_stride[dimension]; | 	int o=n*rhs._grid->_slice_stride[dimension]; | ||||||
|       int offset = b+n*rhs._grid->_slice_block[dimension]; | 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
|       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | 	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); | ||||||
|       if ( ocb & cbmask ) { |  | ||||||
| 	cobj temp;  |  | ||||||
| 	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); |  | ||||||
| 	extract<cobj>(temp,pointers,offset); | 	extract<cobj>(temp,pointers,offset); | ||||||
|  |  | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } else {  | ||||||
|  |  | ||||||
|  |     assert(0); //Fixme think this is buggy | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int o=n*rhs._grid->_slice_stride[dimension]; | ||||||
|  | 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||||
|  | 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
|  | 	if ( ocb & cbmask ) { | ||||||
|  | 	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); | ||||||
|  | 	  extract<cobj>(temp,pointers,offset); | ||||||
|  | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v | |||||||
|      |      | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int bo=0; |    | ||||||
|   //PARALLEL_NESTED_LOOP2 |   if ( cbmask ==0x3 ) { | ||||||
|   for(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|     for(int b=0;b<e2;b++){ |     for(int n=0;n<e1;n++){ | ||||||
|       int o   =n*rhs._grid->_slice_stride[dimension]; |       for(int b=0;b<e2;b++){ | ||||||
|       //      int bo  =n*rhs._grid->_slice_block[dimension]; | 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||||
|       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||||
|       if ( ocb & cbmask ) { | 	rhs._odata[so+o+b]=buffer[bo+b]; | ||||||
| 	rhs._odata[so+o+b]=buffer[bo++]; |       } | ||||||
|  |     } | ||||||
|  |   } else {  | ||||||
|  |     int bo=0; | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||||
|  | 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||||
|  | 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||||
|  | 	if ( ocb & cbmask ) { | ||||||
|  | 	  rhs._odata[so+o+b]=buffer[bo++]; | ||||||
|  | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v | |||||||
|      |      | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
|  |   if(cbmask ==0x3 ) { | ||||||
| PARALLEL_NESTED_LOOP2 | PARALLEL_NESTED_LOOP2 | ||||||
|   for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|     for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|       int o      = n*rhs._grid->_slice_stride[dimension]; | 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||||
|       int offset = b+n*rhs._grid->_slice_block[dimension]; | 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||||
|       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); |  | ||||||
|       if ( ocb&cbmask ) { |  | ||||||
| 	merge(rhs._odata[so+o+b],pointers,offset); | 	merge(rhs._odata[so+o+b],pointers,offset); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |   } else {  | ||||||
|  |     assert(0); // think this is buggy FIXME | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||||
|  | 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||||
|  | 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||||
|  | 	if ( ocb&cbmask ) { | ||||||
|  | 	  merge(rhs._odata[so+o+b],pointers,offset); | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|  |  | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc |   int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
|  |   if(cbmask == 0x3 ){ | ||||||
| PARALLEL_NESTED_LOOP2 | PARALLEL_NESTED_LOOP2 | ||||||
|   for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|     for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|   |   | ||||||
|       int o =n*rhs._grid->_slice_stride[dimension]+b; |         int o =n*rhs._grid->_slice_stride[dimension]+b; | ||||||
|       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); |   	//lhs._odata[lo+o]=rhs._odata[ro+o]; | ||||||
|       if ( ocb&cbmask ) { |  | ||||||
| 	//lhs._odata[lo+o]=rhs._odata[ro+o]; |  | ||||||
| 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]); | 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]); | ||||||
|       } |       } | ||||||
|  |     } | ||||||
|  |   } else {  | ||||||
|  | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  |   | ||||||
|  |         int o =n*rhs._grid->_slice_stride[dimension]+b; | ||||||
|  |         int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); | ||||||
|  |         if ( ocb&cbmask ) { | ||||||
|  |   	//lhs._odata[lo+o]=rhs._odata[ro+o]; | ||||||
|  | 	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]); | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user