From 473fa28a6cd07090fb06e6823f900d1b08898972 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 6 Nov 2015 05:23:23 -0600 Subject: [PATCH] Partial optimisation; comms in x-dir for red black dslash will be slow as the checker skipping block strided loops are non threadable. Will need to write a kernel for these instead and drive them with a lookup table to make a loop sufficiently simple to thread. --- lib/cshift/Cshift_common.h | 140 ++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 40 deletions(-) diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 83508ca3..06f1b02b 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; - int bo=0; - //PARALLEL_NESTED_LOOP21 - for(int n=0;n_slice_stride[dimension]; - // int bo = n*rhs._grid->_slice_block[dimension]; - int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb &cbmask ) { - buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + + if ( cbmask == 0x3 ) { +PARALLEL_NESTED_LOOP2 + for(int n=0;n_slice_stride[dimension]; + int bo = n*rhs._grid->_slice_block[dimension]; + buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); } } + } else { + int bo=0; + for(int n=0;n_slice_stride[dimension]; + int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb &cbmask ) { + buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + } + } + } } } @@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; - //PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_stride[dimension]; - int offset = b+n*rhs._grid->_slice_block[dimension]; + int o=n*rhs._grid->_slice_stride[dimension]; + int offset = b+n*rhs._grid->_slice_block[dimension]; - int 
ocb=1<CheckerBoardFromOindex(o+b); - if ( ocb & cbmask ) { - cobj temp; - temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); extract(temp,pointers,offset); + + } + } + } else { + + assert(0); //Fixme think this is buggy + for(int n=0;n_slice_stride[dimension]; + int ocb=1<CheckerBoardFromOindex(o+b); + int offset = b+n*rhs._grid->_slice_block[dimension]; + + if ( ocb & cbmask ) { + cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + extract(temp,pointers,offset); + } } } } @@ -110,15 +135,26 @@ template void Scatter_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; - int bo=0; - //PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_stride[dimension]; - // int bo =n*rhs._grid->_slice_block[dimension]; - int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb & cbmask ) { - rhs._odata[so+o+b]=buffer[bo++]; + + if ( cbmask ==0x3 ) { +PARALLEL_NESTED_LOOP2 + for(int n=0;n_slice_stride[dimension]; + int bo =n*rhs._grid->_slice_block[dimension]; + rhs._odata[so+o+b]=buffer[bo+b]; + } + } + } else { + int bo=0; + for(int n=0;n_slice_stride[dimension]; + int bo =n*rhs._grid->_slice_block[dimension]; + int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + rhs._odata[so+o+b]=buffer[bo++]; + } } } } @@ -139,16 +175,28 @@ template void Scatter_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; + + if(cbmask ==0x3 ) { PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_stride[dimension]; - int offset = b+n*rhs._grid->_slice_block[dimension]; - int ocb=1<CheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) { + for(int n=0;n_slice_stride[dimension]; + int offset = b+n*rhs._grid->_slice_block[dimension]; merge(rhs._odata[so+o+b],pointers,offset); } } + } else { + assert(0); // think this is buggy FIXME + 
for(int n=0;n_slice_stride[dimension]; + int offset = b+n*rhs._grid->_slice_block[dimension]; + int ocb=1<CheckerBoardFromOindex(o+b); + if ( ocb&cbmask ) { + merge(rhs._odata[so+o+b],pointers,offset); + } + } + } } } @@ -168,17 +216,29 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e2=rhs._grid->_slice_block[dimension]; + + if(cbmask == 0x3 ){ PARALLEL_NESTED_LOOP2 - for(int n=0;n_slice_stride[dimension]+b; - int ocb=1<CheckerBoardFromOindex(o); - if ( ocb&cbmask ) { - //lhs._odata[lo+o]=rhs._odata[ro+o]; + int o =n*rhs._grid->_slice_stride[dimension]+b; + //lhs._odata[lo+o]=rhs._odata[ro+o]; vstream(lhs._odata[lo+o],rhs._odata[ro+o]); } - + } + } else { +PARALLEL_NESTED_LOOP2 + for(int n=0;n_slice_stride[dimension]+b; + int ocb=1<CheckerBoardFromOindex(o); + if ( ocb&cbmask ) { + //lhs._odata[lo+o]=rhs._odata[ro+o]; + vstream(lhs._odata[lo+o],rhs._odata[ro+o]); + } + } } }