1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Partial optimisation; comms in the x-direction for the red-black dslash will be slow, as the checkerboard-skipping, block-strided

loops are non-threadable. Will need to write a kernel for these instead and drive them with a lookup table
to make a loop sufficiently simple to thread.
This commit is contained in:
Peter Boyle 2015-11-06 05:23:23 -06:00
parent 5d854c869c
commit 473fa28a6c

View File

@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP21 if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ PARALLEL_NESTED_LOOP2
for(int b=0;b<e2;b++){ for(int n=0;n<e1;n++){
int o = n*rhs._grid->_slice_stride[dimension]; for(int b=0;b<e2;b++){
// int bo = n*rhs._grid->_slice_block[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int bo = n*rhs._grid->_slice_block[dimension];
if ( ocb &cbmask ) { buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
} }
} }
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
}
} }
} }
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
//PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension]; if ( cbmask ==0x3){
int offset = b+n*rhs._grid->_slice_block[dimension]; PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int o=n*rhs._grid->_slice_stride[dimension];
if ( ocb & cbmask ) { int offset = b+n*rhs._grid->_slice_block[dimension];
cobj temp;
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset); extract<cobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
} }
} }
} }
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP2 if ( cbmask ==0x3 ) {
for(int n=0;n<e1;n++){ PARALLEL_NESTED_LOOP2
for(int b=0;b<e2;b++){ for(int n=0;n<e1;n++){
int o =n*rhs._grid->_slice_stride[dimension]; for(int b=0;b<e2;b++){
// int bo =n*rhs._grid->_slice_block[dimension]; int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int bo =n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) { rhs._odata[so+o+b]=buffer[bo+b];
rhs._odata[so+o+b]=buffer[bo++]; }
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
} }
} }
} }
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2 PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset); merge(rhs._odata[so+o+b],pointers,offset);
} }
} }
} else {
assert(0); // think this is buggy FIXME
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
}
}
}
} }
} }
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b; if(cbmask == 0x3 ){
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); PARALLEL_NESTED_LOOP2
if ( ocb&cbmask ) { for(int n=0;n<e1;n++){
//lhs._odata[lo+o]=rhs._odata[ro+o]; for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]); vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
} }
}
} else {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
} }
} }