1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-10 06:00:45 +01:00

Partial optimisation; comms in x-dir for red black dslash will be slow as the checker skipping block strided

loops are non threadable. Will need to write a kernel for these instead and drive them with a lookup table
to make a look sufficiently simple to thread.
This commit is contained in:
Peter Boyle 2015-11-06 05:23:23 -06:00
parent 5d854c869c
commit 473fa28a6c

View File

@ -29,12 +29,21 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP21 if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
// int bo = n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
@ -42,6 +51,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
} }
} }
} }
}
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
//PARALLEL_NESTED_LOOP2
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension]; int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
if ( ocb & cbmask ) {
cobj temp;
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset); extract<cobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
} }
} }
} }
@ -110,12 +135,22 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP2 if ( cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]; int o =n*rhs._grid->_slice_stride[dimension];
// int bo =n*rhs._grid->_slice_block[dimension]; int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++]; rhs._odata[so+o+b]=buffer[bo++];
@ -123,6 +158,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
} }
} }
} }
}
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split // Scatter for when there *is* need to SIMD split
@ -139,7 +175,18 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2 PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
}
}
} else {
assert(0); // think this is buggy FIXME
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
@ -151,6 +198,7 @@ PARALLEL_NESTED_LOOP2
} }
} }
} }
}
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// local to node block strided copies // local to node block strided copies
@ -168,6 +216,18 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
} else {
PARALLEL_NESTED_LOOP2 PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@ -178,7 +238,7 @@ PARALLEL_NESTED_LOOP2
//lhs._odata[lo+o]=rhs._odata[ro+o]; //lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]); vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
} }
}
} }
} }