mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Partial optimisation; comms in the x-direction for red-black dslash will be slow, as the checkerboard-skipping, block-strided
loops are not threadable. Will need to write a kernel for these instead and drive them with a lookup table to make the loop sufficiently simple to thread.
This commit is contained in:
parent
5d854c869c
commit
473fa28a6c
@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
int bo=0;
|
|
||||||
//PARALLEL_NESTED_LOOP21
|
if ( cbmask == 0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int b=0;b<e2;b++){
|
for(int n=0;n<e1;n++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
for(int b=0;b<e2;b++){
|
||||||
// int bo = n*rhs._grid->_slice_block[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int bo = n*rhs._grid->_slice_block[dimension];
|
||||||
if ( ocb &cbmask ) {
|
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
|
if ( ocb &cbmask ) {
|
||||||
|
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
//PARALLEL_NESTED_LOOP2
|
|
||||||
for(int n=0;n<e1;n++){
|
|
||||||
for(int b=0;b<e2;b++){
|
|
||||||
|
|
||||||
int o=n*rhs._grid->_slice_stride[dimension];
|
if ( cbmask ==0x3){
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
int o=n*rhs._grid->_slice_stride[dimension];
|
||||||
if ( ocb & cbmask ) {
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
cobj temp;
|
|
||||||
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
|
||||||
|
assert(0); //Fixme think this is buggy
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o=n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
int bo=0;
|
|
||||||
//PARALLEL_NESTED_LOOP2
|
if ( cbmask ==0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int b=0;b<e2;b++){
|
for(int n=0;n<e1;n++){
|
||||||
int o =n*rhs._grid->_slice_stride[dimension];
|
for(int b=0;b<e2;b++){
|
||||||
// int bo =n*rhs._grid->_slice_block[dimension];
|
int o =n*rhs._grid->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int bo =n*rhs._grid->_slice_block[dimension];
|
||||||
if ( ocb & cbmask ) {
|
rhs._odata[so+o+b]=buffer[bo+b];
|
||||||
rhs._odata[so+o+b]=buffer[bo++];
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int bo =n*rhs._grid->_slice_block[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
rhs._odata[so+o+b]=buffer[bo++];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if(cbmask ==0x3 ) {
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
|
||||||
if ( ocb&cbmask ) {
|
|
||||||
merge(rhs._odata[so+o+b],pointers,offset);
|
merge(rhs._odata[so+o+b],pointers,offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
assert(0); // think this is buggy FIXME
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
if ( ocb&cbmask ) {
|
||||||
|
merge(rhs._odata[so+o+b],pointers,offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
PARALLEL_NESTED_LOOP2
|
|
||||||
for(int n=0;n<e1;n++){
|
|
||||||
for(int b=0;b<e2;b++){
|
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
if(cbmask == 0x3 ){
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
PARALLEL_NESTED_LOOP2
|
||||||
if ( ocb&cbmask ) {
|
for(int n=0;n<e1;n++){
|
||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||||
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||||
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||||
|
if ( ocb&cbmask ) {
|
||||||
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user