mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-10 06:00:45 +01:00
Partial optimisation; comms in the x-direction for the red-black dslash will be slow, as the checkerboard-skipping, block-strided
loops are not threadable. Will need to write a kernel for these instead and drive them with a lookup table to make a loop sufficiently simple to thread.
This commit is contained in:
parent
5d854c869c
commit
473fa28a6c
@ -29,12 +29,21 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
int bo=0;
|
|
||||||
//PARALLEL_NESTED_LOOP21
|
if ( cbmask == 0x3 ) {
|
||||||
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int bo = n*rhs._grid->_slice_block[dimension];
|
||||||
|
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
// int bo = n*rhs._grid->_slice_block[dimension];
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
@ -42,6 +51,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
//PARALLEL_NESTED_LOOP2
|
|
||||||
|
if ( cbmask ==0x3){
|
||||||
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o=n*rhs._grid->_slice_stride[dimension];
|
int o=n*rhs._grid->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
if ( ocb & cbmask ) {
|
|
||||||
cobj temp;
|
|
||||||
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
|
||||||
|
assert(0); //Fixme think this is buggy
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o=n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -110,12 +135,22 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
int bo=0;
|
|
||||||
//PARALLEL_NESTED_LOOP2
|
if ( cbmask ==0x3 ) {
|
||||||
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*rhs._grid->_slice_stride[dimension];
|
int o =n*rhs._grid->_slice_stride[dimension];
|
||||||
// int bo =n*rhs._grid->_slice_block[dimension];
|
int bo =n*rhs._grid->_slice_block[dimension];
|
||||||
|
rhs._odata[so+o+b]=buffer[bo+b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int bo =n*rhs._grid->_slice_block[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
rhs._odata[so+o+b]=buffer[bo++];
|
rhs._odata[so+o+b]=buffer[bo++];
|
||||||
@ -123,6 +158,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there *is* need to SIMD split
|
// Scatter for when there *is* need to SIMD split
|
||||||
@ -139,7 +175,18 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if(cbmask ==0x3 ) {
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
merge(rhs._odata[so+o+b],pointers,offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assert(0); // think this is buggy FIXME
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
@ -151,6 +198,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// local to node block strided copies
|
// local to node block strided copies
|
||||||
@ -168,6 +216,18 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if(cbmask == 0x3 ){
|
||||||
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||||
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
@ -178,7 +238,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user