Partial optimisation; comms in the x-direction for red-black dslash will be slow, as the checkerboard-skipping, block-strided

loops are not threadable. We will need to write a kernel for these instead, driven by a lookup table, to make the loop sufficiently simple to thread.
2025-07-08 09:27:06 +01:00 · 2015-11-06 05:23:23 -06:00
parent 5d854c869c
commit 473fa28a6c
1 changed files with 100 additions and 40 deletions
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  int bo=0;
+
-    //PARALLEL_NESTED_LOOP21
+  if ( cbmask == 0x3 ) { 
-  for(int n=0;n<e1;n++){
+PARALLEL_NESTED_LOOP2
-    for(int b=0;b<e2;b++){
+    for(int n=0;n<e1;n++){
-      int o  = n*rhs._grid->_slice_stride[dimension];
+      for(int b=0;b<e2;b++){
-      //      int bo = n*rhs._grid->_slice_block[dimension];
+	int o  = n*rhs._grid->_slice_stride[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	int bo = n*rhs._grid->_slice_block[dimension];
-      if ( ocb &cbmask ) {
+	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
  } else { 
     int bo=0;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	 int o  = n*rhs._grid->_slice_stride[dimension];
 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	 if ( ocb &cbmask ) {
 	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	 }
       }
     }
  }
 }
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  //PARALLEL_NESTED_LOOP2
+  
-  for(int n=0;n<e1;n++){
+  if ( cbmask ==0x3){
-    for(int b=0;b<e2;b++){
+PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
-      int o=n*rhs._grid->_slice_stride[dimension];
+	int o=n*rhs._grid->_slice_stride[dimension];
-      int offset = b+n*rhs._grid->_slice_block[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      if ( ocb & cbmask ) {
 	cobj temp; 
 	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
      }
    }
  } else { 
    assert(0); //Fixme think this is buggy
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o=n*rhs._grid->_slice_stride[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	int offset = b+n*rhs._grid->_slice_block[dimension];
 	if ( ocb & cbmask ) {
 	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	  extract<cobj>(temp,pointers,offset);
 	}
      }
    }
  }
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  int bo=0;
+  
-  //PARALLEL_NESTED_LOOP2
+  if ( cbmask ==0x3 ) {
-  for(int n=0;n<e1;n++){
+PARALLEL_NESTED_LOOP2
-    for(int b=0;b<e2;b++){
+    for(int n=0;n<e1;n++){
-      int o   =n*rhs._grid->_slice_stride[dimension];
+      for(int b=0;b<e2;b++){
-      //      int bo  =n*rhs._grid->_slice_block[dimension];
+	int o   =n*rhs._grid->_slice_stride[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	int bo  =n*rhs._grid->_slice_block[dimension];
-      if ( ocb & cbmask ) {
+	rhs._odata[so+o+b]=buffer[bo+b];
-	rhs._odata[so+o+b]=buffer[bo++];
+      }
    }
  } else { 
    int bo=0;
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
 	  rhs._odata[so+o+b]=buffer[bo++];
 	}
      }
    }
  }
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  if(cbmask ==0x3 ) {
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
+      for(int b=0;b<e2;b++){
-      int o      = n*rhs._grid->_slice_stride[dimension];
+	int o      = n*rhs._grid->_slice_stride[dimension];
-      int offset = b+n*rhs._grid->_slice_block[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
      if ( ocb&cbmask ) {
 	merge(rhs._odata[so+o+b],pointers,offset);
      }
    }
  } else { 
    assert(0); // think this is buggy FIXME
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
 	int offset = b+n*rhs._grid->_slice_block[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	if ( ocb&cbmask ) {
 	  merge(rhs._odata[so+o+b],pointers,offset);
 	}
      }
    }
  }
 }
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
  if(cbmask == 0x3 ){
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
+      for(int b=0;b<e2;b++){
-      int o =n*rhs._grid->_slice_stride[dimension]+b;
+        int o =n*rhs._grid->_slice_stride[dimension]+b;
-      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
+  	//lhs._odata[lo+o]=rhs._odata[ro+o];
      if ( ocb&cbmask ) {
 	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
      }
-
+    }
  } else { 
 PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*rhs._grid->_slice_stride[dimension]+b;
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 	}
      }
    }
  }