Partial optimisation; comms in x-dir for red-black dslash will be slow, as the checkerboard-skipping block-strided

loops are not threadable. Will need to write a kernel for these instead and drive them with a lookup table to make the loop sufficiently simple to thread.
2026-07-18 08:03:27 +01:00 · 2015-11-06 05:23:23 -06:00
parent 5d854c869c
commit 473fa28a6c
1 changed files with 100 additions and 40 deletions
@@ -29,18 +29,28 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+
+  if ( cbmask == 0x3 ) { 
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o  = n*rhs._grid->_slice_stride[dimension];
+	int bo = n*rhs._grid->_slice_block[dimension];
+	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+      }
+    }
+  } else { 
     int bo=0;
-    //PARALLEL_NESTED_LOOP21
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	 int o  = n*rhs._grid->_slice_stride[dimension];
-      //      int bo = n*rhs._grid->_slice_block[dimension];
 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	 if ( ocb &cbmask ) {
 	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	 }
       }
     }
+  }
 }


@@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  //PARALLEL_NESTED_LOOP2
+  
+  if ( cbmask ==0x3){
+PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

 	int o=n*rhs._grid->_slice_stride[dimension];
 	int offset = b+n*rhs._grid->_slice_block[dimension];

-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb & cbmask ) {
-	cobj temp; 
-	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
+
+      }
+    }
+  } else { 
+
+    assert(0); //Fixme think this is buggy
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o=n*rhs._grid->_slice_stride[dimension];
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+
+	if ( ocb & cbmask ) {
+	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	  extract<cobj>(temp,pointers,offset);
+	}
      }
    }
  }
@@ -110,18 +135,29 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  int bo=0;
-  //PARALLEL_NESTED_LOOP2
+  
+  if ( cbmask ==0x3 ) {
+PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
-      //      int bo  =n*rhs._grid->_slice_block[dimension];
+	int bo  =n*rhs._grid->_slice_block[dimension];
+	rhs._odata[so+o+b]=buffer[bo+b];
+      }
+    }
+  } else { 
+    int bo=0;
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o   =n*rhs._grid->_slice_stride[dimension];
+	int bo  =n*rhs._grid->_slice_block[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
 	  rhs._odata[so+o+b]=buffer[bo++];
 	}
      }
    }
+  }
 }

 //////////////////////////////////////////////////////
@@ -139,7 +175,18 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+
+  if(cbmask ==0x3 ) {
 PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      = n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+	merge(rhs._odata[so+o+b],pointers,offset);
+      }
+    }
+  } else { 
+    assert(0); // think this is buggy FIXME
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs._grid->_slice_stride[dimension];
@@ -150,6 +197,7 @@ PARALLEL_NESTED_LOOP2
 	}
      }
    }
+  }
 }

 //////////////////////////////////////////////////////
@@ -168,6 +216,18 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
+
+  if(cbmask == 0x3 ){
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+ 
+        int o =n*rhs._grid->_slice_stride[dimension]+b;
+  	//lhs._odata[lo+o]=rhs._odata[ro+o];
+	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+      }
+    }
+  } else { 
 PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -178,7 +238,7 @@ PARALLEL_NESTED_LOOP2
  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 	}
-
+      }
    }
  }