Partial optimisation; comms in x-dir for red-black dslash will be slow, as the checkerboard-skipping block-strided

loops are non-threadable. Will need to write a kernel for these instead and drive them with a lookup table to make the loop sufficiently simple to thread.
2026-05-31 06:24:18 +01:00 · 2015-11-06 05:23:23 -06:00
parent 5d854c869c
commit 473fa28a6c
1 changed files with 100 additions and 40 deletions
@@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  int bo=0;
-    //PARALLEL_NESTED_LOOP21
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-      int o  = n*rhs._grid->_slice_stride[dimension];
-      //      int bo = n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-      if ( ocb &cbmask ) {
-	buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+
+  if ( cbmask == 0x3 ) { 
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o  = n*rhs._grid->_slice_stride[dimension];
+	int bo = n*rhs._grid->_slice_block[dimension];
+	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
+  } else { 
+     int bo=0;
+     for(int n=0;n<e1;n++){
+       for(int b=0;b<e2;b++){
+	 int o  = n*rhs._grid->_slice_stride[dimension];
+	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	 if ( ocb &cbmask ) {
+	   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	 }
+       }
+     }
  }
 }

@@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  //PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
  
-      int o=n*rhs._grid->_slice_stride[dimension];
-      int offset = b+n*rhs._grid->_slice_block[dimension];
+  if ( cbmask ==0x3){
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){

-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb & cbmask ) {
-	cobj temp; 
-	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	int o=n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+
+	cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 	extract<cobj>(temp,pointers,offset);
+
+      }
+    }
+  } else { 
+
+    assert(0); //Fixme think this is buggy
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o=n*rhs._grid->_slice_stride[dimension];
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+
+	if ( ocb & cbmask ) {
+	  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	  extract<cobj>(temp,pointers,offset);
+	}
      }
    }
  }
@@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-  int bo=0;
-  //PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-      int o   =n*rhs._grid->_slice_stride[dimension];
-      //      int bo  =n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
-      if ( ocb & cbmask ) {
-	rhs._odata[so+o+b]=buffer[bo++];
+  
+  if ( cbmask ==0x3 ) {
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o   =n*rhs._grid->_slice_stride[dimension];
+	int bo  =n*rhs._grid->_slice_block[dimension];
+	rhs._odata[so+o+b]=buffer[bo+b];
+      }
+    }
+  } else { 
+    int bo=0;
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o   =n*rhs._grid->_slice_stride[dimension];
+	int bo  =n*rhs._grid->_slice_block[dimension];
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	if ( ocb & cbmask ) {
+	  rhs._odata[so+o+b]=buffer[bo++];
+	}
      }
    }
  }
@@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+
+  if(cbmask ==0x3 ) {
 PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){
-      int o      = n*rhs._grid->_slice_stride[dimension];
-      int offset = b+n*rhs._grid->_slice_block[dimension];
-      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      = n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
 	merge(rhs._odata[so+o+b],pointers,offset);
      }
    }
+  } else { 
+    assert(0); // think this is buggy FIXME
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      = n*rhs._grid->_slice_stride[dimension];
+	int offset = b+n*rhs._grid->_slice_block[dimension];
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	if ( ocb&cbmask ) {
+	  merge(rhs._odata[so+o+b],pointers,offset);
+	}
+      }
+    }
  }
 }

@@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
-    for(int b=0;b<e2;b++){

-      int o =n*rhs._grid->_slice_stride[dimension]+b;
-      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
-      if ( ocb&cbmask ) {
-	//lhs._odata[lo+o]=rhs._odata[ro+o];
+  if(cbmask == 0x3 ){
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+ 
+        int o =n*rhs._grid->_slice_stride[dimension]+b;
+  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
      }
+    }
+  } else { 
+PARALLEL_NESTED_LOOP2
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
 
+        int o =n*rhs._grid->_slice_stride[dimension]+b;
+        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
+        if ( ocb&cbmask ) {
+  	//lhs._odata[lo+o]=rhs._odata[ro+o];
+	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	}
+      }
    }
  }