From 2c54be651c195c23e17133f3482bf66fdcbb3df4 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 29 Nov 2023 09:43:29 -0500
Subject: [PATCH] Add stencil _wrap flag, non-SIMT lane loops, silence debug prints

---
 .../GeneralCoarsenedMatrixMultiRHS.h          |  38 +++---
 Grid/lattice/Lattice_transfer.h               |   4 +-
 Grid/lattice/PaddedCell.h                     | 113 ++++++++++--------
 Grid/stencil/GeneralLocalStencil.h            |   7 ++
 4 files changed, 93 insertions(+), 69 deletions(-)
diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
index 1da968bd..132fcbf8 100644
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -94,26 +94,27 @@ public:
       int ghost_zone=0;
       for(int32_t point = 0 ; point < geom.npoint; point++){
 	int i=s*geom.npoint+point;
-	if( Stencil._entries[i]._permute ) {
+	if( Stencil._entries[i]._wrap ) {
 	  ghost_zone=1;
 	}
       }
+      //      std::cout << "site " <<s<<"/"<<sites <<" ghost_zone "<<ghost_zone<<std::endl;
       GeneralStencilEntryReordered tmp;
       if( ghost_zone==0) {
 	for(int32_t point = 0 ; point < geom.npoint; point++){
 	  int i=s*geom.npoint+point;
  	  tmp._offset = Stencil._entries[i]._offset;
-	  tmp._permute= Stencil._entries[i]._permute; // Should be no premute and j=site
+	  tmp._wrap= Stencil._entries[i]._wrap; // Should be no wrap (ghost_zone==0) and j=site
 	  tmp._input = s;
 	  StencilTmp.push_back(tmp);
 	}
 	j++;
       }
     }
-
     std::cout << " oSites " << _CoarseGridMulti->oSites()<<std::endl;
     std::cout << " npoint " << geom.npoint<<std::endl;
-    std::cout << " StencilTmp "<<StencilTmp.size();
+    std::cout << " StencilTmp "<<StencilTmp.size()<<std::endl;
+    
     assert(_CoarseGridMulti->oSites()*geom.npoint==StencilTmp.size());
     acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
     CopyMatrix();
@@ -198,9 +199,9 @@ public:
       bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
 	+ 2.0*osites*sizeof(siteVector)*npoint;
 
-      std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
-      std::cout << " padded local dims   "<<pin.Grid()->LocalDimensions()<<std::endl;
-      std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
+      //      std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
+      //      std::cout << " padded local dims   "<<pin.Grid()->LocalDimensions()<<std::endl;
+      //      std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
       tmult-=usecond();
       autoView( Stencil_v  , Stencil, AcceleratorRead);
       accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
@@ -282,9 +283,9 @@ public:
       bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
 	+ 2.0*osites*sizeof(siteVector)*npoint;
 
-      std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize  "<<StencilMasked.size()<<std::endl;
-      std::cout << " padded local dims   "<<pin.Grid()->LocalDimensions()<<std::endl;
-      std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
+      //      std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize  "<<StencilMasked.size()<<std::endl;
+      //      std::cout << " padded local dims   "<<pin.Grid()->LocalDimensions()<<std::endl;
+      //      std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
       tmult-=usecond();
       auto Stencil_v = &StencilMasked[0];
       accelerator_for(rspb, StencilMasked.size()*nbasis, Nsimd, {
@@ -294,14 +295,19 @@ public:
 	  int32_t point= bp/nbasis;
 	  int32_t b    = bp%nbasis;
 	  auto SE  = &Stencil_v[ss*npoint+point];
-	  int32_t s   = SE->_input;
+	  int32_t s   = SE->_input; // presumably the padded-grid site index (debug prints label it "padded")
 	  int32_t snbr= SE->_offset;
-	  std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<std::endl;
 	  auto nbr = coalescedRead(in_v[snbr]);
 	  auto res = Aview_p[point][s](0,b)*nbr(0);
 	  for(int bb=1;bb<nbasis;bb++) {
 	    res = res + Aview_p[point][s](bb,b)*nbr(bb);
 	  }
+	  //	  std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<" "<< innerProduct(res,res) <<std::endl;
+	  //	  std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" res "<< innerProduct(res,res) <<std::endl;
+	  //	  std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbrIP "<< innerProduct(nbr,nbr) <<std::endl;
+	  //	  std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< nbr <<std::endl;
+	  //	  std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< in_v[snbr] <<std::endl;
+	  //	  std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" A   "<< innerProduct(Aview_p[point][s],Aview_p[point][s]) <<std::endl;
 	  coalescedWrite(Vview_p[point][ss](b),res);
       });
       tmult2-=usecond();
@@ -332,10 +338,10 @@ public:
     std::cout << GridLogMessage<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
     std::cout << GridLogMessage<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
     //    std::cout << GridLogMessage<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+    //    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    //    std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    //    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    //    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
     
   };
   virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 0521757d..cf8fd090 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -851,8 +851,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
     });
   t_acc+=usecond();
-  std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
-  std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
+  //  std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
+  //  std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
   acceleratorFreeDevice(table_d);    
   free(table);
   
diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h
index e573049d..3fca0ea2 100644
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -95,32 +95,38 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
   accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
 
     // scalar layout won't coalesce
-    int blane=acceleratorSIMTlane(Nsimd); // buffer lane
-    int olane=blane%rNsimd;               // reduced lattice lane
-    int obit =blane/rNsimd;
+#ifdef GRID_SIMT
+      {
+	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int blane=0;blane<Nsimd;blane++) {
+#endif
+	int olane=blane%rNsimd;               // reduced lattice lane
+	int obit =blane/rNsimd;
 
-    ///////////////////////////////////////////////////////////////
-    // osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
-    ///////////////////////////////////////////////////////////////
-    int ssp = ss*simd[dim]+obit;
-    int b    = ssp%block;
-    int n    = ssp/block;
-    int osite= b+n*stride + ox*block;
-
-    ////////////////////////////////////////////
-    // isite -- map lane within buffer to lane within lattice
-    ////////////////////////////////////////////
-    Coordinate icoor;
-    int lane;
-    Lexicographic::CoorFromIndex(icoor,olane,rsimd);
-    icoor[dim]=ix;
-    Lexicographic::IndexFromCoor(icoor,lane,simd);
-
-    ///////////////////////////////////////////
-    // Transfer into lattice - will coalesce
-    ///////////////////////////////////////////
-    sobj obj = extractLane(blane,buf_p[ss+offset]);
-    insertLane(lane,lat_v[osite],obj);
+	///////////////////////////////////////////////////////////////
+	// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
+	///////////////////////////////////////////////////////////////
+	int ssp = ss*simd[dim]+obit;
+	int b    = ssp%block;
+	int n    = ssp/block;
+	int osite= b+n*stride + ox*block;
+	
+	////////////////////////////////////////////
+	// isite -- map lane within buffer to lane within lattice
+	////////////////////////////////////////////
+	Coordinate icoor;
+	int lane;
+	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+	icoor[dim]=ix;
+	Lexicographic::IndexFromCoor(icoor,lane,simd);
+	
+	///////////////////////////////////////////
+	// Transfer into lattice - will coalesce
+	///////////////////////////////////////////
+	sobj obj = extractLane(blane,buf_p[ss+offset]);
+	insertLane(lane,lat_v[osite],obj);
+      }
   });
 }
 
@@ -165,34 +171,39 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
   accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
 
     // scalar layout won't coalesce
-    int blane=acceleratorSIMTlane(Nsimd); // buffer lane
-    int olane=blane%rNsimd;               // reduced lattice lane
-    int obit =blane/rNsimd;
+#ifdef GRID_SIMT
+      {
+	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int blane=0;blane<Nsimd;blane++) {
+#endif
+	int olane=blane%rNsimd;               // reduced lattice lane
+	int obit =blane/rNsimd;
+	
+	////////////////////////////////////////////
+	// osite
+	////////////////////////////////////////////
+	int ssp = ss*simd[dim]+obit;
+	int b    = ssp%block;
+	int n    = ssp/block;
+	int osite= b+n*stride + ox*block;
 
-    ////////////////////////////////////////////
-    // osite
-    ////////////////////////////////////////////
-    int ssp = ss*simd[dim]+obit;
-    int b    = ssp%block;
-    int n    = ssp/block;
-    int osite= b+n*stride + ox*block;
-
-    ////////////////////////////////////////////
-    // isite -- map lane within buffer to lane within lattice
-    ////////////////////////////////////////////
-    Coordinate icoor;
-    int lane;
-    Lexicographic::CoorFromIndex(icoor,olane,rsimd);
-    icoor[dim]=ix;
-    Lexicographic::IndexFromCoor(icoor,lane,simd);
-
-    ///////////////////////////////////////////
-    // Take out of lattice
-    ///////////////////////////////////////////
-
-    sobj obj = extractLane(lane,lat_v[osite]);
-    insertLane(blane,buf_p[ss+offset],obj);
+	////////////////////////////////////////////
+	// isite -- map lane within buffer to lane within lattice
+	////////////////////////////////////////////
+	Coordinate icoor;
+	int lane;
+	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+	icoor[dim]=ix;
+	Lexicographic::IndexFromCoor(icoor,lane,simd);
+	
+	///////////////////////////////////////////
+	// Take out of lattice
+	///////////////////////////////////////////
 
+	sobj obj = extractLane(lane,lat_v[osite]);
+	insertLane(blane,buf_p[ss+offset],obj);
+      }
   });
   /*
   int words =block*nblock/simd[dim];
diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h
index b2221bed..bace6aca 100644
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -32,6 +32,7 @@ NAMESPACE_BEGIN(Grid);
 struct GeneralStencilEntry { 
   uint64_t _offset;            // 4 bytes 
   uint8_t _permute;            // 1 bytes // Horrible alignment properties
+  uint8_t _wrap;               // 1 byte  // set to 1 when (x+shift)%fd != (x+shift)%ld in some dimension
 };
 struct GeneralStencilEntryReordered : public GeneralStencilEntry {
   uint64_t _input;
@@ -105,10 +106,12 @@ public:
 	  // Simpler version using icoor calculation
 	  ////////////////////////////////////////////////
 	  SE._permute =0;
+	  SE._wrap=0;
 	  for(int d=0;d<Coor.size();d++){
 
 	    int fd = grid->_fdimensions[d];
 	    int rd = grid->_rdimensions[d];
+	    int ld = grid->_ldimensions[d];
 	    int ly = grid->_simd_layout[d];
 
 	    assert((ly==1)||(ly==2)||(ly==grid->Nsimd()));
@@ -116,6 +119,10 @@ public:
 	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
 	    int x = Coor[d];                // x in [0... rd-1] as an oSite 
 
+	    if ( (x + shift)%fd != (x+shift)%ld ){
+	      SE._wrap = 1;
+	    }
+	    
 	    int permute_dim  = grid->PermuteDim(d);
 	    int permute_slice=0;
 	    if(permute_dim){