From b9939e397496c758e6df7d4bffc134f0aea3c58d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 23 Apr 2015 20:42:30 +0100 Subject: [PATCH] Reorganised the TODO. Really getting somewhere --- TODO | 298 ++++++++++++----------------------------- lib/Grid_init.cc | 61 +++++++-- lib/Grid_summation.h | 94 ++++++++++++- tests/Grid_nersc_io.cc | 13 +- 4 files changed, 239 insertions(+), 227 deletions(-) diff --git a/TODO b/TODO index 256f2609..12e33302 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,77 @@ +* - BinaryWriter, TextWriter etc... + - protocol buffers? replace xml + +* Stencil operator support -----Initial thoughts, trial implementation DONE. + -----some simple tests that Stencil matches Cshift. + -----do all permute in comms phase, so that copy permute + -----cases move into a buffer. + -----allow transform in/out buffers spproj + + +* CovariantShift support -----Use a class to store gauge field? (parallel transport?) + +* Consider switch std::vector to boost arrays or something lighter weight + boost::multi_array A()... to replace multi1d, multi2d etc.. + +* How to define simple matrix operations, such as flavour matrices? + +* Make the Tensor types and Complex etc... play more nicely. + +* Dirac, Pauli, SU subgroup, etc.. * Gamma/Dirac structures + +* Fourspin, two spin project + +* su3 exponentiation & log etc.. [Jamie's code?] + TaProj + +* Parallel MPI2 IO + +* rb4d support. + +* Check for missing functionality - partially audited against QDP++ layout + +* Optimise the extract/merge SIMD routines; Azusa?? + - I have collated into single location at least. + - Need to use _mm_*insert/extract routines. + +* Conformable test in Cshift routines. + +* QDP++ regression suite and comparative benchmark + +AUDITS: + +* FIXME audit +* const audit +* Replace vset with a call to merge.; +* care in Gmerge,Gextract over vset . +* extract / merge extra implementation removal +* Test infrastructure + + // TODO + // + // Base class to share common code between vRealF, VComplexF etc... + // + // Unary functions + // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg + // exp, log, sqrt, fabs + // + // transposeColor, transposeSpin, + // adjColor, adjSpin, + // + // copyMask. + // + // localMaxAbs + // + // Fourier transform equivalent. + +* LinearOperator + + LinearSolver + + Polynomial etc... + +====================================================================================================== + FUNCTIONALITY: * Conditional execution, where etc... -----DONE, simple test * Integer relational support -----DONE @@ -22,222 +96,24 @@ FUNCTIONALITY: - lib/qcd/actions - lib/qcd/measurements +* Subset support, slice sums etc... -----DONE + sliceSum(orthog) + sum + innerProduct + norm2 -Not done, or just incomplete -* random number generation +* Subgrid Transferral -----DONE + subBlock (coarseLattice,fineLattice) + projectBlockBasis + promoteBlockBasis -* Consider switch std::vector to boost arrays or something lighter weight - boost::multi_array A()... to replace multi1d, multi2d etc.. +* random number generation ----- DONE -* How to define simple matrix operations, such as flavour matrices? - -* Dirac, Pauli, SU subgroup, etc.. * Gamma/Dirac structures - -* Fourspin, two spin project - -* su3 exponentiation, log etc.. [Jamie's code?] - -* Stencil operator support -----Initial thoughts, trial implementation DONE. - -----some simple tests that Stencil matches Cshift. - -----do all permute in comms phase, so that copy permute - -----cases move into a buffer. 
- -----allow transform in/out buffers spproj - -* CovariantShift support -----Use a class to store gauge field? (parallel transport?) - -* Subset support, slice sums etc... -----Only need slice sum? - -----Generic cartesian subslicing? - -----Array ranges / boost extents? - -----Multigrid grid transferral? - -----Suggests generalised cartesian subblocking - sums, returning modified grid? - -----What should interface be? - -* Grid transferral - * pickCheckerboard, pickSubPlane, pickSubBlock, - * sumSubPlane, sumSubBlocks - -* rb4d support. - -* Check for missing functionality - partially audited against QDP++ layout - -* Optimise the extract/merge SIMD routines; Azusa?? - - - I have collated into single location at least. - - Need to use _mm_*insert/extract routines. - -* Conformable test in Cshift routines. - - - -* Broadcast, reduction tests. innerProduct, localInnerProduct - -* QDP++ regression suite and comparative benchmark +* Broadcast, reduction tests. innerProduct, localInnerProduct --- DONE * I/O support - -* NERSC Lattice loading, plaquette test - - - MPI IO? - - BinaryWriter, TextWriter etc... - - protocol buffers? - -AUDITS: -// Lattice support audit Tested in Grid_main.cc -// -// -=,+=,*= Y -// add,+,sub,-,mult,mac,* Y -// innerProduct,norm2 Y -// localInnerProduct,outerProduct, Y -// adj,conj Y -// transpose, Y -// trace Y -// -// transposeIndex Y -// traceIndex Y -// peekIndex Y -// -// real,imag missing, semantic thought needed on real/im support. -// perhaps I just keep everything complex? -// - -* FIXME audit -* const audit -* Replace vset with a call to merge.; -* care in Gmerge,Gextract over vset . -* extract / merge extra implementation removal -* Test infrastructure - -[ More on subsets and grid transfers ] -i) Three classes of subset; red black parity subsetting (pick checkerboard). - cartesian sub-block subsetting - rbNd - -ii) Need to be able to project one Grid to another Grid. - -Lattice coarse_data SubBlockSum (GridBase *CoarseGrid, Lattice &fine_data) - -Operation ensure either: - rd[dim] divide rd[dim] fine_data - -This will give a distributed array over mpi ranks in a given dim IF coarse gd != 1 and _processors[d]>1 -Dimension can be *replicated* on all ranks in dimension. Need a "replicated" option on GridCartesian etc.. - -This will give "slice" summation and fourier projection assistance. - - Generic concept is to subdivide (based on RD so applies to red/black or full). - Return a type on SUB-grid from CellSum TOP-grid - SUB-grid need not distribute but be replicated in some dims if that is how the - cartesian communicator works. - -Instead of subsetting - -iii) No general permutation map. +* NERSC Lattice loading, plaquette test ------- DONE single node - ? Cell definition <-> sliceSum. - ? Replicated arrays. 
+* Controling std::cout ------- DONE - - - -// Cartesian grid inheritance -// Grid::GridBase -// | -// __________|___________ -// | | -// Grid::GridCartesian Grid::GridCartesianRedBlack -// -// TODO: document the following as an API guaranteed public interface - - /* - * Rough map of functionality against QDP++ Layout - * - * Param | Grid | QDP++ - * ----------------------------------------- - * | | - * void | oSites, iSites, lSites | sitesOnNode - * void | gSites | vol - * | | - * gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP - * lcoor | | - * - * void | CheckerBoarded | - // No checkerboarded in QDP - * void | FullDimensions | lattSize - * void | GlobalDimensions | lattSize // No checkerboarded in QDP - * void | LocalDimensions | subgridLattSize - * void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP - * | | - * int x 3 | oiSiteRankToGlobal | siteCoords - * | ProcessorCoorLocalCoorToGlobalCoor | - * | | - * vector | GlobalCoorToRankIndex | nodeNumber(coord) - * vector | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord) - * | | - * void | Processors | logicalSize // returns cart array shape - * void | ThisRank | nodeNumber(); // returns this node rank - * void | ThisProcessorCoor | // returns this node coor - * void | isBoss(void) | primaryNode(); - * | | - * | RankFromProcessorCoor | getLogicalCoorFrom(node) - * | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord) - */ - // Work out whether to permute - // ABCDEFGH -> AE BF CG DH permute wrap num - // - // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0 - // Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1 - // Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2 - // Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3 - // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0 - // Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1 - // Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2 - // Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3 - - // Suppose 4way simd in one dim. - // ABCDEFGH -> AECG BFDH permute wrap num - - // Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0 - // Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1 - // Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0 - // Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1 - // Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0 - // Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1 - // Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0 - // Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1 - - // Generalisation to 8 way simd, 16 way simd required. - // - // Need log2 Nway masks. consisting of - // 1 bit 256 bit granule - // 2 bit 128 bit granule - // 4 bits 64 bit granule - // 8 bits 32 bit granules - // - // 15 bits.... - // TODO - // - // Base class to share common code between vRealF, VComplexF etc... - // - // lattice Broad cast assignment - // - // where() support - // implement with masks, and/or? Type of the mask & boolean support? - // - // Unary functions - // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg - // exp, log, sqrt, fabs - // - // transposeColor, transposeSpin, - // adjColor, adjSpin, - // traceColor, traceSpin. - // peekColor, peekSpin + pokeColor PokeSpin - // - // copyMask. - // - // localMaxAbs - // - // norm2, - // sumMulti equivalent. - // Fourier transform equivalent. 
- // diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc index ee11a982..762f7568 100644 --- a/lib/Grid_init.cc +++ b/lib/Grid_init.cc @@ -10,18 +10,57 @@ #include #include #include - +#include #include #undef __X86_64 +#define MAC + +#ifdef MAC +#include +#endif + namespace Grid { + std::streambuf *Grid_saved_stream_buf; +#if 0 + void Grid_quiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + int me; + MPI_Comm_rank(MPI_COMM_WORLD,&me); + std::streambuf* Grid_saved_stream_buf = std::cout.rdbuf(); + if ( me ) { + std::ofstream file("log.node"); + std::cout.rdbuf(file.rdbuf()); + } +#endif + } +#endif + void Grid_quiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + int me; + MPI_Comm_rank(MPI_COMM_WORLD,&me); + if ( me ) { + std::cout.setstate(std::ios::badbit); + } +#endif + } + void Grid_unquiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + std::cout.clear(); +#endif + } + void Grid_init(int *argc,char ***argv) { #ifdef GRID_COMMS_MPI MPI_Init(argc,argv); #endif Grid_debug_handler_init(); + Grid_quiesce_nodes(); } void Grid_finalize(void) { @@ -35,6 +74,10 @@ double usecond(void) { return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; } + +#define _NBACKTRACE (256) +void * Grid_backtrace_buffer[_NBACKTRACE]; + void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { printf("Caught signal %d\n",si->si_signo); @@ -43,10 +86,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) #ifdef __X86_64 ucontext_t * uc= (ucontext_t *)ptr; - struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; printf(" instruction %llx\n",(uint64_t)sc->rip); - #define REG(A) printf(" %s %lx\n",#A, sc-> A); REG(rdi); REG(rsi); @@ -68,14 +109,14 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) REG(r14); REG(r15); #endif - - fflush(stdout); - - if ( si->si_signo == SIGSEGV ) { - printf("Grid_sa_signal_handler: Oops... this was a sigsegv you naughty naughty programmer. 
Goodbye\n"); - fflush(stdout); - exit(-1); +#ifdef MAC + int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); + char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); + for (int i = 0; i < symbols; i++){ + printf ("%s\n", strings[i]); } +#endif + exit(0); return; }; diff --git a/lib/Grid_summation.h b/lib/Grid_summation.h index 12eeb2bc..c500c5f1 100644 --- a/lib/Grid_summation.h +++ b/lib/Grid_summation.h @@ -16,6 +16,96 @@ inline void subdivides(GridBase *coarse,GridBase *fine) } } +template +inline void projectBlockBasis(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData._grid; + GridBase * coarse= coarseData._grid; + int _ndimension = coarse->_ndimension; + + // checks + assert( nbasis == Basis.size() ); + subdivides(coarse,fine); + for(int i=0;i block_r (_ndimension); + + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; + } + + coarseData=zero; + + // Loop with a cache friendly loop ordering + for(int sf=0;sfoSites();sf++){ + + int sc; + std::vector coor_c(_ndimension); + std::vector coor_f(_ndimension); + GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + for(int i=0;i +inline void promoteBlockBasis(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData._grid; + GridBase * coarse= coarseData._grid; + int _ndimension = coarse->_ndimension; + + // checks + assert( nbasis == Basis.size() ); + subdivides(coarse,fine); + for(int i=0;i block_r (_ndimension); + + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; + } + + // Loop with a cache friendly loop ordering + for(int sf=0;sfoSites();sf++){ + + int sc; + std::vector coor_c(_ndimension); + std::vector coor_f(_ndimension); + + GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + for(int i=0;i @@ -30,10 +120,6 @@ inline void sumBlocks(Lattice &coarseData,const Lattice &fineData) std::vector block_r (_ndimension); - /////////////////////////////////////////////////////////// - // Detect whether the result is replicated in dimension d - /////////////////////////////////////////////////////////// - for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc index 35cad831..f9faf7ef 100644 --- a/tests/Grid_nersc_io.cc +++ b/tests/Grid_nersc_io.cc @@ -11,12 +11,15 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector simd_layout({1,1,2,2}); - std::vector mpi_layout ({1,1,1,1}); + std::vector mpi_layout ({2,2,2,2}); std::vector latt_size ({16,16,16,32}); + std::vector clatt_size ({4,4,4,8}); int orthodir=3; int orthosz =latt_size[orthodir]; GridCartesian Fine(latt_size,simd_layout,mpi_layout); + GridCartesian Coarse(clatt_size,simd_layout,mpi_layout); + GridRNG FineRNG(&Fine); LatticeGaugeField Umu(&Fine); @@ -40,6 +43,7 @@ int main (int argc, char ** argv) // (1+2+3)=6 = N(N-1)/2 terms LatticeComplex Plaq(&Fine); + LatticeComplex cPlaq(&Coarse); Plaq = zero; for(int mu=1;mu
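
======================================================================================================

Note on the Grid_init.cc changes: Grid_quiesce_nodes now discards std::cout on non-boss ranks by setting the badbit (the earlier variant that redirects rdbuf to a per-node "log.node" file is kept under #if 0), and the signal handler dumps a call stack via backtrace()/backtrace_symbols(), which are declared in <execinfo.h> (evidently the header guarded by the MAC block). A minimal standalone sketch of those two ingredients, assuming an MPI build; quiesce_nonboss and crash_handler are illustrative names, not the Grid API, and Grid itself installs a richer sigaction handler:

#include <mpi.h>
#include <execinfo.h>   // backtrace, backtrace_symbols
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Silence std::cout on every rank except the boss; std::cout.clear() undoes it,
// as Grid_unquiesce_nodes does.
static void quiesce_nonboss(void) {
  int me = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  if (me) std::cout.setstate(std::ios::badbit);
}

// Dump the call stack and exit. backtrace_symbols is not async-signal-safe,
// but is adequate for a one-shot crash report, which is how the patch uses it.
static void crash_handler(int sig) {
  void *frames[256];
  int n = backtrace(frames, 256);
  char **symbols = backtrace_symbols(frames, n);
  for (int i = 0; i < n; i++) std::printf("%s\n", symbols[i]);
  std::free(symbols);
  std::exit(sig);
}

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  quiesce_nonboss();
  std::signal(SIGSEGV, crash_handler);
  std::cout << "only the boss rank prints this" << std::endl;
  MPI_Finalize();
  return 0;
}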
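
Note on the Grid_summation.h additions: projectBlockBasis restricts a fine lattice onto nbasis coarse components per block (the inner product of each basis vector with the fine data, accumulated over the sites of the block), and promoteBlockBasis is the adjoint map back to the fine grid, a per-site linear combination of the basis vectors weighted by the coarse components. A minimal standalone model of the same arithmetic on flat std::complex arrays with one blocked dimension; block_project/block_promote and Cplx are hypothetical names for illustration, not the Grid types:

#include <algorithm>
#include <cassert>
#include <complex>
#include <vector>

using Cplx = std::complex<double>;

// coarse[c*nbasis + i] = sum over fine sites f in block c of conj(basis[i][f]) * fine[f]
void block_project(std::vector<Cplx> &coarse,              // size (L/B)*nbasis
                   const std::vector<Cplx> &fine,          // size L
                   const std::vector<std::vector<Cplx>> &basis,
                   int L, int B) {
  const int nbasis = (int)basis.size();
  assert(L % B == 0);
  std::fill(coarse.begin(), coarse.end(), Cplx(0.0));
  for (int f = 0; f < L; f++) {
    int c = f / B;                                         // block owning this fine site
    for (int i = 0; i < nbasis; i++)
      coarse[c * nbasis + i] += std::conj(basis[i][f]) * fine[f];
  }
}

// fine[f] = sum_i coarse[c(f)*nbasis + i] * basis[i][f]   (adjoint of the projection)
void block_promote(const std::vector<Cplx> &coarse,
                   std::vector<Cplx> &fine,
                   const std::vector<std::vector<Cplx>> &basis,
                   int L, int B) {
  const int nbasis = (int)basis.size();
  for (int f = 0; f < L; f++) {
    int c = f / B;
    fine[f] = Cplx(0.0);
    for (int i = 0; i < nbasis; i++)
      fine[f] += coarse[c * nbasis + i] * basis[i][f];
  }
}

If the basis vectors are orthonormal within each block, block_promote(block_project(fine)) reproduces the component of fine lying in the span of the basis, which is the property a multigrid transfer relies on.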
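
Note on the shift tables removed from the TODO: they record how a Cshift interacts with the SIMD layout. For a dimension of local extent rd*Nsimd stored as rd outer sites of Nsimd lanes, a shift s splits into num = s % rd and wrap = s / rd, and in the two-lane table an outer site o needs its lanes permuted when exactly one of "o + num crosses rd" and "wrap is odd" holds. A small check that reproduces the permute/wrap/num columns of the first table (rd = 4, two lanes); this is a reading of the table, not the Grid stencil code:

#include <cstdio>

int main(void) {
  const int rd = 4;                         // outer (reduced) sites per lane
  for (int s = 0; s < 8; s++) {
    int num  = s % rd;                      // offset within the outer sites
    int wrap = s / rd;                      // times the shift crosses the lane boundary
    std::printf("Shift %d  permute flags:", s);
    for (int o = 0; o < rd; o++) {
      int crosses = (o + num) >= rd;        // this outer site pulls data from the other lane segment
      int permute = (crosses + wrap) & 1;   // permute when exactly one condition holds
      std::printf(" %d", permute);
    }
    std::printf("   wrap %d num %d\n", wrap, num);
  }
  return 0;
}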