diff --git a/TODO b/TODO index 256f2609..12e33302 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,77 @@ +* - BinaryWriter, TextWriter etc... + - protocol buffers? replace xml + +* Stencil operator support -----Initial thoughts, trial implementation DONE. + -----some simple tests that Stencil matches Cshift. + -----do all permute in comms phase, so that copy permute + -----cases move into a buffer. + -----allow transform in/out buffers spproj + + +* CovariantShift support -----Use a class to store gauge field? (parallel transport?) + +* Consider switch std::vector to boost arrays or something lighter weight + boost::multi_array A()... to replace multi1d, multi2d etc.. + +* How to define simple matrix operations, such as flavour matrices? + +* Make the Tensor types and Complex etc... play more nicely. + +* Dirac, Pauli, SU subgroup, etc.. * Gamma/Dirac structures + +* Fourspin, two spin project + +* su3 exponentiation & log etc.. [Jamie's code?] + TaProj + +* Parallel MPI2 IO + +* rb4d support. + +* Check for missing functionality - partially audited against QDP++ layout + +* Optimise the extract/merge SIMD routines; Azusa?? + - I have collated into single location at least. + - Need to use _mm_*insert/extract routines. + +* Conformable test in Cshift routines. + +* QDP++ regression suite and comparative benchmark + +AUDITS: + +* FIXME audit +* const audit +* Replace vset with a call to merge.; +* care in Gmerge,Gextract over vset . +* extract / merge extra implementation removal +* Test infrastructure + + // TODO + // + // Base class to share common code between vRealF, VComplexF etc... + // + // Unary functions + // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg + // exp, log, sqrt, fabs + // + // transposeColor, transposeSpin, + // adjColor, adjSpin, + // + // copyMask. + // + // localMaxAbs + // + // Fourier transform equivalent. + +* LinearOperator + + LinearSolver + + Polynomial etc... 
+ +====================================================================================================== + FUNCTIONALITY: * Conditional execution, where etc... -----DONE, simple test * Integer relational support -----DONE @@ -22,222 +96,24 @@ FUNCTIONALITY: - lib/qcd/actions - lib/qcd/measurements +* Subset support, slice sums etc... -----DONE + sliceSum(orthog) + sum + innerProduct + norm2 -Not done, or just incomplete -* random number generation +* Subgrid Transferral -----DONE + subBlock (coarseLattice,fineLattice) + projectBlockBasis + promoteBlockBasis -* Consider switch std::vector to boost arrays or something lighter weight - boost::multi_array A()... to replace multi1d, multi2d etc.. +* random number generation ----- DONE -* How to define simple matrix operations, such as flavour matrices? - -* Dirac, Pauli, SU subgroup, etc.. * Gamma/Dirac structures - -* Fourspin, two spin project - -* su3 exponentiation, log etc.. [Jamie's code?] - -* Stencil operator support -----Initial thoughts, trial implementation DONE. - -----some simple tests that Stencil matches Cshift. - -----do all permute in comms phase, so that copy permute - -----cases move into a buffer. - -----allow transform in/out buffers spproj - -* CovariantShift support -----Use a class to store gauge field? (parallel transport?) - -* Subset support, slice sums etc... -----Only need slice sum? - -----Generic cartesian subslicing? - -----Array ranges / boost extents? - -----Multigrid grid transferral? - -----Suggests generalised cartesian subblocking - sums, returning modified grid? - -----What should interface be? - -* Grid transferral - * pickCheckerboard, pickSubPlane, pickSubBlock, - * sumSubPlane, sumSubBlocks - -* rb4d support. - -* Check for missing functionality - partially audited against QDP++ layout - -* Optimise the extract/merge SIMD routines; Azusa?? - - - I have collated into single location at least. - - Need to use _mm_*insert/extract routines. 
- -* Conformable test in Cshift routines. - - - -* Broadcast, reduction tests. innerProduct, localInnerProduct - -* QDP++ regression suite and comparative benchmark +* Broadcast, reduction tests. innerProduct, localInnerProduct --- DONE * I/O support - -* NERSC Lattice loading, plaquette test - - - MPI IO? - - BinaryWriter, TextWriter etc... - - protocol buffers? - -AUDITS: -// Lattice support audit Tested in Grid_main.cc -// -// -=,+=,*= Y -// add,+,sub,-,mult,mac,* Y -// innerProduct,norm2 Y -// localInnerProduct,outerProduct, Y -// adj,conj Y -// transpose, Y -// trace Y -// -// transposeIndex Y -// traceIndex Y -// peekIndex Y -// -// real,imag missing, semantic thought needed on real/im support. -// perhaps I just keep everything complex? -// - -* FIXME audit -* const audit -* Replace vset with a call to merge.; -* care in Gmerge,Gextract over vset . -* extract / merge extra implementation removal -* Test infrastructure - -[ More on subsets and grid transfers ] -i) Three classes of subset; red black parity subsetting (pick checkerboard). - cartesian sub-block subsetting - rbNd - -ii) Need to be able to project one Grid to another Grid. - -Lattice coarse_data SubBlockSum (GridBase *CoarseGrid, Lattice &fine_data) - -Operation ensure either: - rd[dim] divide rd[dim] fine_data - -This will give a distributed array over mpi ranks in a given dim IF coarse gd != 1 and _processors[d]>1 -Dimension can be *replicated* on all ranks in dimension. Need a "replicated" option on GridCartesian etc.. - -This will give "slice" summation and fourier projection assistance. - - Generic concept is to subdivide (based on RD so applies to red/black or full). - Return a type on SUB-grid from CellSum TOP-grid - SUB-grid need not distribute but be replicated in some dims if that is how the - cartesian communicator works. - -Instead of subsetting - -iii) No general permutation map. +* NERSC Lattice loading, plaquette test ------- DONE single node - ? Cell definition <-> sliceSum. - ? 
Replicated arrays. +* Controlling std::cout ------- DONE - - - -// Cartesian grid inheritance -// Grid::GridBase -// | -// __________|___________ -// | | -// Grid::GridCartesian Grid::GridCartesianRedBlack -// -// TODO: document the following as an API guaranteed public interface - - /* - * Rough map of functionality against QDP++ Layout - * - * Param | Grid | QDP++ - * ----------------------------------------- - * | | - * void | oSites, iSites, lSites | sitesOnNode - * void | gSites | vol - * | | - * gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP - * lcoor | | - * - * void | CheckerBoarded | - // No checkerboarded in QDP - * void | FullDimensions | lattSize - * void | GlobalDimensions | lattSize // No checkerboarded in QDP - * void | LocalDimensions | subgridLattSize - * void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP - * | | - * int x 3 | oiSiteRankToGlobal | siteCoords - * | ProcessorCoorLocalCoorToGlobalCoor | - * | | - * vector | GlobalCoorToRankIndex | nodeNumber(coord) - * vector | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord) - * | | - * void | Processors | logicalSize // returns cart array shape - * void | ThisRank | nodeNumber(); // returns this node rank - * void | ThisProcessorCoor | // returns this node coor - * void | isBoss(void) | primaryNode(); - * | | - * | RankFromProcessorCoor | getLogicalCoorFrom(node) - * | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord) - */ - // Work out whether to permute - // ABCDEFGH -> AE BF CG DH permute wrap num - // - // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0 - // Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1 - // Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2 - // Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3 - // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0 - // Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1 - // Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2 - // Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3 - - // Suppose 4way simd in one dim. 
- // ABCDEFGH -> AECG BFDH permute wrap num - - // Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0 - // Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1 - // Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0 - // Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1 - // Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0 - // Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1 - // Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0 - // Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1 - - // Generalisation to 8 way simd, 16 way simd required. - // - // Need log2 Nway masks. consisting of - // 1 bit 256 bit granule - // 2 bit 128 bit granule - // 4 bits 64 bit granule - // 8 bits 32 bit granules - // - // 15 bits.... - // TODO - // - // Base class to share common code between vRealF, VComplexF etc... - // - // lattice Broad cast assignment - // - // where() support - // implement with masks, and/or? Type of the mask & boolean support? - // - // Unary functions - // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg - // exp, log, sqrt, fabs - // - // transposeColor, transposeSpin, - // adjColor, adjSpin, - // traceColor, traceSpin. - // peekColor, peekSpin + pokeColor PokeSpin - // - // copyMask. - // - // localMaxAbs - // - // norm2, - // sumMulti equivalent. - // Fourier transform equivalent. 
- // diff --git a/lib/Grid_init.cc b/lib/Grid_init.cc index ee11a982..762f7568 100644 --- a/lib/Grid_init.cc +++ b/lib/Grid_init.cc @@ -10,18 +10,57 @@ #include #include #include - +#include #include #undef __X86_64 +#define MAC + +#ifdef MAC +#include +#endif + namespace Grid { + std::streambuf *Grid_saved_stream_buf; +#if 0 + void Grid_quiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + int me; + MPI_Comm_rank(MPI_COMM_WORLD,&me); + std::streambuf* Grid_saved_stream_buf = std::cout.rdbuf(); + if ( me ) { + std::ofstream file("log.node"); + std::cout.rdbuf(file.rdbuf()); + } +#endif + } +#endif + void Grid_quiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + int me; + MPI_Comm_rank(MPI_COMM_WORLD,&me); + if ( me ) { + std::cout.setstate(std::ios::badbit); + } +#endif + } + void Grid_unquiesce_nodes(void) + { +#ifdef GRID_COMMS_MPI + std::cout.clear(); +#endif + } + void Grid_init(int *argc,char ***argv) { #ifdef GRID_COMMS_MPI MPI_Init(argc,argv); #endif Grid_debug_handler_init(); + Grid_quiesce_nodes(); } void Grid_finalize(void) { @@ -35,6 +74,10 @@ double usecond(void) { return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; } + +#define _NBACKTRACE (256) +void * Grid_backtrace_buffer[_NBACKTRACE]; + void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { printf("Caught signal %d\n",si->si_signo); @@ -43,10 +86,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) #ifdef __X86_64 ucontext_t * uc= (ucontext_t *)ptr; - struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; printf(" instruction %llx\n",(uint64_t)sc->rip); - #define REG(A) printf(" %s %lx\n",#A, sc-> A); REG(rdi); REG(rsi); @@ -68,14 +109,14 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) REG(r14); REG(r15); #endif - - fflush(stdout); - - if ( si->si_signo == SIGSEGV ) { - printf("Grid_sa_signal_handler: Oops... this was a sigsegv you naughty naughty programmer. 
Goodbye\n"); - fflush(stdout); - exit(-1); +#ifdef MAC + int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); + char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); + for (int i = 0; i < symbols; i++){ + printf ("%s\n", strings[i]); } +#endif + exit(0); return; }; diff --git a/lib/Grid_summation.h b/lib/Grid_summation.h index 12eeb2bc..c500c5f1 100644 --- a/lib/Grid_summation.h +++ b/lib/Grid_summation.h @@ -16,6 +16,96 @@ inline void subdivides(GridBase *coarse,GridBase *fine) } } +template +inline void projectBlockBasis(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData._grid; + GridBase * coarse= coarseData._grid; + int _ndimension = coarse->_ndimension; + + // checks + assert( nbasis == Basis.size() ); + subdivides(coarse,fine); + for(int i=0;i block_r (_ndimension); + + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; + } + + coarseData=zero; + + // Loop with a cache friendly loop ordering + for(int sf=0;sfoSites();sf++){ + + int sc; + std::vector coor_c(_ndimension); + std::vector coor_f(_ndimension); + GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + for(int i=0;i +inline void promoteBlockBasis(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData._grid; + GridBase * coarse= coarseData._grid; + int _ndimension = coarse->_ndimension; + + // checks + assert( nbasis == Basis.size() ); + subdivides(coarse,fine); + for(int i=0;i block_r (_ndimension); + + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; + } + + // Loop with a cache friendly loop ordering + for(int sf=0;sfoSites();sf++){ + + int sc; + std::vector coor_c(_ndimension); + std::vector coor_f(_ndimension); + + 
GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + for(int i=0;i @@ -30,10 +120,6 @@ inline void sumBlocks(Lattice &coarseData,const Lattice &fineData) std::vector block_r (_ndimension); - /////////////////////////////////////////////////////////// - // Detect whether the result is replicated in dimension d - /////////////////////////////////////////////////////////// - for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc index 35cad831..f9faf7ef 100644 --- a/tests/Grid_nersc_io.cc +++ b/tests/Grid_nersc_io.cc @@ -11,12 +11,15 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector simd_layout({1,1,2,2}); - std::vector mpi_layout ({1,1,1,1}); + std::vector mpi_layout ({2,2,2,2}); std::vector latt_size ({16,16,16,32}); + std::vector clatt_size ({4,4,4,8}); int orthodir=3; int orthosz =latt_size[orthodir]; GridCartesian Fine(latt_size,simd_layout,mpi_layout); + GridCartesian Coarse(clatt_size,simd_layout,mpi_layout); + GridRNG FineRNG(&Fine); LatticeGaugeField Umu(&Fine); @@ -40,6 +43,7 @@ int main (int argc, char ** argv) // (1+2+3)=6 = N(N-1)/2 terms LatticeComplex Plaq(&Fine); + LatticeComplex cPlaq(&Coarse); Plaq = zero; for(int mu=1;mu