From 94b8fb56862289c0663453c0e2d82fa6da310f38 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Sun, 19 Nov 2017 01:39:04 +0000
Subject: [PATCH 1/7] Debug in progress

---
 lib/communicator/Communicator_base.cc | 47 ++++++++++++-------
 lib/communicator/Communicator_mpi3.cc |  2 +-
 lib/lattice/Lattice_transfer.h        | 67 +++++++++++++++++++++++++++
 tests/solver/Test_dwf_mrhs_cg_mpi.cc  |  6 +--
 4 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 531dd358..223b07fd 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -134,8 +134,18 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
 {
   _ndimension = processors.size();
-  assert(_ndimension = parent._ndimension);
-
+
+  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  std::vector<int> parent_processor_coor(_ndimension,0);
+  std::vector<int> parent_processors    (_ndimension,1);
+
+  // Can make 5d grid from 4d etc...
+  int pad = _ndimension-parent_ndimension;
+  for(int d=0;d<parent_ndimension;d++){
+    parent_processor_coor[pad+d]=parent._processor_coor[d];
+    parent_processors    [pad+d]=parent._processors[d];
+  }
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   std::vector<int> ssize(_ndimension); // coor of split within parent
   for(int d=0;d<_ndimension;d++){
-    ccoor[d] = parent._processor_coor[d] % processors[d];
-    scoor[d] = parent._processor_coor[d] / processors[d];
-    ssize[d] = parent._processors[d]     / processors[d];
+    ccoor[d] = parent_processor_coor[d] % processors[d];
+    scoor[d] = parent_processor_coor[d] / processors[d];
+    ssize[d] = parent_processors[d]     / processors[d];
   }
   int crank;  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
   // Mpi uses the reverse Lexico convention to us
@@ -166,38 +176,34 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   MPI_Comm comm_split;
   if ( Nchild > 1 ) {
-    /*
-    std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   // Set up from the new split communicator
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   InitFromMPICommunicator(processors,comm_split);
+
+  std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
+  for(int d=0;d

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ ... @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 // NB: Easiest to programme if keep in lex order.
 // /////////////////////////////////////////////////////
+/*
+
+[0,0,0,0,0] S {V<4>{V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)}}}
+[0,0,0,0,1] S {V<4>{V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)}}}
+[0,0,0,0,2] S {V<4>{V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)}}}
+[0,0,0,0,3] S {V<4>{V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)}}}
+[0,0,0,0,4] S {V<4>{V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)}}}
+[0,0,0,0,5] S {V<4>{V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)}}}
+[0,0,0,0,6] S {V<4>{V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)}}}
+[0,0,0,0,7] S {V<4>{V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)}}}
+[0,0,0,0,8] S {V<4>{V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)}}}
+[0,0,0,0,9] S {V<4>{V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)}}}
+[0,0,0,0,10] S {V<4>{V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)}}}
+[0,0,0,0,11] S {V<4>{V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)}}}
+[0,0,0,0,12] S {V<4>{V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)}}}
+[0,0,0,0,13] S {V<4>{V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)}}}
+[0,0,0,0,14] S {V<4>{V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)}}}
+[0,0,0,0,15] S {V<4>{V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)}}}
+
+
+Process decomp
+[A(0 1) A(2 3) B(0 1) B(2 3)] [ A(4 5) A(6 7) B(4 5) B(6 7)] [ A(8 9) A(10 11) B(8 9) B(10 11)] [A(12 13) A(14 15) B(12 13) B(14 15)]
+
+A2A(Full)
+ -- divides M*fL into fP segments of size M*fL/fP = fL/sP
+ -- total is fP * fL/sP = M * fL
+ A(0 1) A(4 5) A(8 9) A(12 13)
+ A(2 3) A(6 7) A(10 11) A(14 15)
+ B(0 1) B(4 5) B(8 9) B(12 13)
+ B(2 3) B(6 7) B(10 11) B(14 15)
+
+
+A2A(Split)
+ A(0 1) A(4 5) A(2 3) A(6 7)
+ A(8 9) A(12 13) A(10 11) A(14 15)
+ B(0 1) B(2 3) B(4 5) B(6 7)
+ B(8 9) B(10 11) B(12 13) B(14 15)
+
+--------------------
+-- General case
+--------------------
+G global lattice
+fP - procs
+sP - Procs in split grid
+M - subdivisions/vectors - M*sP = fP ** constraint 1
+fL = G/fP per node (full)
+sL = G/sP per node split
+
+[ G * M ] total = G*fP/sP
+[ Subdivide fL*M by fP => fL*M/fP = fL/fP * fP/sP = fL/sP ]
+--------------------
+-- 1st A2A chunk is fL*M/fP = G/fP * fP/sP / fP = fL/sP
+-- Let cL = fL/sP chunk. ( Divide into fP/sP = M chunks )
+
+-- node 0 1st cL of node 0,1,... fP-1 ; vector 0
+-- node 1 2nd cL of node 0,1,... fP-1
+-- node 2 3rd cL of node 0,1,... fP-1
+-- node 3 4th cL of node 0,1,... fP-1
+... when node > sP get vector 1 etc...
+
+-- 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
+-- node 0 1st cL of node 0M..(1M-1); 2nd cL of node 0M..(1M-1)..
+-- node 1 1st cL of node 1M..(2M-1); 2nd cL of node 1M..(2M-1)..
+-- node 2 1st cL of node 2M..(3M-1); 2nd cL of node 2M..(3M-1)..
+-- node 3 1st cL of node 3M..(4M-1); 2nd cL of node 3M..(4M-1)..
+--
+-- Insert correctly
+ */
 template<class Vobj>
 void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
 {

diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc
index f640edff..d380f91e 100644
--- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc
+++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc
@@ -95,7 +95,7 @@ int main (int argc, char ** argv)
   FermionField tmp(FGrid);
   for(int s=0;s<nrhs;s++){
@@ ... @@ int main (int argc, char ** argv)
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
-  ConjugateGradient<FermionField> CG((1.0e-5),10000);
+  ConjugateGradient<FermionField> CG((1.0e-2),10000);
   s_res = zero;
   CG(HermOp,s_src,s_res);

From e55397bc134ead26dbac8e2ef244406a8b9d6a3b Mon Sep 17 00:00:00 2001
From: azusayamaguchi
Date: Fri, 24 Nov 2017 14:18:30 +0000
Subject: [PATCH 2/7] Staggered cg

---
 lib/algorithms/LinearOperator.h          | 20 +++++++++++---------
 lib/algorithms/iterative/SchurRedBlack.h |  9 +++++++--
 lib/lattice/Lattice_transfer.h           | 16 ++++++----------
 tests/solver/Test_staggered_cg_schur.cc  | 14 ++++++++++++++
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 0fa039c8..26746e6e 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -308,32 +308,34 @@ namespace Grid {
     public:
       SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+        GridLogIterative.TimingMode(1);
+        std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
@@ ... @@
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
   half.checkerboard = cb;
-  int ssh=0;
-  //parallel_for
-  for(int ss=0;ss<full._grid->oSites();ss++){
-    std::vector<int> coor;
+
+  parallel_for(int ss=0;ss<full._grid->oSites();ss++){
     int cbos;
-
+    std::vector<int> coor;
     full._grid->oCoorFromOindex(coor,ss);
     cbos=half._grid->CheckerBoard(coor);

     if (cbos==cb) {
+      int ssh=half._grid->oIndex(coor);
       half._odata[ssh] = full._odata[ss];
-      ssh++;
     }
   }
 }
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
   int cb = half.checkerboard;
-  int ssh=0;
-  //parallel_for
-  for(int ss=0;ss<full._grid->oSites();ss++){
+  parallel_for(int ss=0;ss<full._grid->oSites();ss++){
     std::vector<int> coor;
     int cbos;
@@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
     cbos=half._grid->CheckerBoard(coor);

     if (cbos==cb) {
+      int ssh=half._grid->oIndex(coor);
       full._odata[ss]=half._odata[ssh];
-      ssh++;
     }
   }
 }

diff --git a/tests/solver/Test_staggered_cg_schur.cc b/tests/solver/Test_staggered_cg_schur.cc
index 09044995..a5c25b85 100644
--- a/tests/solver/Test_staggered_cg_schur.cc
+++ b/tests/solver/Test_staggered_cg_schur.cc
@@ -70,7 +70,21 @@ int main (int argc, char ** argv)
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);

+  double volume=1.0;
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt_size[mu]; // flops scale as volume * 1146
+
+  double ncall=CG.IterationsToComplete;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 16*66 + 90 == 1146 per site per iteration
+
+  std::cout<

Subject: [PATCH 3/7]

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   InitFromMPICommunicator(processors,comm_split);
-
-  std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
-  for(int d=0;d
@@ ... @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
   MPI_Comm_rank(communicator,&_processor);
   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

-  if ( communicator_base != communicator_world ) {
+  if ( 0 && (communicator_base != communicator_world) ) {
     std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
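The counting in the patch 1 comment block is easy to verify numerically before reading the reindexing loops that land in the next patch. Below is a minimal standalone check of "constraint 1" and the chunk sizes, using the toy sizes implied by the printed 16-site dump (a hypothetical helper program, not part of the series):

#include <cassert>
#include <iostream>

int main(void){
  // Toy sizes from the 16-site dump: split a 4-rank grid into 2-rank subgrids.
  const int G  = 16;        // global sites along the dimension being collapsed
  const int fP = 4;         // ranks in the full grid
  const int sP = 2;         // ranks in the split grid
  const int M  = fP/sP;     // subdivisions == vectors folded per split rank
  assert(M*sP == fP);       // ** constraint 1 from the comment

  const int fL = G/fP;      // local sites per full-grid rank
  const int sL = G/sP;      // local sites per split-grid rank
  const int cL = (M*fL)/fP; // 1st A2A chunk: M*fL words cut into fP segments

  assert(cL == fL/sP);      // matches cL = fL/sP in the comment
  assert(sL == M*fL);       // split-local volume grows by the subdivision factor
  assert(fP*cL == M*fL);    // an all-to-all conserves the words held per rank

  std::cout << "fL="<<fL<<" sL="<<sL<<" M="<<M<<" cL="<<cL<<std::endl;
  return 0;
}

For the dump this prints fL=4 sL=8 M=2 cL=2, i.e. each first-stage all-to-all moves pairs of sites, exactly the A(0 1), A(2 3), ... pairs shown in the tables.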
Date: Mon, 27 Nov 2017 12:33:08 +0000
Subject: [PATCH 4/7] Believe split/unsplit works, but need to make pretty

---
 lib/lattice/Lattice_transfer.h | 201 ++++++++++++++++++++++-----------
 1 file changed, 133 insertions(+), 68 deletions(-)

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index dd03fb4f..3d9289d6 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -890,50 +890,85 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
     if ( ratio[d] != 1 ) {

       full_grid ->AllToAll(d,alldata,tmpdata);
-      // std::cout << GridLogMessage << "Grid_split: dim " <<d<<" full "<<full_grid->_processors[d]<<std::endl;
+      if ( split_grid->_processors[d] > 1 ) {
+        alldata=tmpdata;
+        split_grid->AllToAll(d,alldata,tmpdata);
+      }

-      auto rdims = ldims; rdims[d] *= ratio[d];
-      auto rsites= lsites*ratio[d];
-      for(int v=0;v<nvec;v++){
+      int sP = split_grid->_processors[d];
+      int fP = full_grid->_processors[d];

-      for(int r=0;r<ratio[d];r++){

-      if ( split_grid->_processors[d] > 1 ) {
-        tmpdata = alldata;
-        split_grid->AllToAll(d,tmpdata,alldata);
-      }
     }
   }
   vectorizeFromLexOrdArray(alldata,split);
@@ -1008,55 +1043,84 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   std::vector<int> rcoor(ndim);

   int nvec = 1;
-  lsites = split_grid->lSites();
-  std::vector<int> ldims = split_grid->_ldimensions;
+  uint64_t rsites = split_grid->lSites();
+  std::vector<int> rdims = split_grid->_ldimensions;

-  //  for(int d=ndim-1;d>=0;d--){
   for(int d=0;d<ndim;d++){

+      int sP = split_grid->_processors[d];
+      int fP = full_grid->_processors[d];

-      if ( split_grid->_processors[d] > 1 ) {
-        tmpdata = alldata;
-        split_grid->AllToAll(d,tmpdata,alldata);
-      }
+      int M = ratio[d];
+      auto ldims = rdims; ldims[d] /= M;   // Decrease local dims by same factor
+      auto lsites= rsites/M;               // Decreases rsites by M

-      //////////////////////////////////////////
-      //Local volume for this dimension is expanded by ratio of processor extents
-      // Number of vectors is decreased by same factor
-      // Rearrange to lexico for bigger volume
-      //////////////////////////////////////////
-      auto rsites= lsites/ratio[d];
-      auto rdims = ldims; rdims[d]/=ratio[d];
+      int fvol   = lsites;
+      int svol   = rsites;
+      int chunk  = (nvec*fvol)/sP;
+      int cL     = (nvec*ldims[d])/sP;
+
+      for(int c=0;c<chunk;c++){
+        if ( lex_c >= tmpdata.size() ) {

-      // rsite, rcoor --> smaller local volume
-      // lsite, lcoor --> bigger original (single node?) volume
-      // For loop over each site within smaller subvol
-      for(int rsite=0;rsite<rsites;rsite++){

+      if ( split_grid->_processors[d] > 1 ) {
+        split_grid->AllToAll(d,tmpdata,alldata);
+        tmpdata=alldata;
+      }
+      full_grid ->AllToAll(d,tmpdata,alldata);
+
+      rdims[d]/= M;
+      rsites /= M;
+      nvec   *= M;  // Increase nvec by subdivision factor
+    }
   }
 }
@@ -1064,12 +1128,13 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   for(int v=0;v<nvec;v++){
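Patch 4's loops lean on Grid's Lexicographic helpers to move a site from the small local volume into slab m of the enlarged one. Here is a self-contained sketch of that round trip, with stand-in implementations assuming the usual x-fastest ordering (the real helpers live in Grid's Lexicographic class; these are illustrative rewrites, not the library code):

#include <cassert>
#include <vector>

// Stand-ins for Lexicographic::CoorFromIndex / IndexFromCoor (x-fastest).
static void CoorFromIndex(std::vector<int> &coor,int index,const std::vector<int> &dims){
  for(size_t d=0;d<dims.size();d++){
    coor[d] = index % dims[d];
    index   = index / dims[d];
  }
}
static void IndexFromCoor(const std::vector<int> &coor,int &index,const std::vector<int> &dims){
  index=0;
  for(int d=(int)dims.size()-1;d>=0;d--) index = index*dims[d] + coor[d];
}

int main(void){
  // One reindex step from the patch: site lex_fvol in the small volume ldims
  // is placed into slab m of the enlarged volume rdims, rdims[d] = M*ldims[d].
  const int d=1, M=2;
  std::vector<int> ldims = {4,4};
  std::vector<int> rdims = {4,8};
  for(int m=0;m<M;m++){
    for(int lex_fvol=0;lex_fvol<16;lex_fvol++){
      std::vector<int> coor(2);
      int lex_r;
      CoorFromIndex(coor,lex_fvol,ldims);
      coor[d] += m*ldims[d];            // shift into slab m along dimension d
      IndexFromCoor(coor,lex_r,rdims);
      assert(lex_r == lex_fvol + m*16); // slabs stack contiguously in lex order
    }
  }
  return 0;
}

Because the shifted dimension is the slowest one here, each vector's copy lands in a contiguous slab, which is what lets the final patch replace the per-site coordinate bookkeeping with pure chunk arithmetic.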
volume - // For loop over each site within smaller subvol - for(int rsite=0;rsiteAllToAll(d,tmpdata,alldata); + if ( split_grid->_processors[d] > 1 ) { + split_grid->AllToAll(d,tmpdata,alldata); + tmpdata=alldata; + } + full_grid ->AllToAll(d,tmpdata,alldata); + + rdims[d]/= M; + rsites /= M; + nvec *= M; // Increase nvec by subdivision factor + } } } @@ -1064,12 +1128,13 @@ void Grid_unsplit(std::vector > & full,Lattice & split) for(int v=0;v Date: Mon, 27 Nov 2017 12:34:25 +0000 Subject: [PATCH 5/7] Clean on multinode target after split 1 1 2 4 -> 1 1 2 2 --- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index d380f91e..b3611e01 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -121,12 +121,12 @@ int main (int argc, char ** argv) random(pRNG5,src[s]); tmp = 100.0*s; src[s] = (src[s] * 0.1) + tmp; - std::cout << " src ["< Date: Mon, 27 Nov 2017 15:10:22 +0000 Subject: [PATCH 6/7] Debug --- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index b3611e01..06df58c6 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -173,6 +173,7 @@ int main (int argc, char ** argv) // std::cout << " s_src \n" << s_src << std::endl; // std::cout << " s_src_tmp \n" << s_src_tmp << std::endl; // std::cout << " s_src_diff \n" << s_src_diff << std::endl; + // exit(0); #endif /////////////////////////////////////////////////////////////// From 28ceacec45e052578cf6b4fa1f394c87f417d1d2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 27 Nov 2017 15:13:29 +0000 Subject: [PATCH 7/7] Split/Unsplit working --- lib/lattice/Lattice_transfer.h | 275 ++++++++------------------------- 1 file changed, 65 insertions(+), 210 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 78b80ba4..c7e2a507 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -694,30 +694,6 @@ void precisionChange(Lattice &out, const Lattice &in){ //////////////////////////////////////////////////////////////////////////////// // Communicate between grids //////////////////////////////////////////////////////////////////////////////// -// -// All to all plan -// -// Subvolume on fine grid is v. Vectors a,b,c,d -// -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SIMPLEST CASE: -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Mesh of nodes (2) ; subdivide to 1 subdivisions -// -// Lex ord: -// N0 va0 vb0 N1 va1 vb1 -// -// For each dimension do an all to all -// -// full AllToAll(0) -// N0 va0 va1 N1 vb0 vb1 -// -// REARRANGE -// N0 va01 N1 vb01 -// -// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract". -// NB: Easiest to programme if keep in lex order. -// /////////////////////////////////////////////////////////////////////////////////////////////////////////// // SIMPLE CASE: /////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -751,75 +727,16 @@ void precisionChange(Lattice &out, const Lattice &in){ // // Must also rearrange data to get into the NEW lex order of grid at each stage. 
From 28ceacec45e052578cf6b4fa1f394c87f417d1d2 Mon Sep 17 00:00:00 2001
From: paboyle
Date: Mon, 27 Nov 2017 15:13:29 +0000
Subject: [PATCH 7/7] Split/Unsplit working

---
 lib/lattice/Lattice_transfer.h | 275 ++++++++------------------------
 1 file changed, 65 insertions(+), 210 deletions(-)

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 78b80ba4..c7e2a507 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -694,30 +694,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
-//
-// All to all plan
-//
-// Subvolume on fine grid is v. Vectors a,b,c,d
-//
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// SIMPLEST CASE:
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Mesh of nodes (2) ; subdivide to 1 subdivisions
-//
-// Lex ord:
-//          N0 va0 vb0 N1 va1 vb1
-//
-// For each dimension do an all to all
-//
-// full AllToAll(0)
-//          N0 va0 va1 N1 vb0 vb1
-//
-// REARRANGE
-//          N0 va01 N1 vb01
-//
-// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
-// NB: Easiest to programme if keep in lex order.
-//
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // SIMPLE CASE:
@@ -751,75 +727,16 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 //
 // Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
 // NB: Easiest to programme if keep in lex order.
-//
-/////////////////////////////////////////////////////////
-/*
-
-[0,0,0,0,0] S {V<4>{V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)}}}
-[0,0,0,0,1] S {V<4>{V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)}}}
-[0,0,0,0,2] S {V<4>{V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)}}}
-[0,0,0,0,3] S {V<4>{V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)}}}
-[0,0,0,0,4] S {V<4>{V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)}}}
-[0,0,0,0,5] S {V<4>{V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)}}}
-[0,0,0,0,6] S {V<4>{V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)}}}
-[0,0,0,0,7] S {V<4>{V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)}}}
-[0,0,0,0,8] S {V<4>{V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)}}}
-[0,0,0,0,9] S {V<4>{V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)}}}
-[0,0,0,0,10] S {V<4>{V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)}}}
-[0,0,0,0,11] S {V<4>{V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)}}}
-[0,0,0,0,12] S {V<4>{V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)}}}
-[0,0,0,0,13] S {V<4>{V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)}}}
-[0,0,0,0,14] S {V<4>{V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)}}}
-[0,0,0,0,15] S {V<4>{V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)}}}
-
-
-Process decomp
-[A(0 1) A(2 3) B(0 1) B(2 3)] [ A(4 5) A(6 7) B(4 5) B(6 7)] [ A(8 9) A(10 11) B(8 9) B(10 11)] [A(12 13) A(14 15) B(12 13) B(14 15)]
-
-A2A(Full)
- -- divides M*fL into fP segments of size M*fL/fP = fL/sP
- -- total is fP * fL/sP = M * fL
- A(0 1) A(4 5) A(8 9) A(12 13)
- A(2 3) A(6 7) A(10 11) A(14 15)
- B(0 1) B(4 5) B(8 9) B(12 13)
- B(2 3) B(6 7) B(10 11) B(14 15)
-
-
-A2A(Split)
- A(0 1) A(4 5) A(2 3) A(6 7)
- A(8 9) A(12 13) A(10 11) A(14 15)
- B(0 1) B(2 3) B(4 5) B(6 7)
- B(8 9) B(10 11) B(12 13) B(14 15)
-
---------------------
--- General case
---------------------
-G global lattice
-fP - procs
-sP - Procs in split grid
-M - subdivisions/vectors - M*sP = fP ** constraint 1
-fL = G/fP per node (full)
-sL = G/sP per node split
-
-[ G * M ] total = G*fP/sP
-[ Subdivide fL*M by fP => fL*M/fP = fL/fP * fP/sP = fL/sP ]
---------------------
--- 1st A2A chunk is fL*M/fP = G/fP * fP/sP / fP = fL/sP
--- Let cL = fL/sP chunk. ( Divide into fP/sP = M chunks )
-
--- node 0 1st cL of node 0,1,... fP-1 ; vector 0
--- node 1 2nd cL of node 0,1,... fP-1
--- node 2 3rd cL of node 0,1,... fP-1
--- node 3 4th cL of node 0,1,... fP-1
-... when node > sP get vector 1 etc...
-
--- 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
--- node 0 1st cL of node 0M..(1M-1); 2nd cL of node 0M..(1M-1)..
--- node 1 1st cL of node 1M..(2M-1); 2nd cL of node 1M..(2M-1)..
--- node 2 1st cL of node 2M..(3M-1); 2nd cL of node 2M..(3M-1)..
--- node 3 1st cL of node 3M..(4M-1); 2nd cL of node 3M..(4M-1)..
---
--- Insert correctly
+ * Let chunk = (fvol*nvec)/sP be size of a chunk. ( Divide lexico vol * nvec into fP/sP = M chunks )
+ *
+ * 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
+ *
+ * node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1)..  data chunk x M x sP = fL/sP * M * sP = fL * M growth
+ * node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
+ * node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
+ * node 3 1st chunk of node 3M..(4M-1); 2nd chunk of node 3M..(4M-1)..
+ * etc...
  */
 template<class Vobj>
 void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
 {
@@ -879,7 +796,6 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   int nvec = nvector; // Counts down to 1 as we collapse dims
   std::vector<int> ldims = full_grid->_ldimensions;
-  std::vector<int> lcoor(ndim);

   for(int d=ndim-1;d>=0;d--){
@@ -891,73 +807,40 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
         split_grid->AllToAll(d,alldata,tmpdata);
       }

-      /*
--- Let chunk = (fL*nvec)/sP chunk. ( Divide into fP/sP = M chunks )
---
--- 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
---
--- node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1)..  data chunk x M x sP = fL/sP * M * sP = fL * M growth
--- node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
--- node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
--- node 3 1st chunk of node 3M..(4M-1); 2nd chunk of node 3M..(4M-1)..
---
--- Loop over c = 0..chunk-1
--- Loop over n = 0..M
--- Loop over j = 0..sP
--- total chunk*M*sP = fL/sP*fP/sP*sP = G/sP = sL
--- csite = (c+m*chunk)%
--- split into m*chunk+o = lsite*nvec/fP
--- Must turn to vec, rsite,
-      */
-      auto rdims = ldims;
-      int M = ratio[d];
-      nvec /= M;       // Reduce nvec by subdivision factor
-      rdims[d] *= M;   // increase local dims by same factor
+      auto M = ratio[d]; auto rsites= lsites*M;// increases rsites by M
+      nvec /= M;       // Reduce nvec by subdivision factor
+      rdims[d] *= M;   // increase local dim by same factor

       int sP = split_grid->_processors[d];
       int fP = full_grid->_processors[d];

       int fvol   = lsites;
-      int svol   = rsites;
-      int chunk  = (nvec*fvol)/sP;
-      int cL     = (nvec*ldims[d])/sP;
-
-      for(int c=0;c<cL;c++){
+      int chunk  = (nvec*fvol)/sP;            assert(chunk*sP == nvec*fvol);
+
+      // Loop over reordered data post A2A
+      for(int c=0;c<chunk;c++){
+        std::vector<int> coor(ndim);
+        for(int m=0;m<M;m++){
+          for(int s=0;s<sP;s++){
+            int lex_r;
+            uint64_t lex_c        = c+chunk*m+chunk*M*s;
+            uint64_t lex_fvol_vec = c+chunk*s;
+            uint64_t lex_fvol     = lex_fvol_vec%fvol;
+            uint64_t lex_vec      = lex_fvol_vec/fvol;
+
+            Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
+            coor[d] += m*ldims[d];
+            Lexicographic::IndexFromCoor(coor, lex_r, rdims);
+            lex_r += lex_vec * rsites;

-            alldata[rsite] = tmpdata[c+chunk*m+chunk*M*s];
-
-            if ( 0
-                 &&(lcoor[0]==0)
-                 &&(lcoor[1]==0)
-                 &&(lcoor[2]==0)
-                 &&(lcoor[3]==0) ) {
-
-              std::cout << GridLogMessage << " SPLIT rcoor[d] = "<<rcoor[d]<<std::endl;
-            }
+            alldata[lex_r] = tmpdata[lex_c];
+          }
+        }
+      }
@@ ... @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   /////////////////////////////////////////////////////////////////
   // Start from split grid and work towards full grid
   /////////////////////////////////////////////////////////////////
-  std::vector<int> lcoor(ndim);
-  std::vector<int> rcoor(ndim);

   int nvec = 1;
   uint64_t rsites = split_grid->lSites();
   std::vector<int> rdims = split_grid->_ldimensions;
@@ -1046,77 +927,52 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
     if ( ratio[d] != 1 ) {

-      {
-        int sP = split_grid->_processors[d];
-        int fP = full_grid->_processors[d];
-
-        int M = ratio[d];
+      auto M = ratio[d];

+      int sP = split_grid->_processors[d];
+      int fP = full_grid->_processors[d];
+
+      auto ldims = rdims; ldims[d] /= M;   // Decrease local dims by same factor
+      auto lsites= rsites/M;               // Decreases rsites by M
+
+      int fvol   = lsites;
+      int chunk  = (nvec*fvol)/sP;         assert(chunk*sP == nvec*fvol);
       {
+        // Loop over reordered data post A2A
         for(int c=0;c<chunk;c++){
-          if ( lex_c >= tmpdata.size() ) {
-
-            std::cout << "c "<<c<<std::endl;
-          }
+          std::vector<int> coor(ndim);
+          for(int m=0;m<M;m++){
+            for(int s=0;s<sP;s++){
+              int lex_r;
+              uint64_t lex_c        = c+chunk*m+chunk*M*s;
+              uint64_t lex_fvol_vec = c+chunk*s;
+              uint64_t lex_fvol     = lex_fvol_vec%fvol;
+              uint64_t lex_vec      = lex_fvol_vec/fvol;
+
+              Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
+              coor[d] += m*ldims[d];
+              Lexicographic::IndexFromCoor(coor, lex_r, rdims);
+              lex_r += lex_vec * rsites;
+
+              // LexicoFind coordinate & vector number within split lattice
+              tmpdata[lex_c] = alldata[lex_r];
+            }
+          }
         }
       }
-
-      if ( split_grid->_processors[d] > 1 ) {
-        split_grid->AllToAll(d,tmpdata,alldata);
-        tmpdata=alldata;
-      }
-      full_grid ->AllToAll(d,tmpdata,alldata);
-
-      rdims[d]/= M;
-      rsites /= M;
-      nvec   *= M;  // Increase nvec by subdivision factor
-      }
+
+      if ( split_grid->_processors[d] > 1 ) {
+        split_grid->AllToAll(d,tmpdata,alldata);
+        tmpdata=alldata;
+      }
+      full_grid ->AllToAll(d,tmpdata,alldata);
+      rdims[d]/= M;
+      rsites /= M;
+      nvec   *= M;  // Increase nvec by subdivision factor
     }
   }

@@ -1129,7 +985,6 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   for(int v=0;v<nvec;v++){
     vectorizeFromLexOrdArray(scalardata,full[v]);
   }
- }
 }
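For orientation, the end-to-end flow these patches enable, condensed from tests/solver/Test_dwf_mrhs_cg_mpi.cc: gather many right-hand sides into one split-grid vector, work there, and scatter back. This is a sketch only — the gauge field and the actual per-subgrid solve are elided and replaced by a copy, so it exercises just the split/unsplit round trip; the split-grid constructor and RankCount usage are assumed to follow the test as of these patches:

#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=4;
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                             GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);

  // Split to single-rank subgrids: one right-hand side per rank of the full grid.
  int nrhs = UGrid->RankCount();
  std::vector<int> mpi_split(Nd,1);
  GridCartesian * SGrid  = new GridCartesian(GridDefaultLatt(),
                             GridDefaultSimd(Nd,vComplexD::Nsimd()),mpi_split,*UGrid);
  GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);

  std::vector<LatticeFermionD> src(nrhs,FGrid);
  std::vector<LatticeFermionD> res(nrhs,FGrid);
  LatticeFermionD s_src(SFGrid);
  LatticeFermionD s_res(SFGrid);

  GridParallelRNG pRNG5(FGrid);
  pRNG5.SeedFixedIntegers(std::vector<int>({1,2,3,4}));
  for(int s=0;s<nrhs;s++) random(pRNG5,src[s]);

  Grid_split  (src,s_src);   // nrhs full-grid vectors -> one split-grid vector
  s_res = s_src;             // stand-in for the per-subgrid CG solve in the real test
  Grid_unsplit(res,s_res);   // split-grid result -> nrhs full-grid vectors

  // "Split/Unsplit working" means the round trip is exact: every |diff|^2 is zero.
  for(int s=0;s<nrhs;s++){
    LatticeFermionD diff(FGrid);
    diff = src[s]-res[s];
    std::cout << GridLogMessage << "rhs "<<s<<" round-trip |diff|^2 = "<<norm2(diff)<<std::endl;
  }
  Grid_finalize();
}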