From 23813ac798a4c5db96fbef83de3389c99b736055 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 30 Sep 2015 16:01:28 -0700 Subject: [PATCH 01/15] No compile on babbage fix --- lib/parallelIO/BinaryIO.h | 2 +- lib/parallelIO/NerscIO.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 01b79dc6..bb1b559a 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -7,7 +7,7 @@ #include - +#include // 64bit endian swap is a portability pain #ifndef __has_builtin // Optional of course. #define __has_builtin(x) 0 // Compatibility with non-clang compilers. diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h index 595588dd..a442f20b 100644 --- a/lib/parallelIO/NerscIO.h +++ b/lib/parallelIO/NerscIO.h @@ -83,7 +83,7 @@ inline void NerscMachineCharacteristics(NerscField &header) std::time_t t = std::time(nullptr); std::tm tm = *std::localtime(&t); std::ostringstream oss; - oss << std::put_time(&tm, "%c %Z"); + // oss << std::put_time(&tm, "%c %Z"); header.creation_date = oss.str(); header.archive_date = header.creation_date; From f4b6d1dfea103366cce94fdc1216aec74bf6f5ff Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 30 Sep 2015 16:02:14 -0700 Subject: [PATCH 02/15] NGO stores reenabled --- lib/simd/Grid_imci.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/simd/Grid_imci.h b/lib/simd/Grid_imci.h index 63765a47..bf48ab4a 100644 --- a/lib/simd/Grid_imci.h +++ b/lib/simd/Grid_imci.h @@ -9,10 +9,10 @@ #include -#ifndef KNC_ONLY_STORES -#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512 -#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512 -#endif +//#ifndef KNC_ONLY_STORES +//#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512 +//#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512 +//#endif namespace Optimization { From a660ce716baebc519305fd2eaa6eb336b8d4b720 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 30 Sep 2015 16:02:44 -0700 Subject: [PATCH 03/15] No compile babbage fix --- lib/qcd/utils/SUn.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index 66e5f8ab..4be60e75 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -524,16 +524,22 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g // reunitarise?? static void LieRandomize(GridParallelRNG &pRNG,LatticeMatrix &out,double scale=1.0){ GridBase *grid = out._grid; + LatticeComplex ca (grid); LatticeMatrix lie(grid); LatticeMatrix la (grid); Complex ci(0.0,scale); + Complex cone(1.0,0.0); Matrix ta; lie=zero; for(int a=0;a Date: Wed, 30 Sep 2015 16:03:05 -0700 Subject: [PATCH 04/15] No compile babbage fix --- tests/Test_multishift_sqrt.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/Test_multishift_sqrt.cc b/tests/Test_multishift_sqrt.cc index 111a521f..f252accc 100644 --- a/tests/Test_multishift_sqrt.cc +++ b/tests/Test_multishift_sqrt.cc @@ -17,7 +17,8 @@ public: pRNG.SeedFixedIntegers(seeds); random(pRNG,sqrtscale); - sqrtscale = real(sqrtscale)*3.0+0.5;// force real pos def + sqrtscale = 0.5*(sqrtscale + conjugate(sqrtscale)); + sqrtscale = sqrtscale*3.0+0.5;// force real pos def scale = sqrtscale *sqrtscale; //scale should be bounded by 12.25 // From 1878bf97d050d38ce117c43a196d63ed5584132c Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 30 Sep 2015 16:04:01 -0700 Subject: [PATCH 05/15] Babbage fix --- lib/qcd/action/fermion/WilsonFermion.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index caea055f..053526cd 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -82,7 +82,8 @@ namespace QCD { template void WilsonFermion::Mooee(const FermionField &in, FermionField &out) { out.checkerboard = in.checkerboard; - out = (4.0+mass)*in; + typename FermionField::scalar_type scal(4.0+mass); + out = scal*in; } template From 0f59356e86b44d687d49f1a667c4e4fa007bfd76 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 2 Nov 2015 00:00:15 +0000 Subject: [PATCH 06/15] Problem in comms fixed --- lib/stencil/Stencil_common.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stencil/Stencil_common.cc b/lib/stencil/Stencil_common.cc index 8a8b3a54..db24b478 100644 --- a/lib/stencil/Stencil_common.cc +++ b/lib/stencil/Stencil_common.cc @@ -8,7 +8,7 @@ namespace Grid { int checkerboard, const std::vector &directions, const std::vector &distances) - : _entries(npoints), _permute_type(npoints) + : _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints) { _npoints = npoints; _grid = grid; From c26220e9ab0bb4e4185f7a8012f622f8f1bf7e45 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 4 Nov 2015 09:54:48 +0000 Subject: [PATCH 07/15] EO benchmark as well as non-eo --- benchmarks/Benchmark_dwf.cc | 46 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 27772fdb..b53b2c91 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -24,7 +24,7 @@ int main (int argc, char ** argv) std::cout< latt4 = GridDefaultLatt(); - const int Ls=8; + const int Ls=16; GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -82,22 +82,24 @@ int main (int argc, char ** argv) DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); std::cout< Date: Wed, 4 Nov 2015 09:56:58 +0000 Subject: [PATCH 09/15] EO bug fix when spread out in x-direction --- lib/Stencil.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index c6e51059..d73db237 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -124,6 +124,7 @@ namespace Grid { if ( comm_dim ) { sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + // std::cout << "dim "<=0); assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - + std::vector > send_buf(buffer_size); // hmm... std::vector > recv_buf(buffer_size); - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - + for(int x=0;x>1; int bytes = words * sizeof(cobj); @@ -201,10 +202,11 @@ namespace Grid { recv_from_rank, bytes); - for(int i=0;i_slice_nblock[dimension]*_grid->_slice_block[dimension]; int words = sizeof(cobj)/sizeof(vector_type); + assert(cbmask==0x3); // Fixme think there is a latent bug if not true /* * possibly slow to allocate * Doesn't matter in this test, but may want to preallocate in the From 1271508ca26198a39d441e275d5b393ee272483f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 4 Nov 2015 09:57:57 +0000 Subject: [PATCH 10/15] Bug fix for spread out in x (EO) direction. This is really annoying -- it is very hard to thread the loops with the index recursion on buffer offset in the red-black case. Must think of a good threading solution here. --- lib/cshift/Cshift_common.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 822e02c9..83508ca3 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -29,14 +29,15 @@ Gather_plane_simple (const Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; -PARALLEL_NESTED_LOOP2 + int bo=0; + //PARALLEL_NESTED_LOOP21 for(int n=0;n_slice_stride[dimension]; - int bo = n*rhs._grid->_slice_block[dimension]; + // int bo = n*rhs._grid->_slice_block[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb &cbmask ) { - buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); } } } @@ -59,7 +60,7 @@ Gather_plane_extract(const Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; -PARALLEL_NESTED_LOOP2 + //PARALLEL_NESTED_LOOP2 for(int n=0;n void Scatter_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; -PARALLEL_NESTED_LOOP2 + int bo=0; + //PARALLEL_NESTED_LOOP2 for(int n=0;n_slice_stride[dimension]; - int bo =n*rhs._grid->_slice_block[dimension]; + // int bo =n*rhs._grid->_slice_block[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { - rhs._odata[so+o+b]=buffer[bo+b]; + rhs._odata[so+o+b]=buffer[bo++]; } } } From 12c5ec813c2f14629c4c980cd4cc87dc06b627a6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 4 Nov 2015 09:59:27 +0000 Subject: [PATCH 11/15] Useful debug messages (commented out) are included for preservation in case I need to revisit this --- lib/cshift/Cshift_mpi.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h index 9cdfd316..915b75ee 100644 --- a/lib/cshift/Cshift_mpi.h +++ b/lib/cshift/Cshift_mpi.h @@ -9,7 +9,7 @@ template Lattice Cshift(const Lattice &rhs,int dimension typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; - Lattice ret(rhs._grid); + Lattice ret(rhs._grid); int fd = rhs._grid->_fdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension]; @@ -26,10 +26,13 @@ template Lattice Cshift(const Lattice &rhs,int dimension if ( !comm_dim ) { + // std::cout << "Cshift_local" < void Cshift_comms(Lattice& ret,const Lattice &r sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); + // std::cout << "Cshift_comms dim "< void Cshift_comms(Lattice &ret,const Lattice &r int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + // for(int i=0;i Date: Wed, 4 Nov 2015 10:00:27 +0000 Subject: [PATCH 12/15] formatting only --- lib/qcd/action/fermion/WilsonKernels.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 77f68e4d..a897921a 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -78,7 +78,7 @@ void WilsonKernels::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel } Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st); accumReconXm(result,Uchi); - + // Ym SE=st.GetEntry(ptype,Ym,sF); if ( SE->_is_local && SE->_permute ) { From 24044dbc56e1b20c548871badff1c9ca925420d3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 4 Nov 2015 10:00:55 +0000 Subject: [PATCH 13/15] Debugged a problem with checkerboarded cshift in the checker dimension which arose only when mpi spread out in the checker dimension. Added a test that trapped and helped debug this --- lib/stencil/Stencil_common.cc | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/lib/stencil/Stencil_common.cc b/lib/stencil/Stencil_common.cc index db24b478..f9081adc 100644 --- a/lib/stencil/Stencil_common.cc +++ b/lib/stencil/Stencil_common.cc @@ -61,11 +61,17 @@ namespace Grid { sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); if ( sshift[0] == sshift[1] ) { Comms(point,dimension,shift,0x3); + // std::cout<<"Comms 0x3"< o"<<_entries[i][ss]._offset<<"; l"<< + // _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - for(int x=0;x= rd ); - // int comm_proc = ((x+sshift)/ld)%pd; - // int offnode = (comm_proc!=0); - int sx = (x+sshift)%rd; + for(int x=0;xx) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -249,7 +256,7 @@ namespace Grid { int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ From 01f286c9fe4bb7de32093006fd576815b9c7fb3e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 4 Nov 2015 10:02:17 +0000 Subject: [PATCH 14/15] Better testing for red black cshift which was sufficient to chase down a spread out x-direction problem. --- tests/Test_cshift_red_black.cc | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/Test_cshift_red_black.cc b/tests/Test_cshift_red_black.cc index 65af9857..88355e47 100644 --- a/tests/Test_cshift_red_black.cc +++ b/tests/Test_cshift_red_black.cc @@ -54,27 +54,27 @@ int main (int argc, char ** argv) TComplex cm; for(int dir=0;dir coor(4); @@ -105,18 +105,18 @@ int main (int argc, char ** argv) Fine.CoorFromIndex(peer,index,latt_size); if (nrm > 0){ - std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir + std::cout<<"FAIL shift "<< shift<<" in dir "<< dir <<" ["< 0){ - std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir + std::cout<<"FAIL shift "<< shift<<" in dir "<< dir <<" ["< Date: Wed, 4 Nov 2015 10:03:04 +0000 Subject: [PATCH 15/15] Added an even odd stencil test, shook out a problem with spread out x-direction. Generalise test to allow different types of "Field" to be used. --- tests/Test_stencil.cc | 179 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 147 insertions(+), 32 deletions(-) diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index cc49ab22..5949fa63 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -8,6 +8,10 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); + // typedef LatticeColourMatrix Field; + typedef LatticeComplex Field; + typedef typename Field::vector_object vobj; + typedef typename vobj::scalar_object sobj; std::vector latt_size = GridDefaultLatt(); std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); @@ -18,23 +22,40 @@ int main (int argc, char ** argv) GridCartesian Fine(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout); GridParallelRNG fRNG(&Fine); + // fRNG.SeedRandomDevice(); std::vector seeds({1,2,3,4}); fRNG.SeedFixedIntegers(seeds); - LatticeColourMatrix Foo(&Fine); - LatticeColourMatrix Bar(&Fine); - LatticeColourMatrix Check(&Fine); - LatticeColourMatrix Diff(&Fine); - + Field Foo(&Fine); + Field Bar(&Fine); + Field Check(&Fine); + Field Diff(&Fine); + LatticeComplex lex(&Fine); + + lex = zero; random(fRNG,Foo); gaussian(fRNG,Bar); + /* + Integer stride =1000; + { + double nrm; + LatticeComplex coor(&Fine); + + for(int d=0;d directions(npoint,dir); @@ -48,8 +69,8 @@ int main (int argc, char ** argv) ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir]; } - std::vector > comm_buf(myStencil._unified_buffer_size); - SimpleCompressor compress; + std::vector > comm_buf(myStencil._unified_buffer_size); + SimpleCompressor compress; myStencil.HaloExchange(Foo,comm_buf,compress); Bar = Cshift(Foo,dir,disp); @@ -75,9 +96,114 @@ int main (int argc, char ** argv) Real nrm = norm2(Diff); std::cout< coor(4); + for(coor[3]=0;coor[3] 0){ + std::cout <<"Coor (" << coor[0]<<","< directions(npoint,dir); + std::vector displacements(npoint,disp); + + CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements); + CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements); + + std::vector ocoor(4); + for(int o=0;o > Ecomm_buf(EStencil._unified_buffer_size); + std::vector > Ocomm_buf(OStencil._unified_buffer_size); + + SimpleCompressor compress; + + EStencil.HaloExchange(EFoo,Ecomm_buf,compress); + OStencil.HaloExchange(OFoo,Ocomm_buf,compress); + + Bar = Cshift(Foo,dir,disp); + + if ( disp & 0x1 ) { + ECheck.checkerboard = Even; + OCheck.checkerboard = Odd; + } else { + ECheck.checkerboard = Odd; + OCheck.checkerboard = Even; + } + // Implement a stencil code that should agree with that darn cshift! + for(int i=0;ioSites();i++){ + int permute_type; + StencilEntry *SE; + SE = EStencil.GetEntry(permute_type,0,i); + std::cout << "Even source "<< i<<" -> " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) + permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type); + else if (SE->_is_local) + OCheck._odata[i] = EFoo._odata[SE->_offset]; + else + OCheck._odata[i] = Ecomm_buf[SE->_offset]; + } + for(int i=0;ioSites();i++){ + int permute_type; + StencilEntry *SE; + SE = OStencil.GetEntry(permute_type,0,i); + std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) + permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type); + else if (SE->_is_local) + ECheck._odata[i] = OFoo._odata[SE->_offset]; + else + ECheck._odata[i] = Ocomm_buf[SE->_offset]; + } + + setCheckerboard(Check,ECheck); + setCheckerboard(Check,OCheck); + + Real nrmC = norm2(Check); + Real nrmB = norm2(Bar); + Diff = Check-Bar; + Real nrm = norm2(Diff); + std::cout< coor(4); for(coor[3]=0;coor[3] 0){ - printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n", - coor[0],coor[1],coor[2],coor[3],r,c, - nn, - real(check()()(r,c)), - imag(check()()(r,c)), - real(bar()()(r,c)) - ); - } - snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c)); - snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c)); - snrm=snrm+nn; - }} + sobj ddiff; + ddiff = check -bar; + diff =norm2(ddiff); + if ( diff > 0){ + std::cout <<"Coor (" << coor[0]<<","<