diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
index de1cfe01..ec913c18 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -81,6 +81,7 @@ public:
   using OperatorFunction<FieldD>::operator();

   RealD Tolerance;
+  Integer MaxIterationsMshift;
   Integer MaxIterations;
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -95,9 +96,9 @@ public:
   ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
-				       int _ReliableUpdateFreq
-				       ) :
-    MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
+				       int _ReliableUpdateFreq) :
+    MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
+    MaxIterations(20000)
   {
     verbose=1;
     IterationsToCompleteShift.resize(_shifts.order);
@@ -244,7 +245,7 @@ public:
   // Iteration loop
   int k;

-  for (k=1;k<=MaxIterations;k++){
+  for (k=1;k<=MaxIterationsMshift;k++){

     a = c /cp;
     AXPYTimer.Start();
@@ -350,12 +351,17 @@ public:
       }
     }

-    if ( all_converged ){
+    if ( all_converged || k == MaxIterationsMshift-1){

       SolverTimer.Stop();
-      std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
-      std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+
+      if ( all_converged ){
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+      } else {
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
+      }
 #include
 #include
+#include
 #ifdef GRID_CUDA
 #include
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index a1234e3b..72f8b810 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -484,24 +484,26 @@ public:
   int dag = compress.dag;
   int face_idx=0;
+#define vet_same_node(a,b) \
+  { auto tmp = b; }
   if ( dag ) {
-    assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-    assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-    assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-    assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-    assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-    assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-    assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-    assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+    vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+    vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+    vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+    vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+    vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+    vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+    vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+    vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
   } else {
-    assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-    assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-    assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-    assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-    assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-    assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-    assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-    assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+    vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+    vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+    vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+    vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+    vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+    vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+    vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+    vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
   }
   this->face_table_computed=1;
   assert(this->u_comm_offset==this->_unified_buffer_size);
diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h
index 68422f28..2d868c27 100644
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -52,13 +52,6 @@ public:
   typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
 public:
-#ifdef GRID_SYCL
-#define SYCL_HACK
-#endif
-#ifdef SYCL_HACK
-  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
-			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
-#endif
   static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
			 int Ls, int Nsite, const FermionField &in, FermionField &out,
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
index 4ca24789..1ddb30ba 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
+  Stencil.lo = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo = &LebesgueEvenOdd;
+
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
index 2833fdc4..1a262533 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
     _tmp(&Hgrid),
     anisotropyCoeff(anis)
 {
+  Stencil.lo = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo = &LebesgueEvenOdd;
   // Allocate the required comms buffer
   ImportGauge(_Umu);
   if (anisotropyCoeff.isAnisotropic){
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index bdba7cb2..fcf1f1f3 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -433,11 +433,23 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   });

 #define ASM_CALL(A)						\
-  thread_for( ss, Nsite, {					\
+  thread_for( sss, Nsite, {					\
+    int ss = st.lo->Reorder(sss);				\
     int sU = ss;						\
     int sF = ss*Ls;						\
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);	\
   });
+#define ASM_CALL_SLICE(A)					\
+  auto grid = in.Grid() ;					\
+  int nt = grid->LocalDimensions()[4];				\
+  int nxyz = Nsite/nt ;						\
+  for(int t=0;t<nt;t++){					\
+  thread_for( sss, nxyz, {					\
+    int ss = t*nxyz+sss;					\
+    int sU = ss;						\
+    int sF = ss*Ls;						\
+    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);	\
+  });}

 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index cb680f2f..f237dee4 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -127,6 +127,8 @@ NAMESPACE_BEGIN(Grid);
	ApproxNegPowerAction.tolerances[i]    = action_tolerance[i];
	ApproxHalfPowerAction.tolerances[i]   = action_tolerance[i];
	ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
+      }
+      for(int i=0;i<md_tolerance.size();i++){
	ApproxPowerMD.tolerances[i]    = md_tolerance[i];
	ApproxNegPowerMD.tolerances[i] = md_tolerance[i];
	ApproxHalfPowerMD.tolerances[i]= md_tolerance[i];
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@@ ... @@
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
+
 NAMESPACE_BEGIN(Grid);

 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -58,7 +60,7 @@ NAMESPACE_BEGIN(Grid);
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 #if 0
-	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOp : DenOp);
+	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
	msCG(schurOp,in, out);
 #else
@@ -66,7 +68,8 @@ NAMESPACE_BEGIN(Grid);
	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
-
+
+	// Action better with higher precision?
	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
	precisionChange(inD2,in);
	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
@@ ... @@
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
-	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF  : DenOpF);
	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
-	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
	precisionChange(inD2,in);
	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ ... @@
+#if 1
   inline int SameNode(int point) {

     int dimension    = this->_directions[point];
@@ -367,7 +368,40 @@ public:
     if ( displacement == 0 ) return 1;
     return 0;
   }
+#else
+  // fancy calculation for shm code
+  inline int SameNode(int point) {
+
+    int dimension    = this->_directions[point];
+    int displacement = this->_distances[point];
+
+    int pd = _grid->_processors[dimension];
+    int fd = _grid->_fdimensions[dimension];
+    int ld = _grid->_ldimensions[dimension];
+    int rd = _grid->_rdimensions[dimension];
+    int simd_layout = _grid->_simd_layout[dimension];
+    int comm_dim    = _grid->_processors[dimension] >1 ;
+
+    int recv_from_rank;
+    int xmit_to_rank;
+
+    if ( ! comm_dim ) return 1;
+
+    int nbr_proc;
+    if (displacement>0) nbr_proc = 1;
+    else                nbr_proc = pd-1;
+
+    // FIXME this logic needs to be sorted for three link term
+    //    assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
+
+    if ( shm==NULL ) return 0;
+    return 1;
+  }
+#endif
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   // Use OpenMP Tasks for cleaner ???
@@ -1075,7 +1109,7 @@ public:
     int comms_recv          = this->_comms_recv[point];
     int comms_partial_send  = this->_comms_partial_send[point] ;
     int comms_partial_recv  = this->_comms_partial_recv[point] ;
-
+    
     assert(rhs.Grid()==_grid);
     //    conformable(_grid,rhs.Grid());
@@ -1146,11 +1180,32 @@ public:
       recv_buf=this->u_recv_buf_p;
     }

+    // potential SHM fast path for intranode
+    int shm_send=0;
+    int shm_recv=0;
+#ifdef SHM_FAST_PATH
+    // Put directly in place if we can
+    send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+    if ( (send_buf==NULL) ) {
+      shm_send=0;
+      send_buf = this->u_send_buf_p;
+    } else {
+      shm_send=1;
+    }
+    void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+    if ( test_ptr != NULL ) shm_recv = 1;
+    //    static int printed;
+    //    if (!printed){
+    //      std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+    //      printed = 1;
+    //    }
+#else
     ////////////////////////////////////////////////////////
     // Gather locally
     ////////////////////////////////////////////////////////
     send_buf = this->u_send_buf_p; // Gather locally, must send
     assert(send_buf!=NULL);
+#endif

     //    std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
@@ ... @@
	  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
-
+#ifdef SHM_FAST_PATH
+#warning STENCIL SHM FAST PATH SELECTED
+	  // shm == receive pointer         if offnode
+	  // shm == Translate[send pointer] if on node -- my view of his send pointer
+	  cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+	  if (shm==NULL) {
+	    shm = rp;
+	    // we found a packet that comes from MPI and contributes to this shift.
+	    // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+	    // Kernel will add the exterior_terms except if is_same_node.
+	    // leg of stencil
+	    shm_recv=0;
+	  } else {
+	    shm_recv=1;
+	  }
+	  rpointers[i] = shm;
+	  // Test send side
+	  void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+	  if ( test_ptr != NULL ) shm_send = 1;
+	  //	  static int printed;
+	  //	  if (!printed){
+	  //	    std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	  //	    printed = 1;
+	  //	  }
+#else
	  rpointers[i] = rp;
+#endif
	}
      }
      AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
    }
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 8427dc14..2dde1433 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -248,17 +248,23 @@ inline int acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////
-#ifdef GRID_SYCL
-NAMESPACE_END(Grid);
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
+#ifdef GRID_SYCL
 #define GRID_SYCL_LEVEL_ZERO_IPC
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
+NAMESPACE_END(Grid);
+#if 0
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
+#else
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
+
 NAMESPACE_BEGIN(Grid);

 extern cl::sycl::queue *theGridAccelerator;
diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
index 0a924486..059a0f20 100644
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
@@ -232,31 +232,34 @@ int main(int argc, char **argv) {
   //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
   //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

-  OneFlavourRationalParams OFRp; // Up/down
-  OFRp.lo       = 4.0e-5;
+  int SP_iters=10000;
+
+  RationalActionParams OFRp; // Up/down
+  OFRp.lo       = 6.0e-5;
   OFRp.hi       = 90.0;
-  OFRp.MaxIter  = 60000;
-  OFRp.tolerance= 1.0e-5;
-  OFRp.mdtolerance= 1.0e-3;
+  OFRp.inv_pow  = 2;
+  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
+  OFRp.action_tolerance= 1.0e-8;
+  OFRp.action_degree   = 18;
+  OFRp.md_tolerance= 1.0e-5;
+  OFRp.md_degree   = 14;
   //  OFRp.degree   = 20; converges
   //  OFRp.degree   = 16;
-  OFRp.degree   = 18;
   OFRp.precision= 80;
   OFRp.BoundsCheckFreq=0;
   std::vector<RealD> ActionTolByPole({
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-7,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8
   });
   std::vector<RealD> MDTolByPole({
-      1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more
+      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
       //      1.0e-6,3.0e-7,1.0e-7,1.0e-7,
       //      3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
       1.0e-8,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
       1.0e-8,1.0e-8
   });
@@ -340,6 +343,7 @@ int main(int argc, char **argv) {
   ParamsDirF.dirichlet=Dirichlet;
   ParamsDir.partialDirichlet=1;
   ParamsDirF.partialDirichlet=1;
+  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
@@ ... @@
 #if 1
-  std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
+  std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
 #else
-  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
+  std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
 #endif

  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
@@ -544,19 +548,19 @@ int main(int argc, char **argv) {
	ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet;
	NumeratorsD2.push_back  (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2));

-	Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
+	Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
			     *Numerators[h],*Denominators[h],
			     *NumeratorsF[h],*DenominatorsF[h],
			     *NumeratorsD2[h],*DenominatorsD2[h],
-			     OFRp, 400) );
-	Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
+			     OFRp, SP_iters) );
+	Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
			     *Numerators[h],*Denominators[h],
			     *NumeratorsF[h],*DenominatorsF[h],
			     *NumeratorsD2[h],*DenominatorsD2[h],
-			     OFRp, 400) );
+			     OFRp, SP_iters) );
 #else
-	Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
-	Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+	Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+	Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
 #endif
     }
   }
diff --git a/HMC/Mobius2p1f_EOFA_96I_hmc.cc b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
index d27d558e..6e7fb3cd 100644
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -183,7 +183,7 @@ int main(int argc, char **argv) {
   //  4/2 => 0.6 dH
   //  3/3 => 0.8 dH .. depth 3, slower
   //MD.MDsteps =  4;
-  MD.MDsteps =  3;
+  MD.MDsteps = 12;
   MD.trajL   = 0.5;

   HMCparameters HMCparams;
@@ -200,8 +200,8 @@ int main(int argc, char **argv) {
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition

   CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_DDHMC_lat";
-  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
+  CPparams.config_prefix = "ckpoint_HMC_lat";
+  CPparams.rng_prefix    = "ckpoint_HMC_rng";
   CPparams.saveInterval  = 1;
   CPparams.format        = "IEEE64BIG";
   TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@@ -228,7 +228,7 @@ int main(int argc, char **argv) {
   Real pv_mass = 1.0;
   //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
   //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
   //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

   auto GridPtr   = TheHMC.Resources.GetCartesian();
@@ -299,8 +299,8 @@ int main(int argc, char **argv) {
   ////////////////////////////////////
   // Collect actions
   ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(3);
+  //  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(1);
   ActionLevel<HMCWrapper::Field> Level3(15);

   ////////////////////////////////////
@@ -369,7 +369,7 @@ int main(int argc, char **argv) {
                                ActionCGL, ActionCGR,
                                DerivativeCGL, DerivativeCGR,
                                SFRp, true);
-  //  Level2.push_back(&EOFA);
+  Level2.push_back(&EOFA);

   ////////////////////////////////////
   // up down action
@@ -477,7 +477,7 @@ int main(int argc, char **argv) {
   // Gauge action
   /////////////////////////////////////////////////////////////
   Level3.push_back(&GaugeAction);
-  TheHMC.TheAction.push_back(Level1);
+  //  TheHMC.TheAction.push_back(Level1);
   TheHMC.TheAction.push_back(Level2);
   TheHMC.TheAction.push_back(Level3);
   std::cout << GridLogMessage << " Action complete "<< std::endl;
diff --git a/configure.ac b/configure.ac
index 2e6199c7..fedca3fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in
      ;;
   *)
      ;;
 esac
+############### force MPI in SMP
+AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no])
+case ${ac_SHM_FAST_PATH} in
+  yes)
+   AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] )
+   ;;
+  *) ;;
+esac

 ############### communication type selection
 AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh
index cefab776..decbb6cd 100755
--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -23,12 +23,7 @@ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

-for i in 0
-do
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
-done
-#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
-#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
+#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh
index bb7b517d..0e48625b 100755
--- a/systems/PVC/benchmarks/wrap.sh
+++ b/systems/PVC/benchmarks/wrap.sh
@@ -5,10 +5,10 @@
 export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID

 echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK

-if [ $MPI_LOCALRANKID = "0" ]
-then
-#  ~psteinbr/build_pti/ze_tracer -h $@
-  onetrace --chrome-device-timeline $@
-else
+#if [ $MPI_LOCALRANKID = "0" ]
+#then
+#  ~psteinbr/build_pti/ze_tracer -c $@
+#  onetrace --chrome-kernel-timeline $@
+#else
 $@
-fi
+#fi
diff --git a/systems/PVC/config-command b/systems/PVC/config-command
index cd7bba1d..c3523c2d 100644
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@@ -1,4 +1,4 @@
-INSTALL=/nfs/site/home/azusayax/install
+INSTALL=/nfs/site/home/paboylx/prereqs/
 ../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh
index 2a6f920b..9b515a62 100644
--- a/systems/PVC/setup.sh
+++ b/systems/PVC/setup.sh
@@ -1,5 +1,6 @@
 export https_proxy=http://proxy-chain.intel.com:911
-export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH

 module load intel-release
 source /opt/intel/oneapi/PVC_setup.sh
diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc
new file mode 100644
index 00000000..d4455a7e
--- /dev/null
+++ b/tests/core/Test_fft_matt.cc
@@ -0,0 +1,160 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/Test_cshift.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+ ;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian RBGRID(&GRID);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridSerialRNG   sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
+  GridParallelRNG pRNG(&GRID);
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeFieldD Umu(&GRID);
+
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
+
+  ////////////////////////////////////////////////////
+  // Wilson test
+  ////////////////////////////////////////////////////
+  {
+    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+
+    RealD mass=0.01;
+    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
+
+    Dw.M(src,tmp);
+
+    std::cout << "Dw src = " <<norm2(src)<<std::endl;
+    std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
+
+    LatticeFermionD    result(&GRID); result=Zero();
+    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
+    CG(HermOp,src,result);
+
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Taking difference" <<std::endl;