diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index dd83f269..40c1871f 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -119,6 +119,9 @@ public: void DhopOE(const FermionField &in, FermionField &out,int dag); void DhopEO(const FermionField &in, FermionField &out,int dag); + void DhopComms (const FermionField &in, FermionField &out); + void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids); + // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 2d868c27..ad077dd3 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -57,6 +57,10 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + uint64_t *ids); + static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index ec3bd94a..92de5a40 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -438,6 +438,29 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int DhopInternal(StencilOdd,UmuEven,in,out,dag); } +template +void WilsonFermion5D::DhopComms(const FermionField &in, FermionField &out) +{ + int dag =0 ; + 
conformable(in.Grid(),FermionGrid()); // verifies full grid + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + Compressor compressor(dag); + Stencil.HaloExchangeOpt(in,compressor); +} +template +void WilsonFermion5D::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids) +{ + conformable(in.Grid(),FermionGrid()); // verifies full grid + conformable(in.Grid(),out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + int LLs = in.Grid()->_rdimensions[0]; + int Opt = WilsonKernelsStatic::Opt; + Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids); +} + template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 43662b9c..1d0dfb61 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -411,6 +411,46 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #undef LoopBody } +#ifdef GRID_SYCL +extern "C" { + ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void ); + uint SYCL_EXTERNAL 
__attribute__((overloadable)) intel_get_eu_thread_id( void ); + void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value ); +} +#ifdef GRID_SIMT +#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()) +#else +#define MAKE_ID(A) (0) +#endif + +#else + +#define MAKE_ID(A) (0) + +#endif + + +#define KERNEL_CALL_ID(A) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=acceleratorSIMTlane(Nsimd); \ + int idx=sF*Nsimd+lane; \ + uint64_t id = MAKE_ID(); \ + ids[idx]=id; \ + }); \ + accelerator_barrier(); #define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ @@ -418,7 +458,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ss; \ int sU = ss/Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ - }); + }); #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); @@ -451,6 +491,8 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ });} + + template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, @@ -485,6 +527,18 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField } assert(0 && " Kernel optimisation case not covered "); } + +template +void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + uint64_t *ids) +{ + autoView(U_v , U,AcceleratorRead); + autoView(in_v , in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v , st,AcceleratorRead); + KERNEL_CALL_ID(GenericDhopSite); +} template void WilsonKernels::DhopDagKernel(int 
Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, diff --git a/tests/Test_dwf_dslash_repro.cc b/tests/Test_dwf_dslash_repro.cc new file mode 100644 index 00000000..1bf813d9 --- /dev/null +++ b/tests/Test_dwf_dslash_repro.cc @@ -0,0 +1,239 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_dslash_repro.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + +typedef LatticeFermionD FermionField; + +int VerifyOnDevice(const FermionField &res, FermionField &ref) +{ + deviceVector Fails(1); + int * Fail = &Fails[0]; + int FailHost=0; + + typedef typename FermionField::vector_object vobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const uint64_t NN = res.Grid()->oSites(); + + acceleratorPut(*Fail,FailHost); + + accelerator_barrier(); + // Inject an error + + int injection=0; + if(getenv("GRID_ERROR_INJECT")) injection=1; + autoView(res_v,res,AcceleratorWrite); + autoView(ref_v,ref,AcceleratorRead); + if ( res.Grid()->ThisRank()== 0 ) + { + if (((random()&0xF)==0)&&injection) { + uint64_t sF = random()%(NN); + int lane=0; + printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank()); + auto vv = acceleratorGet(res_v[sF]); + double *dd = (double *)&vv; + *dd=M_PI; + acceleratorPut(res_v[sF],vv); + } + } + + accelerator_for( sF, NN, vobj::Nsimd(), { +#ifdef GRID_SIMT + { + int blane = acceleratorSIMTlane(vobj::Nsimd()); +#else + for(int blane;blaneoSites(); + + /////////////////////////////// + // Pull back to host + /////////////////////////////// + autoView(res_v,res,CpuRead); + autoView(ref_v,ref,CpuRead); + + std::vector ids_host(NN*Nsimd); + + acceleratorCopyFromDevice(ids,&ids_host[0],NN*Nsimd*sizeof(uint64_t)); + + ////////////////////////////////////////////////////////////// + // Redo check on host and print IDs + ////////////////////////////////////////////////////////////// + + for(int ss=0;ss< NN; ss++){ + int sF = ss; + for(int lane=0;lane>0 )&0xFF; + int slice =(id>>8 )&0xFF; + int eu =(id>>16)&0xFF; + std::cout 
<< GridHostname()<<" miscompare site "< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD junk(FGrid); random(RNG5,junk); + + LatticeFermionD result(FGrid); result=Zero(); + LatticeFermionD ref(FGrid); ref=Zero(); + + SU::HotConfiguration(RNG4,Umu); + + RealD mass=0.1; + RealD M5=1.8; + + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl; + UGrid->Barrier(); + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl; + + std::cout << GridLogMessage << "::::::::::::: Starting DWF repro for "<Broadcast(0,(void *)&start,sizeof(start)); + + FlightRecorder::ContinueOnFail = 0; + FlightRecorder::PrintEntireLog = 0; + FlightRecorder::ChecksumComms = 0; + FlightRecorder::ChecksumCommsSend=0; + + if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s); + + const uint64_t NN = FGrid->oSites()*vComplexD::Nsimd(); + + deviceVector ids_device(NN); + uint64_t *ids = &ids_device[0]; + + + Ddwf.DhopComms(src,ref); + Ddwf.DhopCalc(src,ref,ids); + + Ddwf.DhopComms(src,result); + + int iter=0; + do { + + result=junk; + + Ddwf.DhopCalc(src,result,ids); + + if ( VerifyOnDevice(result, ref) ) { + printf("Node %s Iter %d detected fails\n",GridHostname(),iter); + PrintFails(result,ref,ids); + // std::cout << " Dslash "<Broadcast(0,(void 
*)&now,sizeof(now)); + } while (now < (start + nsecs) ); + + + Grid_finalize(); +}