From 9b6ab6db16cb72433199a429193eb1da41fd7293 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 19 Apr 2016 15:38:01 -0700 Subject: [PATCH] simd in 5th dimension support --- lib/qcd/action/fermion/WilsonFermion.cc | 15 +- lib/qcd/action/fermion/WilsonFermion.h | 3 - lib/qcd/action/fermion/WilsonFermion5D.cc | 410 +++++--------------- lib/qcd/action/fermion/WilsonFermion5D.h | 30 +- lib/qcd/action/fermion/WilsonKernels.cc | 2 + lib/qcd/action/fermion/WilsonKernelsAsm.cc | 2 + lib/qcd/action/fermion/WilsonKernelsHand.cc | 79 +++- 7 files changed, 172 insertions(+), 369 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index d874e0ac..d5fde091 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -288,11 +288,7 @@ PARALLEL_FOR_LOOP void WilsonFermion::DhopInternal(StencilImpl & st,DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - if ( Impl::overlapCommsCompute () ) { - DhopInternalCommsOverlapCompute(st,U,in,out,dag); - } else { - DhopInternalCommsThenCompute(st,U,in,out,dag); - } + DhopInternalCommsThenCompute(st,U,in,out,dag); } template void WilsonFermion::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U, @@ -330,15 +326,6 @@ PARALLEL_FOR_LOOP } }; - - template - void WilsonFermion::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) { - - assert(0); - - }; - FermOpTemplateInstantiate(WilsonFermion); GparityFermOpTemplateInstantiate(WilsonFermion); diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 6bd9fb38..220c3449 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -116,9 +116,6 @@ namespace Grid { void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) ; - void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) ; - // Constructor WilsonFermion(GaugeField &_Umu, diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 9874031d..b8dba2f7 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -68,10 +68,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); - assert(FiveDimRedBlackGrid._ndimension==5); assert(FourDimRedBlackGrid._ndimension==4); - assert(FiveDimRedBlackGrid._checker_dim==1); // Dimension zero of the five-d is the Ls direction @@ -106,11 +104,75 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, dslashtime=0; dslash1time=0; } + +template +WilsonFermion5D::WilsonFermion5D(int simd, GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _M5,const ImplParams &p) : + Kernels(p), + _FiveDimGrid (&FiveDimGrid), + _FiveDimRedBlackGrid(&FiveDimRedBlackGrid), + _FourDimGrid (&FourDimGrid), + _FourDimRedBlackGrid(&FourDimRedBlackGrid), + Stencil (_FiveDimGrid,npoint,Even,directions,displacements), + StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even + StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd + M5(_M5), + Umu(_FourDimGrid), + UmuEven(_FourDimRedBlackGrid), + UmuOdd (_FourDimRedBlackGrid), + Lebesgue(_FourDimGrid), + LebesgueEvenOdd(_FourDimRedBlackGrid) +{ + int nsimd = Simd::Nsimd(); + + // some assertions + assert(FiveDimGrid._ndimension==5); + assert(FiveDimRedBlackGrid._ndimension==5); + assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction + assert(FourDimGrid._ndimension==4); + assert(FourDimRedBlackGrid._ndimension==4); + + // Dimension zero of the five-d is the Ls direction + Ls=FiveDimGrid._fdimensions[0]; + assert(FiveDimGrid._processors[0] ==1); + assert(FiveDimGrid._simd_layout[0] ==nsimd); + + assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); + assert(FiveDimRedBlackGrid._processors[0] ==1); + assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); + + // Other dimensions must match the decomposition of the four-D fields + for(int d=0;d<4;d++){ + assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]); + assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); + + assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]); + assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); + + assert(FourDimGrid._simd_layout[d]=1); + assert(FourDimRedBlackGrid._simd_layout[d] ==1); + assert(FourDimRedBlackGrid._simd_layout[d] ==1); + assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); + + assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); + assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); + assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); + } + + // Allocate the required comms buffer + ImportGauge(_Umu); +} + + template void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) { - GaugeField HUmu(_Umu._grid); - HUmu = _Umu*(-0.5); + GaugeField HUmu(_Umu._grid); + HUmu = _Umu*(-0.5); Impl::DoubleStore(GaugeGrid(),Umu,HUmu); pickCheckerboard(Even,UmuEven,Umu); pickCheckerboard(Odd ,UmuOdd,Umu); @@ -294,11 +356,7 @@ void WilsonFermion5D::DhopInternalCommsThenCompute(StencilImpl & st, Lebes Compressor compressor(dag); // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); + int LLs = in._grid->_rdimensions[0]; commtime -=usecond(); auto handle = st.HaloExchangeBegin(in,compressor); @@ -318,97 +376,48 @@ void WilsonFermion5D::DhopInternalCommsThenCompute(StencilImpl & st, Lebes if( this->HandOptDslash ) { PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdAsmOptDslash ) { - // for(int i=0;i<1;i++){ - // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ - // PerformanceCounter Counter(i); - // Counter.Start(); -#pragma omp parallel for - for(int t=0;toSites();ss++){ + for(int s=0;sHandOptDslash ) { - /* - -#pragma omp parallel for schedule(static) - for(int t=0;toSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - int sU=ss; - for(int s=0;s -void WilsonFermion5D::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) -{ - // assert((dag==DaggerNo) ||(dag==DaggerYes)); - alltime-=usecond(); - Compressor compressor(dag); - - // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); - - commtime -=usecond(); - auto handle = st.HaloExchangeBegin(in,compressor); - st.HaloExchangeComplete(handle); - commtime +=usecond(); - - jointime -=usecond(); - jointime +=usecond(); - - // Dhop takes the 4d grid from U, and makes a 5d index for fermion - // Not loop ordering and data layout. - // Designed to create - // - per thread reuse in L1 cache for U - // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. - -#pragma omp parallel - { - for(int jjj=0;jjj<100;jjj++){ -#pragma omp barrier - dslashtime -=usecond(); - if ( dag == DaggerYes ) { - if( this->HandOptDslash ) { -#pragma omp for - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdAsmOptDslash ) { - // for(int i=0;i<1;i++){ - // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ - // PerformanceCounter Counter(i); - // Counter.Start(); - -#pragma omp for - for(int t=0;tHandOptDslash ) { -#pragma omp for - - for(int ss=0;ssoSites();ss++){ - int sU=ss; - for(int s=0;soSites();ss++){ - int sU=ss; - for(int s=0;s -void WilsonFermion5D::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) -{ - // assert((dag==DaggerNo) ||(dag==DaggerYes)); - alltime-=usecond(); - Compressor compressor(dag); - - // Assume balanced KMP_AFFINITY; this is forced in GridThread.h - - int threads = GridThread::GetThreads(); - int HT = GridThread::GetHyperThreads(); - int cores = GridThread::GetCores(); - int nwork = U._grid->oSites(); - - commtime -=usecond(); - auto handle = st.HaloExchangeBegin(in,compressor); - st.HaloExchangeComplete(handle); - commtime +=usecond(); - - jointime -=usecond(); - jointime +=usecond(); - - // Dhop takes the 4d grid from U, and makes a 5d index for fermion - // Not loop ordering and data layout. - // Designed to create - // - per thread reuse in L1 cache for U - // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. - -#pragma omp parallel - { - for(int jjj=0;jjj<100;jjj++){ -#pragma omp barrier - dslashtime -=usecond(); - if ( dag == DaggerYes ) { - if( this->HandOptDslash ) { -#pragma omp for - for(int ss=0;ssoSites();ss++){ - int sU=0; - for(int s=0;soSites();ss++){ - { - int sd; - for(sd=0;sdAsmOptDslash ) { - // for(int i=0;i<1;i++){ - // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ - // PerformanceCounter Counter(i); - // Counter.Start(); - -#pragma omp for - for(int t=0;tHandOptDslash ) { -#pragma omp for - - for(int ss=0;ssoSites();ss++){ - int sU=0; - for(int s=0;soSites();ss++){ - int sU=0; - for(int s=0;s -void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) -{ - assert(0); -} template void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) @@ -706,6 +470,8 @@ void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag FermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D); +template class WilsonFermion5D; +template class WilsonFermion5D; }} diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 840c1a46..792e4dd0 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -87,6 +87,7 @@ namespace Grid { virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac // These can be overridden by fancy 5d chiral action virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); @@ -121,32 +122,12 @@ namespace Grid { FermionField &out, int dag); - void DhopInternalOMPbench(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - - void DhopInternalL1bench(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - void DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalCommsOverlapCompute(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); // Constructors WilsonFermion5D(GaugeField &_Umu, @@ -156,6 +137,15 @@ namespace Grid { GridRedBlackCartesian &FourDimRedBlackGrid, double _M5,const ImplParams &p= ImplParams()); + // Constructors + WilsonFermion5D(int simd, + GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _M5,const ImplParams &p= ImplParams()); + // DoubleStore void ImportGauge(const GaugeField &_Umu); diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index b94284f7..387ac0cd 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -529,5 +529,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField #endif FermOpTemplateInstantiate(WilsonKernels); +template class WilsonKernels; +template class WilsonKernels; }} diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index bdda199f..859d1a20 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -256,5 +256,7 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField template class WilsonKernels; template class WilsonKernels; template class WilsonKernels; + template class WilsonKernels; + template class WilsonKernels; }} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 74440f16..c4e4d87e 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -54,14 +54,15 @@ Author: paboyle Chi_11 = ref()(1)(1);\ Chi_12 = ref()(1)(2); +// To splat or not to splat depends on the implementation #define MULT_2SPIN(A)\ auto & ref(U._odata[sU](A)); \ - U_00 = ref()(0,0);\ - U_10 = ref()(1,0);\ - U_20 = ref()(2,0);\ - U_01 = ref()(0,1);\ - U_11 = ref()(1,1); \ - U_21 = ref()(2,1);\ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ UChi_00 = U_00*Chi_00;\ UChi_10 = U_00*Chi_10;\ UChi_01 = U_10*Chi_00;\ @@ -74,9 +75,9 @@ Author: paboyle UChi_11+= U_11*Chi_11;\ UChi_02+= U_21*Chi_01;\ UChi_12+= U_21*Chi_11;\ - U_00 = ref()(0,2);\ - U_10 = ref()(1,2);\ - U_20 = ref()(2,2);\ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ UChi_00+= U_00*Chi_02;\ UChi_10+= U_00*Chi_12;\ UChi_01+= U_10*Chi_02;\ @@ -84,6 +85,7 @@ Author: paboyle UChi_02+= U_20*Chi_02;\ UChi_12+= U_20*Chi_12; + #define PERMUTE_DIR(dir) \ permute##dir(Chi_00,Chi_00);\ permute##dir(Chi_01,Chi_01);\ @@ -809,7 +811,6 @@ int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,Doub int sF,int sU,const FermionField &in, FermionField &out) { DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3 - //check consistency of return types between these functions and the ones in WilsonKernels.cc return 0; } @@ -843,6 +844,47 @@ int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,D + ////////////// +/* +template<> +int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3 + return 0; + +} + +template<> +int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + return 0; +} + +template<> +int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + return 0; +} + +template<> +int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out) +{ + DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 + return 0; +} + +*/ + template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); @@ -870,4 +912,21 @@ template int WilsonKernels::DiracOptHandDhopSiteDag(StencilI std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out); + + + +template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); +template int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); +template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); +template int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int ss,int sU,const FermionField &in, FermionField &out); + + }}