diff --git a/lib/Grid_stencil.h b/lib/Grid_stencil.h index 4d32575f..fa537f65 100644 --- a/lib/Grid_stencil.h +++ b/lib/Grid_stencil.h @@ -44,6 +44,24 @@ namespace Grid { class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: + typedef uint32_t StencilInteger; + + + + StencilInteger alignup(StencilInteger n){ + n--; // 1000 0011 --> 1000 0010 + n |= n >> 1; // 1000 0010 | 0100 0001 = 1100 0011 + n |= n >> 2; // 1100 0011 | 0011 0000 = 1111 0011 + n |= n >> 4; // 1111 0011 | 0000 1111 = 1111 1111 + n |= n >> 8; // ... (At this point all bits are 1, so further bitwise-or + n |= n >> 16; // operations produce no effect.) + n++; // 1111 1111 --> 1 0000 0000 + return n; + }; + void LebesgueOrder(void); + + std::vector _LebesgueReorder; + int _checkerboard; int _npoints; // Move to template param? GridBase * _grid; diff --git a/lib/qcd/Grid_qcd_dirac.cc b/lib/qcd/Grid_qcd_dirac.cc index 6526dfd3..9514a2a6 100644 --- a/lib/qcd/Grid_qcd_dirac.cc +++ b/lib/qcd/Grid_qcd_dirac.cc @@ -34,6 +34,10 @@ namespace Grid { " " }; - + // void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){ + // vHalfSpinColourVector hspin; + // spProjXp(hspin,in); + // mult(&out,&u,&hspin); + // } } } diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index efe72ca5..7ce41e57 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -96,17 +96,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) WilsonCompressor compressor; Stencil.HaloExchange(in,comm_buf,compressor); - for(int ss=0;ssoSites();ss++){ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + vHalfSpinColourVector *chi_p; + int offset,local,perm, ptype; - int offset,local,perm, ptype; + for(int sss=0;sssoSites();sss++){ - vSpinColourVector result; - vHalfSpinColourVector chi; - vHalfSpinColourVector tmp; - vHalfSpinColourVector Uchi; - vHalfSpinColourVector *chi_p; - - result=zero; + int ss = sss; + //int ss = Stencil._LebesgueReorder[sss]; // Xp offset = Stencil._offsets [Xp][ss]; @@ -114,15 +114,16 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) perm = Stencil._permute[Xp][ss]; ptype = Stencil._permute_type[Xp]; chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + if ( local && perm ) + { + spProjXp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjXp(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Xp),&chi()); spReconXp(result,Uchi); // Yp @@ -130,16 +131,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Yp][ss]; perm = Stencil._permute[Yp][ss]; ptype = Stencil._permute_type[Yp]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + + if ( local && perm ) + { + spProjYp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjYp(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Yp)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Yp),&chi()); accumReconYp(result,Uchi); // Zp @@ -147,17 +149,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Zp][ss]; perm = Stencil._permute[Zp][ss]; ptype = Stencil._permute_type[Zp]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + if ( local && perm ) + { + spProjZp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjZp(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Zp),&chi()); accumReconZp(result,Uchi); // Tp @@ -165,17 +167,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Tp][ss]; perm = Stencil._permute[Tp][ss]; ptype = Stencil._permute_type[Tp]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + if ( local && perm ) + { + spProjTp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjTp(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Tp),&chi()); accumReconTp(result,Uchi); // Xm @@ -183,16 +185,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Xm][ss]; perm = Stencil._permute[Xm][ss]; ptype = Stencil._permute_type[Xm]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + + if ( local && perm ) + { + spProjXm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjXm(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Xm),&chi()); accumReconXm(result,Uchi); @@ -201,17 +204,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Ym][ss]; perm = Stencil._permute[Ym][ss]; ptype = Stencil._permute_type[Ym]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + if ( local && perm ) + { + spProjYm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjYm(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Ym)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Ym),&chi()); accumReconYm(result,Uchi); // Zm @@ -219,17 +222,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Zm][ss]; perm = Stencil._permute[Zm][ss]; ptype = Stencil._permute_type[Zm]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + if ( local && perm ) + { + spProjZm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjZm(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Zm)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Zm),&chi()); accumReconZm(result,Uchi); // Tm @@ -237,24 +240,25 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) local = Stencil._is_local[Tm][ss]; perm = Stencil._permute[Tm][ss]; ptype = Stencil._permute_type[Tm]; - chi_p = &comm_buf[offset]; - if ( local ) { - chi_p = χ + if ( local && perm ) + { + spProjTm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { spProjTm(chi,in._odata[offset]); - if ( perm ) { - permute(tmp,chi,ptype); - chi_p = &tmp; - } + } else { + chi=comm_buf[offset]; } - mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)()); + mult(&Uchi(),&Umu._odata[ss](Tm),&chi()); accumReconTm(result,Uchi); - out._odata[ss] = result; } } + + void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out) { return; diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index dc3c80b9..900f1801 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -48,6 +48,8 @@ namespace Grid { // m+4r -1/2 Dhop; both cb's void Dw(const LatticeFermion &in, LatticeFermion &out); + typedef iScalar > matrix; + // half checkerboard operaions void MpcDag (const LatticeFermion &in, LatticeFermion &out); void Mpc (const LatticeFermion &in, LatticeFermion &out); diff --git a/lib/stencil/Grid_stencil_common.cc b/lib/stencil/Grid_stencil_common.cc index 52daf466..6cbcc890 100644 --- a/lib/stencil/Grid_stencil_common.cc +++ b/lib/stencil/Grid_stencil_common.cc @@ -2,6 +2,96 @@ namespace Grid { + + +void CartesianStencil::LebesgueOrder(void) +{ + _LebesgueReorder.resize(0); + + // Align up dimensions to power of two. + const StencilInteger one=1; + StencilInteger ND = _grid->_ndimension; + std::vector dims(ND); + std::vector adims(ND); + std::vector > bitlist(ND); + + + for(StencilInteger mu=0;mu_rdimensions[mu]; + assert ( dims[mu] != 0 ); + adims[mu] = alignup(dims[mu]); + } + + // List which bits of padded volume coordinate contribute; this strategy + // i) avoids recursion + // ii) has loop lengths at most the width of a 32 bit word. + int sitebit=0; + int split=24; + for(int mu=0;mu ax(ND); + + for(StencilInteger asite=0;asitedims[mu]-1 ) contained = 0; + + } + + if ( contained ) { + int site = ax[0] + + dims[0]*ax[1] + +dims[0]*dims[1]*ax[2] + +dims[0]*dims[1]*dims[2]*ax[3]; + + _LebesgueReorder.push_back(site); + } + } + + assert( _LebesgueReorder.size() == vol ); +} + CartesianStencil::CartesianStencil(GridBase *grid, int npoints, int checkerboard, @@ -20,6 +110,8 @@ namespace Grid { _unified_buffer_size=0; _request_count =0; + LebesgueOrder(); + int osites = _grid->oSites(); for(int i=0;i simd_layout({1,1,2,2}); std::vector mpi_layout ({1,1,1,1}); - std::vector latt_size ({4,4,8,8}); + std::vector latt_size ({8,8,8,8}); GridCartesian Grid(latt_size,simd_layout,mpi_layout); std::vector seeds({1,2,3,4}); @@ -45,9 +45,7 @@ int main (int argc, char ** argv) } for(int mu=0;mu(Umu,U[mu],mu); - U[mu] = peekIndex<3>(Umu,mu); + U[mu] = peekIndex(Umu,mu); } std::vector mask({1,1,1,1,1,1,1,1});