namespace Grid { template class BlockedGrid { public: GridBase* _grid; typedef typename Field::scalar_type Coeff_t; typedef typename Field::vector_type vCoeff_t; std::vector _bs; // block size std::vector _nb; // number of blocks std::vector _l; // local dimensions irrespective of cb std::vector _l_cb; // local dimensions of checkerboarded vector std::vector _l_cb_o; // local dimensions of inner checkerboarded vector std::vector _bs_cb; // block size in checkerboarded vector std::vector _nb_o; // number of blocks of simd o-sites int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites; BlockedGrid(GridBase* grid, const std::vector& block_size) : _grid(grid), _bs(block_size), _nd((int)_bs.size()), _nb(block_size), _l(block_size), _l_cb(block_size), _nb_o(block_size), _l_cb_o(block_size), _bs_cb(block_size) { _blocks = 1; _o_blocks = 1; _l = grid->FullDimensions(); _l_cb = grid->LocalDimensions(); _l_cb_o = grid->_rdimensions; _cf_size = 1; _block_sites = 1; for (int i=0;i<_nd;i++) { _l[i] /= grid->_processors[i]; assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize int r = _l[i] / _l_cb[i]; assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize _bs_cb[i] = _bs[i] / r; _block_sites *= _bs_cb[i]; _nb[i] = _l[i] / _bs[i]; _nb_o[i] = _nb[i] / _grid->_simd_layout[i]; if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl; assert(0); } _blocks *= _nb[i]; _o_blocks *= _nb_o[i]; _cf_size *= _l[i]; } _cf_size *= 12 / 2; _cf_block_size = _cf_size / _blocks; _cf_o_block_size = _cf_size / _o_blocks; std::cout << GridLogMessage << "BlockedGrid:" << std::endl; std::cout << GridLogMessage << " _l = " << _l << std::endl; std::cout << GridLogMessage << " _l_cb = " << _l_cb << std::endl; std::cout << GridLogMessage << " _l_cb_o = " << _l_cb_o << std::endl; std::cout << GridLogMessage << " _bs = " << _bs << std::endl; std::cout << GridLogMessage << " _bs_cb = " << _bs_cb << std::endl; std::cout << GridLogMessage << " _nb = " << _nb << std::endl; std::cout << GridLogMessage << " _nb_o = " << _nb_o << std::endl; std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl; std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl; std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl; std::cout << GridLogMessage << " _cf_size = " << _cf_size << std::endl; std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl; std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl; std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl; // _grid->Barrier(); //abort(); } void block_to_coor(int b, std::vector& x0) { std::vector bcoor; bcoor.resize(_nd); x0.resize(_nd); assert(b < _o_blocks); Lexicographic::CoorFromIndex(bcoor,b,_nb_o); int i; for (i=0;i<_nd;i++) { x0[i] = bcoor[i]*_bs_cb[i]; } //std::cout << GridLogMessage << "Map block b -> " << x0 << std::endl; } void block_site_to_o_coor(const std::vector& x0, std::vector& coor, int i) { Lexicographic::CoorFromIndex(coor,i,_bs_cb); for (int j=0;j<_nd;j++) coor[j] += x0[j]; } int block_site_to_o_site(const std::vector& x0, int i) { std::vector coor; coor.resize(_nd); block_site_to_o_coor(x0,coor,i); Lexicographic::IndexFromCoor(coor,i,_l_cb_o); return i; } vCoeff_t block_sp(int b, const Field& x, const Field& y) { std::vector x0; block_to_coor(b,x0); vCoeff_t ret = 0.0; for (int i=0;i<_block_sites;i++) { // only odd sites int ss = block_site_to_o_site(x0,i); ret += TensorRemove(innerProduct(x._odata[ss],y._odata[ss])); } return ret; } vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) { std::vector x0; block_to_coor(b,x0); constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); int lsize = _cf_o_block_size / _block_sites; std::vector< ComplexD > ret(nsimd); for (int i=0;i void vcaxpy(iScalar& r,const vCoeff_t& a,const iScalar& x,const iScalar& y) { vcaxpy(r._internal,a,x._internal,y._internal); } template void vcaxpy(iVector& r,const vCoeff_t& a,const iVector& x,const iVector& y) { for (int i=0;i x0; block_to_coor(b,x0); for (int i=0;i<_block_sites;i++) { // only odd sites int ss = block_site_to_o_site(x0,i); vcaxpy(ret._odata[ss],a,x._odata[ss],y._odata[ss]); } } void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) { std::vector x0; block_to_coor(b,x0); constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); int lsize = _cf_o_block_size / _block_sites; for (int i=0;i<_block_sites;i++) { // only odd sites int ss = block_site_to_o_site(x0,i); int n = lsize / nsimd; for (int l=0;l& x) { std::vector x0; block_to_coor(b,x0); int lsize = _cf_o_block_size / _block_sites; for (int i=0;i<_block_sites;i++) { // only odd sites int ss = block_site_to_o_site(x0,i); for (int l=0;l& x) { std::vector x0; block_to_coor(b,x0); int lsize = _cf_o_block_size / _block_sites; for (int i=0;i<_block_sites;i++) { // only odd sites int ss = block_site_to_o_site(x0,i); for (int l=0;l void vcscale(iScalar& r,const vCoeff_t& a,const iScalar& x) { vcscale(r._internal,a,x._internal); } template void vcscale(iVector& r,const vCoeff_t& a,const iVector& x) { for (int i=0;i x0; block_to_coor(b,x0); for (int i=0;i<_block_sites;i++) { // only odd sites int ss = block_site_to_o_site(x0,i); vcscale(ret._odata[ss],a,ret._odata[ss]); } } void getCanonicalBlockOffset(int cb, std::vector& x0) { const int ndim = 5; assert(_nb.size() == ndim); std::vector _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] }; std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; x0.resize(ndim); assert(cb >= 0); assert(cb < _nbc[0]*_nbc[1]*_nbc[2]*_nbc[3]*_nbc[4]); Lexicographic::CoorFromIndex(x0,cb,_nbc); int i; for (i=0;i& buf) { std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; std::vector ldim = v._grid->LocalDimensions(); std::vector cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] }; const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4]; // take canonical block cb of v and put it in canonical ordering in buf std::vector cx0; getCanonicalBlockOffset(cb,cx0); #pragma omp parallel { std::vector co0,cl0; co0=cx0; cl0=cx0; #pragma omp for for (int i=0;i<_nbsc;i++) { Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo for (int j=0;j<(int)_bsc.size();j++) cl0[j] = cx0[j] + co0[j]; std::vector l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] }; int oi = v._grid->oIndex(l0); int ii = v._grid->iIndex(l0); int lti = i; //if (cb < 2 && i<2) // std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl; for (int s=0;s<4;s++) for (int c=0;c<3;c++) { Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii]; int ti = 12*lti + 3*s + c; ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]); } } } } void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector& buf) { std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; std::vector ldim = v._grid->LocalDimensions(); std::vector cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] }; const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4]; // take canonical block cb of v and put it in canonical ordering in buf std::vector cx0; getCanonicalBlockOffset(cb,cx0); buf.resize(_cf_block_size * 2); #pragma omp parallel { std::vector co0,cl0; co0=cx0; cl0=cx0; #pragma omp for for (int i=0;i<_nbsc;i++) { Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo for (int j=0;j<(int)_bsc.size();j++) cl0[j] = cx0[j] + co0[j]; std::vector l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] }; int oi = v._grid->oIndex(l0); int ii = v._grid->iIndex(l0); int lti = i; //if (cb < 2 && i<2) // std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl; for (int s=0;s<4;s++) for (int c=0;c<3;c++) { Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii]; int ti = 12*lti + 3*s + c; buf[2*ti+0] = ld.real(); buf[2*ti+1] = ld.imag(); } } } } int globalToLocalCanonicalBlock(int slot,const std::vector& src_nodes,int nb) { // processor coordinate int _nd = (int)src_nodes.size(); std::vector _src_nodes = src_nodes; std::vector pco(_nd); Lexicographic::CoorFromIndex(pco,slot,_src_nodes); std::vector cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] }; // get local block std::vector _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] }; assert(_nd == 5); std::vector c_src_local_blocks(_nd); for (int i=0;i<_nd;i++) { assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0); c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i]; } std::vector cbcoor(_nd); // coordinate of block in slot in canonical form Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks); // cpco, cbcoor std::vector clbcoor(_nd); for (int i=0;i<_nd;i++) { int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // global block coordinate int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid int tpcoor = _grid->_processor_coor[(i+1)%5]; if (pcoor != tpcoor) return -1; clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i } int lnb; Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc); //std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl; return lnb; } }; }