#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H

#include "Grid.h"

namespace dpo {

template<class vobj>
class Lattice
{
public:
    Grid *_grid;
    int checkerboard;
    std::vector<vobj,alignedAllocator<vobj> > _odata;

public:

    Lattice(Grid *grid) : _grid(grid) {
        _odata.resize(_grid->oSites());    // resize, not reserve: every site is indexed below
        if ( ((uint64_t)&_odata[0])&0xF) { // insist on 16-byte alignment for SIMD access
            exit(-1);
        }
        checkerboard=0;
    }

    // overloading dpo::conformable but no conformable in dpo ...?
    template<class obj1,class obj2>
    friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);

    // Performance difference between operator * and mult is troubling.
    // Auto move constructor seems to lose surprisingly much.

    // Site wise binary operations.
    // We eliminate a temporary object assignment if we use the mult,add,sub routines.
    // For the operator versions we rely on the move constructor to eliminate the
    // vector copy back.
    template<class obj1,class obj2,class obj3>
    friend void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

    template<class obj1,class obj2,class obj3>
    friend void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

    template<class obj1,class obj2,class obj3>
    friend void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

    friend void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
        conformable(lhs,rhs);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
        }
    }
    friend void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
        conformable(lhs,rhs);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
        }
    }

    inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
        conformable(lhs,rhs);
        Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss];
        }
        return ret;
    };
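    // Illustrative usage sketch (not part of the interface). The grid
    // construction and the random() fill are assumed from the rest of the
    // library; vComplex stands in for any SIMD site type. The three-operand
    // forms write into a preallocated result, while the operator forms build
    // a temporary and rely on the move constructor:
    //
    //   Grid grid(latt_size);                         // hypothetical grid setup
    //   Lattice<vComplex> a(&grid),b(&grid),c(&grid);
    //   random(a); random(b);
    //   mult(c,a,b);      // writes a*b directly into c, no temporary
    //   c = a*b;          // constructs a temporary, then moves it into c
    //   axpy(c,2.0,a,b);  // BLAS-style fused update, assuming c = 2.0*a + b per site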
#if 0
    // Collapse doesn't appear to work the way I think it should in icpc
    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
    {
        Lattice<vobj> ret(rhs._grid);

        ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
        shift            = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
        int sx,so,o;
        int rd = rhs._grid->_rdimensions[dimension];
        int ld = rhs._grid->_dimensions[dimension];

        // Map to always positive shift.
        shift = (shift+ld)%ld;

        // Work out whether to permute and the permute type
        // ABCDEFGH ->   AE BF CG DH       permute
        // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
        // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
        // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
        // Shift 3       BF CG DH AE       1 1 1 0    FGHABCDE
        // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
        // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
        // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
        // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
        int permute_dim = rhs._grid->_layout[dimension]>1;
        int permute_type=0;
        for(int d=0;d<dimension;d++)
            if (rhs._grid->_layout[d]>1 ) permute_type++;

        // loop over perp slices.
        // Threading considerations:
        //   Need to map thread_num to
        //
        //       x_min,x_max for Loop-A
        //       n_min,n_max for Loop-B
        //       b_min,b_max for Loop-C
        //   in a way that maximally load balances.
        //
        //   Optimal:
        //       There are rd*n_block*block items of work.
        //       These serialise as item "w":
        //       b=w%block;  w=w/block;
        //       n=w%nblock; x=w/nblock.  Perhaps 20 cycles?
        //
        //   Logic:
        //       x_chunk = (rd+thread)/nthreads simply divides work across threads.
        //
        //       rd=5 , threads = 8;
        //       0 1 2 3 4 5 6 7
        //       0 0 0 1 1 1 1 1
        for(int x=0;x<rd;x++){       // Loop-A
            sx = (x-shift+ld)%rd;
            o  =  x*rhs._grid->_ostride[dimension];
            so = sx*rhs._grid->_ostride[dimension];

            int permute_slice=0;
            if ( permute_dim ) {
                permute_slice = shift/rd;
                if ( x<shift%rd ) permute_slice = 1-permute_slice;
            }
#if 0
            if ( permute_slice ) {
                int internal=sizeof(vobj)/sizeof(vComplex);
                int num     =rhs._grid->_slice_block[dimension]*internal;
                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){   // Loop-B
                    vComplex *optr = (vComplex *)&ret._odata[o];
                    vComplex *iptr = (vComplex *)&rhs._odata[so];
                    for(int b=0;b<num;b++){                               // Loop-C
                        permute(optr[b],iptr[b],permute_type);
                    }
                    o +=rhs._grid->_slice_stride[dimension];
                    so+=rhs._grid->_slice_stride[dimension];
                }
            } else {
                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                    for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
                        ret._odata[o+i]=rhs._odata[so+i];
                    }
                    o +=rhs._grid->_slice_stride[dimension];
                    so+=rhs._grid->_slice_stride[dimension];
                }
            }
#else
            if ( permute_slice ) {
                int internal=sizeof(vobj)/sizeof(vComplex);
                int num     =rhs._grid->_slice_block[dimension]*internal;
#pragma omp parallel for collapse(2)
                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                    for(int b=0;b<num;b++){
                        vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
                        vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
                        permute(optr[b],iptr[b],permute_type);
                    }
                }
            } else {
#pragma omp parallel for collapse(2)
                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                    for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
                        int oo = o+ n*rhs._grid->_slice_stride[dimension];
                        int soo=so+ n*rhs._grid->_slice_stride[dimension];
                        ret._odata[oo+i]=rhs._odata[soo+i];
                    }
                }
            }
#endif
        }
        return ret;
    }
#else
    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
    {
        Lattice<vobj> ret(rhs._grid);

        ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
        shift            = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
        int rd = rhs._grid->_rdimensions[dimension];
        int ld = rhs._grid->_dimensions[dimension];

        // Map to always positive shift.
        shift = (shift+ld)%ld;

        // Work out whether to permute and the permute type
        // ABCDEFGH ->   AE BF CG DH       permute
        // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
        // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
        // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
        // Shift 3       BF CG DH AE       1 1 1 0    FGHABCDE
        // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
        // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
        // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
        // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
        int permute_dim = rhs._grid->_layout[dimension]>1;
        int permute_type=0;
        for(int d=0;d<dimension;d++)
            if (rhs._grid->_layout[d]>1 ) permute_type++;

        // loop over all work
        int work = rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];

#pragma omp parallel for
        for(int ww=0;ww<work;ww++){

            // Unpack the serialised work item ww into the (x,n,b) triple
            int w=ww;
            int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
            int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
            int x = w;

            int sx,so,o;
            sx = (x-shift+ld)%rd;
            o  =  x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
            so = sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];

            int permute_slice=0;
            if ( permute_dim ) {
                permute_slice = shift/rd;
                if ( x<shift%rd ) permute_slice = 1-permute_slice;
            }

            if ( permute_slice ) {
                int internal=sizeof(vobj)/sizeof(vComplex);
                vComplex *optr = (vComplex *)&ret._odata[o+b];
                vComplex *iptr = (vComplex *)&rhs._odata[so+b];
                for(int i=0;i<internal;i++){
                    permute(optr[i],iptr[i],permute_type);
                }
            } else {
                ret._odata[o+b]=rhs._odata[so+b];
            }
        }
        return ret;
    }
#endif
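    // Worked example of the flat-index decomposition above (illustrative
    // numbers only): the single parallel loop serialises the (x,n,b) triple
    // loop so OpenMP load balances without relying on collapse(). With
    // _slice_block = 4 and _slice_nblock = 3, work item ww = 23 unpacks as
    //
    //   b = 23 % 4 = 3;   w = 23 / 4 = 5;
    //   n =  5 % 3 = 2;   w =  5 / 3 = 1;
    //   x = w      = 1;
    //
    // i.e. block 3 within slice-block 2 at reduced coordinate x = 1.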
    // Assignment from a scalar object: broadcast r to every site
    template<class sobj>
    inline Lattice<vobj> & operator = (const sobj & r){
#pragma omp parallel for
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r;
        }
        return *this;
    }

    // Poke a scalar object into the SIMD array
    template<class sobj>
    friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
        if ( l.checkerboard != l._grid->CheckerBoard(site)){
            printf("Poking wrong checkerboard\n");
            exit(EXIT_FAILURE);
        }
        int o_index = l._grid->oIndex(site);
        int i_index = l._grid->iIndex(site);
        Real *v_ptr = (Real *)&l._odata[o_index];
        Real *s_ptr = (Real *)&s;
        v_ptr = v_ptr + 2*i_index;
        // copy one complex pair per iteration, striding over the SIMD lanes
        for(int i=0;i<sizeof(sobj);i+=2*sizeof(Real)){
            v_ptr[0] = s_ptr[0];
            v_ptr[1] = s_ptr[1];
            v_ptr+=2*vComplex::Nsimd();
            s_ptr+=2;
        }
        return;
    };

    // Peek a scalar object from the SIMD array
    template<class sobj>
    friend void peekSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
        // FIXME : define exceptions set and throw up.
        if ( l.checkerboard != l._grid->CheckerBoard(site)){
            printf("Peeking wrong checkerboard\n");
            exit(EXIT_FAILURE);
        }
        int o_index = l._grid->oIndex(site);
        int i_index = l._grid->iIndex(site);
        Real *v_ptr = (Real *)&l._odata[o_index];
        Real *s_ptr = (Real *)&s;
        v_ptr = v_ptr + 2*i_index;
        for(int i=0;i<sizeof(sobj);i+=2*sizeof(Real)){
            s_ptr[0] = v_ptr[0];
            s_ptr[1] = v_ptr[1];
            v_ptr+=2*vComplex::Nsimd();
            s_ptr+=2;
        }
        return;
    };

    friend void random(Lattice<vobj> &l){
        Real *v_ptr = (Real *)&l._odata[0];
        size_t v_len = l._grid->oSites()*sizeof(vobj);
        size_t d_len = v_len/sizeof(Real);
        for(int i=0;i<d_len;i++){
            v_ptr[i]=drand48();
        }
    };

    friend void gaussian(Lattice<vobj> &l){
        // Zero mean, unit variance.
        std::default_random_engine generator;
        std::normal_distribution<double> distribution(0.0,1.0);
        Real *v_ptr = (Real *)&l._odata[0];
        size_t v_len = l._grid->oSites()*sizeof(vobj);
        size_t d_len = v_len/sizeof(Real);
        // Not a parallel RNG. Could make up some seed per 4d site, seed
        // per hypercube type scheme.
        for(int i=0;i<d_len;i++){
            v_ptr[i]=distribution(generator);
        }
    };

    // Unary minus
    inline friend Lattice<vobj> operator -(const Lattice<vobj> &r) {
        Lattice<vobj> ret(r._grid);
#pragma omp parallel for
        for(int ss=0;ss<r._grid->oSites();ss++){
            ret._odata[ss]= -r._odata[ss];
        }
        return ret;
    }

    // *=,+=,-= operators
    template<class T>
    inline Lattice<vobj> &operator *=(const T &r) {
        *this = (*this)*r;
        return *this;
    }
    template<class T>
    inline Lattice<vobj> &operator -=(const T &r) {
        *this = (*this)-r;
        return *this;
    }
    template<class T>
    inline Lattice<vobj> &operator +=(const T &r) {
        *this = (*this)+r;
        return *this;
    }

    inline friend Lattice<iScalar<vComplex> > _trace(const Lattice<vobj> &lhs){
        Lattice<iScalar<vComplex> > ret(lhs._grid);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = trace(lhs._odata[ss]);
        }
        return ret;
    };

    inline friend Lattice<iScalar<iScalar<vComplex> > > trace(const Lattice<vobj> &lhs){
        Lattice<iScalar<iScalar<vComplex> > > ret(lhs._grid);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = trace(lhs._odata[ss]);
        }
        return ret;
    };

    inline friend Lattice<vobj> adj(const Lattice<vobj> &lhs){
        Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = adj(lhs._odata[ss]);
        }
        return ret;
    };

    inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
        Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
        for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = conj(lhs._odata[ss]);
        }
        return ret;
    };

    // remove and insert a half checkerboard
    friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
#pragma omp parallel for
        for(int ss=0;ss<half._grid->oSites();ss++){
            half._odata[ss] = full._odata[ss*2+cb];
        }
        half.checkerboard = cb;
    }
    friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
        int cb = half.checkerboard;
#pragma omp parallel for
        for(int ss=0;ss<half._grid->oSites();ss++){
            full._odata[ss*2+cb]=half._odata[ss];
        }
    }

}; // class Lattice

/* Need to implement the multiplication return type matching
   S S -> S, S M -> M, M S -> M through all nested possibilities.
   template<template<class> class lhs,template<class> class rhs>
   class MultTypeSelector {
       template<typename T> using ltype = lhs<T>;
       typedef lhs type;
   };
*/

template<class obj1,class obj2>
void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
    assert(lhs._grid == rhs._grid);
    assert(lhs.checkerboard == rhs.checkerboard);
}

template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    conformable(lhs,rhs);
    uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
    for(int ss=0;ss<vec_len;ss++){
        mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
    }
}

template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    conformable(lhs,rhs);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
        sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
    }
}

template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    conformable(lhs,rhs);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
        add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
    }
}
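// Return-type sketch for the mixed-type operators below: decltype on a
// single-site expression generates S*S -> S, S*M -> M, M*S -> M without the
// trait class sketched in the MultTypeSelector comment above. The nested
// site types iScalar/iMatrix here are assumed from the rest of the library:
//
//   Lattice<iScalar<vComplex> >   s(&grid);
//   Lattice<iMatrix<vComplex,3> > m(&grid);
//   auto sm = s*m;   // deduced as Lattice<decltype(s._odata[0]*m._odata[0])>,
//                    // i.e. a matrix-valued lattice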
// Lattice BinOp Lattice
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
    //NB mult performs conformable check. Do not reapply here for performance.
    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
    mult(ret,lhs,rhs);
    return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
    //NB add performs conformable check. Do not reapply here for performance.
    Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
    add(ret,lhs,rhs);
    return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
    //NB sub performs conformable check. Do not reapply here for performance.
    Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
    sub(ret,lhs,rhs);
    return ret;
}

// Scalar BinOp Lattice ; generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
    std::cerr <<"Oscalar * Lattice calling mult"<<std::endl;
    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
        ret._odata[ss]=lhs*rhs._odata[ss];
    }
    return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
    Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
        ret._odata[ss]=lhs+rhs._odata[ss];
    }
    return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
        ret._odata[ss]=lhs-rhs._odata[ss];
    }
    return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
    std::cerr <<"Lattice * Oscalar calling mult"<<std::endl;
    Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites(); ss++){
        ret._odata[ss]=lhs._odata[ss]*rhs;
    }
    return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
    Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites(); ss++){
        ret._odata[ss]=lhs._odata[ss]+rhs;
    }
    return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
    Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites(); ss++){
        ret._odata[ss]=lhs._odata[ss]-rhs;
    }
    return ret;
}

}
#endif