#ifndef GRID_LATTICE_H #define GRID_LATTICE_H #include "Grid.h" namespace Grid { // Permute the pointers 32bitx16 = 512 static int permute_map[4][16] = { { 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, { 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}, { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}, { 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8} }; template class Lattice { public: SimdGrid *_grid; int checkerboard; std::vector > _odata; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; public: Lattice(SimdGrid *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); assert((((uint64_t)&_odata[0])&0xF) ==0); checkerboard=0; } #include // overloading Grid::conformable but no conformable in Grid ...?:w template friend void conformable(const Lattice &lhs,const Lattice &rhs); // Performance difference between operator * and mult is troubling. // Auto move constructor seems to lose surprisingly much. // Site wise binary operations // We eliminate a temporary object assignment if use the mult,add,sub routines. // For the operator versions we rely on move constructor to eliminate the // vector copy back. template friend void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs); template friend void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs); template friend void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs); friend void axpy(Lattice &ret,double a,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]); } } friend void axpy(Lattice &ret,std::complex a,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]); } } inline friend Lattice operator / (const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss]; } return ret; }; template inline Lattice & operator = (const sobj & r){ #pragma omp parallel for for(int ss=0;ss<_grid->oSites();ss++){ this->_odata[ss]=r; } return *this; } // Poke a scalar object into the SIMD array template friend void pokeSite(const sobj &s,Lattice &l,std::vector &site){ typedef typename vobj::scalar_type stype; typedef typename vobj::vector_type vtype; assert( l.checkerboard == l._grid->CheckerBoard(site)); int o_index = l._grid->oIndex(site); int i_index = l._grid->iIndex(site); stype *v_ptr = (stype *)&l._odata[o_index]; stype *s_ptr = (stype *)&s; v_ptr = v_ptr + 2*i_index; for(int i=0;i friend void peekSite(sobj &s,const Lattice &l,std::vector &site){ typedef typename vobj::scalar_type stype; typedef typename vobj::vector_type vtype; assert( l.checkerboard== l._grid->CheckerBoard(site)); int o_index = l._grid->oIndex(site); int i_index = l._grid->iIndex(site); stype *v_ptr = (stype *)&l._odata[o_index]; stype *s_ptr = (stype *)&s; v_ptr = v_ptr + 2*i_index; for(int i=0;i &l){ Real *v_ptr = (Real *)&l._odata[0]; size_t v_len = l._grid->oSites()*sizeof(vobj); size_t d_len = v_len/sizeof(Real); for(int i=0;i &l){ Real *v_ptr = (Real *)&l._odata[0]; size_t o_len = l._grid->oSites(); size_t v_len = sizeof(vobj)/sizeof(vRealF); size_t vec_len = vRealF::Nsimd(); for(int i=0;i &l){ // Zero mean, unit variance. std::normal_distribution distribution(0.0,1.0); Real *v_ptr = (Real *)&l._odata[0]; size_t v_len = l._grid->oSites()*sizeof(vobj); size_t d_len = v_len/sizeof(Real); // Not a parallel RNG. Could make up some seed per 4d site, seed // per hypercube type scheme. for(int i=0;i operator -(const Lattice &r) { Lattice ret(r._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss]= -r._odata[ss]; } return ret; } // *=,+=,-= operators template inline Lattice &operator *=(const T &r) { *this = (*this)*r; return *this; } template inline Lattice &operator -=(const T &r) { *this = (*this)-r; return *this; } template inline Lattice &operator +=(const T &r) { *this = (*this)+r; return *this; } inline friend Lattice > _trace(const Lattice &lhs){ Lattice > ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = trace(lhs._odata[ss]); } return ret; }; inline friend Lattice > > trace(const Lattice &lhs){ Lattice > > ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = trace(lhs._odata[ss]); } return ret; }; inline friend Lattice adj(const Lattice &lhs){ Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = adj(lhs._odata[ss]); } return ret; }; inline friend Lattice conj(const Lattice &lhs){ Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = conj(lhs._odata[ss]); } return ret; }; // remove and insert a half checkerboard friend void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.checkerboard = cb; int ssh=0; #pragma omp parallel for for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; full._grid->CoordFromOsite(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { half._odata[ssh] = full._odata[ss]; ssh++; } } } friend void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.checkerboard; int ssh=0; #pragma omp parallel for for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; full._grid->CoordFromOsite(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { full._odata[ss]=half._odata[ssh]; ssh++; } } } }; // class Lattice template void conformable(const Lattice &lhs,const Lattice &rhs) { assert(lhs._grid == rhs._grid); assert(lhs.checkerboard == rhs.checkerboard); } template void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); uint32_t vec_len = lhs._grid->oSites(); #pragma omp parallel for for(int ss=0;ss void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); } } template void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); } } // Lattice BinOp Lattice, template inline auto operator * (const Lattice &lhs,const Lattice &rhs)-> Lattice { //NB mult performs conformable check. Do not reapply here for performance. Lattice ret(rhs._grid); mult(ret,lhs,rhs); return ret; } template inline auto operator + (const Lattice &lhs,const Lattice &rhs)-> Lattice { //NB mult performs conformable check. Do not reapply here for performance. Lattice ret(rhs._grid); add(ret,lhs,rhs); return ret; } template inline auto operator - (const Lattice &lhs,const Lattice &rhs)-> Lattice { //NB mult performs conformable check. Do not reapply here for performance. Lattice ret(rhs._grid); sub(ret,lhs,rhs); return ret; } // Scalar BinOp Lattice ;generate return type template inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { std::cerr <<"Oscalar * Lattice calling mult"< ret(rhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs*rhs._odata[ss]; } return ret; } template inline auto operator + (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs+rhs._odata[ss]; } return ret; } template inline auto operator - (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs-rhs._odata[ss]; } return ret; } template inline auto operator * (const Lattice &lhs,const right &rhs) -> Lattice { std::cerr <<"Lattice * Oscalar calling mult"< ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]*rhs; } return ret; } template inline auto operator + (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]+rhs; } return ret; } template inline auto operator - (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]-rhs; } return ret; } } #endif