#ifndef GRID_LATTICE_H #define GRID_LATTICE_H #include "Grid.h" namespace dpo { // Permute the pointers 32bitx16 = 512 static int permute_map[4][16] = { { 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, { 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}, { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}, { 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8} }; template class Lattice { public: SimdGrid *_grid; int checkerboard; std::vector > _odata; public: Lattice(SimdGrid *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); if ( ((uint64_t)&_odata[0])&0xF) { exit(-1); } checkerboard=0; } #ifdef GRID_COMMS_NONE #include #endif #ifdef GRID_COMMS_FAKE #include #endif #ifdef GRID_COMMS_MPI #include #endif // overloading dpo::conformable but no conformable in dpo ...?:w template friend void conformable(const Lattice &lhs,const Lattice &rhs); // Performance difference between operator * and mult is troubling. // Auto move constructor seems to lose surprisingly much. // Site wise binary operations // We eliminate a temporary object assignment if use the mult,add,sub routines. // For the operator versions we rely on move constructor to eliminate the // vector copy back. template friend void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs); template friend void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs); template friend void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs); friend void axpy(Lattice &ret,double a,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]); } } friend void axpy(Lattice &ret,std::complex a,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]); } } inline friend Lattice operator / (const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss]; } return ret; }; template inline Lattice & operator = (const sobj & r){ #pragma omp parallel for for(int ss=0;ss<_grid->oSites();ss++){ this->_odata[ss]=r; } return *this; } // Poke a scalar object into the SIMD array template friend void pokeSite(const sobj &s,Lattice &l,std::vector &site){ if ( l.checkerboard != l._grid->CheckerBoard(site)){ printf("Poking wrong checkerboard\n"); exit(EXIT_FAILURE); } int o_index = l._grid->oIndex(site); int i_index = l._grid->iIndex(site); // BUGGY. This assumes complex real Real *v_ptr = (Real *)&l._odata[o_index]; Real *s_ptr = (Real *)&s; v_ptr = v_ptr + 2*i_index; for(int i=0;i friend void peekSite(sobj &s,const Lattice &l,std::vector &site){ // FIXME : define exceptions set and throw up. if ( l.checkerboard != l._grid->CheckerBoard(site)){ printf("Peeking wrong checkerboard\n"); exit(EXIT_FAILURE); } int o_index = l._grid->oIndex(site); int i_index = l._grid->iIndex(site); Real *v_ptr = (Real *)&l._odata[o_index]; Real *s_ptr = (Real *)&s; v_ptr = v_ptr + 2*i_index; for(int i=0;i &l){ Real *v_ptr = (Real *)&l._odata[0]; size_t v_len = l._grid->oSites()*sizeof(vobj); size_t d_len = v_len/sizeof(Real); for(int i=0;i &l){ Real *v_ptr = (Real *)&l._odata[0]; size_t o_len = l._grid->oSites(); size_t v_len = sizeof(vobj)/sizeof(vRealF); size_t vec_len = vRealF::Nsimd(); for(int i=0;i &l){ // Zero mean, unit variance. std::normal_distribution distribution(0.0,1.0); Real *v_ptr = (Real *)&l._odata[0]; size_t v_len = l._grid->oSites()*sizeof(vobj); size_t d_len = v_len/sizeof(Real); // Not a parallel RNG. Could make up some seed per 4d site, seed // per hypercube type scheme. for(int i=0;i operator -(const Lattice &r) { Lattice ret(r._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss]= -r._odata[ss]; } return ret; } // *=,+=,-= operators template inline Lattice &operator *=(const T &r) { *this = (*this)*r; return *this; } template inline Lattice &operator -=(const T &r) { *this = (*this)-r; return *this; } template inline Lattice &operator +=(const T &r) { *this = (*this)+r; return *this; } inline friend Lattice > _trace(const Lattice &lhs){ Lattice > ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = trace(lhs._odata[ss]); } return ret; }; inline friend Lattice > > trace(const Lattice &lhs){ Lattice > > ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = trace(lhs._odata[ss]); } return ret; }; inline friend Lattice adj(const Lattice &lhs){ Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = adj(lhs._odata[ss]); } return ret; }; inline friend Lattice conj(const Lattice &lhs){ Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ ret._odata[ss] = conj(lhs._odata[ss]); } return ret; }; // remove and insert a half checkerboard friend void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.checkerboard = cb; int ssh=0; #pragma omp parallel for for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; full._grid->CoordFromOsite(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { half._odata[ssh] = full._odata[ss]; ssh++; } } } friend void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.checkerboard; int ssh=0; #pragma omp parallel for for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; full._grid->CoordFromOsite(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { full._odata[ss]=half._odata[ssh]; ssh++; } } } }; // class Lattice /* Need to implement the multiplication return type matching S S -> S, S M -> M, M S -> M through all nested possibilities. template class lhs,template class rhs> class MultTypeSelector { template using ltype = lhs typedef lhs type; }; */ template void conformable(const Lattice &lhs,const Lattice &rhs) { assert(lhs._grid == rhs._grid); assert(lhs.checkerboard == rhs.checkerboard); } template void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); uint32_t vec_len = lhs._grid->oSites(); #pragma omp parallel for for(int ss=0;ss void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); } } template void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); #pragma omp parallel for for(int ss=0;ssoSites();ss++){ add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); } } // Lattice BinOp Lattice, template inline auto operator * (const Lattice &lhs,const Lattice &rhs)-> Lattice { //NB mult performs conformable check. Do not reapply here for performance. Lattice ret(rhs._grid); mult(ret,lhs,rhs); return ret; } template inline auto operator + (const Lattice &lhs,const Lattice &rhs)-> Lattice { //NB mult performs conformable check. Do not reapply here for performance. Lattice ret(rhs._grid); add(ret,lhs,rhs); return ret; } template inline auto operator - (const Lattice &lhs,const Lattice &rhs)-> Lattice { //NB mult performs conformable check. Do not reapply here for performance. Lattice ret(rhs._grid); sub(ret,lhs,rhs); return ret; } // Scalar BinOp Lattice ;generate return type template inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { std::cerr <<"Oscalar * Lattice calling mult"< ret(rhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs*rhs._odata[ss]; } return ret; } template inline auto operator + (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs+rhs._odata[ss]; } return ret; } template inline auto operator - (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs-rhs._odata[ss]; } return ret; } template inline auto operator * (const Lattice &lhs,const right &rhs) -> Lattice { std::cerr <<"Lattice * Oscalar calling mult"< ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]*rhs; } return ret; } template inline auto operator + (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]+rhs; } return ret; } template inline auto operator - (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]-rhs; } return ret; } } #endif