mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/hmc_generalise
This commit is contained in:
		
							
								
								
									
										33
									
								
								lib/lattice/Lattice.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								lib/lattice/Lattice.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,33 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/lattice/Lattice.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_LATTICE_H
 | 
			
		||||
#define GRID_LATTICE_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/lattice/Lattice_base.h>
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -39,8 +39,7 @@ namespace Grid {
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
@@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
@@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
@@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(lhs,ret);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mult(&tmp,&lhs._odata[ss],&rhs);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
@@ -120,8 +115,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,lhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mac(&tmp,&lhs._odata[ss],&rhs);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
@@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,lhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      sub(&tmp,&lhs._odata[ss],&rhs);
 | 
			
		||||
@@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(lhs,ret);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      add(&tmp,&lhs._odata[ss],&rhs);
 | 
			
		||||
@@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mult(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
@@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mac(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
@@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      sub(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
@@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      add(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
@@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    ret.checkerboard = x.checkerboard;
 | 
			
		||||
    conformable(ret,x);
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
@@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    ret.checkerboard = x.checkerboard;
 | 
			
		||||
    conformable(ret,x);
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
 
 | 
			
		||||
@@ -121,8 +121,7 @@ public:
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
@@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
@@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      //vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,eval(ss,expr));
 | 
			
		||||
@@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    _odata.resize(_grid->oSites());
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
@@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    _odata.resize(_grid->oSites());
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
@@ -235,82 +230,79 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    _odata.resize(_grid->oSites());
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      vstream(_odata[ss] ,eval(ss,expr));
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Constructor requires "grid" passed.
 | 
			
		||||
    // what about a default grid?
 | 
			
		||||
    //////////////////////////////////////////////////////////////////
 | 
			
		||||
    Lattice(GridBase *grid) : _odata(grid->oSites()) {
 | 
			
		||||
        _grid = grid;
 | 
			
		||||
  //////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Constructor requires "grid" passed.
 | 
			
		||||
  // what about a default grid?
 | 
			
		||||
  //////////////////////////////////////////////////////////////////
 | 
			
		||||
  Lattice(GridBase *grid) : _odata(grid->oSites()) {
 | 
			
		||||
    _grid = grid;
 | 
			
		||||
    //        _odata.reserve(_grid->oSites());
 | 
			
		||||
    //        _odata.resize(_grid->oSites());
 | 
			
		||||
    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
 | 
			
		||||
        assert((((uint64_t)&_odata[0])&0xF) ==0);
 | 
			
		||||
        checkerboard=0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Lattice(const Lattice& r){ // copy constructor
 | 
			
		||||
      _grid = r._grid;
 | 
			
		||||
      checkerboard = r.checkerboard;
 | 
			
		||||
    	_odata.resize(_grid->oSites());// essential
 | 
			
		||||
  	PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
          _odata[ss]=r._odata[ss];
 | 
			
		||||
        }  	
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    virtual ~Lattice(void) = default;
 | 
			
		||||
    assert((((uint64_t)&_odata[0])&0xF) ==0);
 | 
			
		||||
    checkerboard=0;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  Lattice(const Lattice& r){ // copy constructor
 | 
			
		||||
    _grid = r._grid;
 | 
			
		||||
    checkerboard = r.checkerboard;
 | 
			
		||||
    _odata.resize(_grid->oSites());// essential
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      _odata[ss]=r._odata[ss];
 | 
			
		||||
    }  	
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  virtual ~Lattice(void) = default;
 | 
			
		||||
    
 | 
			
		||||
    void reset(GridBase* grid) {
 | 
			
		||||
      if (_grid != grid) {
 | 
			
		||||
        _grid = grid;
 | 
			
		||||
        _odata.resize(grid->oSites());
 | 
			
		||||
        checkerboard = 0;
 | 
			
		||||
      }
 | 
			
		||||
  void reset(GridBase* grid) {
 | 
			
		||||
    if (_grid != grid) {
 | 
			
		||||
      _grid = grid;
 | 
			
		||||
      _odata.resize(grid->oSites());
 | 
			
		||||
      checkerboard = 0;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
            this->_odata[ss]=r;
 | 
			
		||||
        }
 | 
			
		||||
        return *this;
 | 
			
		||||
  template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      this->_odata[ss]=r;
 | 
			
		||||
    }
 | 
			
		||||
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
 | 
			
		||||
      this->checkerboard = r.checkerboard;
 | 
			
		||||
      conformable(*this,r);
 | 
			
		||||
      
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
            this->_odata[ss]=r._odata[ss];
 | 
			
		||||
        }
 | 
			
		||||
        return *this;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
 | 
			
		||||
    this->checkerboard = r.checkerboard;
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    
 | 
			
		||||
    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      this->_odata[ss]=r._odata[ss];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
 | 
			
		||||
    template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
 | 
			
		||||
        *this = (*this)*r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
 | 
			
		||||
        *this = (*this)-r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
 | 
			
		||||
        *this = (*this)+r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
 }; // class Lattice
 | 
			
		||||
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
 | 
			
		||||
  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
 | 
			
		||||
    *this = (*this)*r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
 | 
			
		||||
    *this = (*this)-r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
 | 
			
		||||
    *this = (*this)+r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
}; // class Lattice
 | 
			
		||||
  
 | 
			
		||||
  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
 | 
			
		||||
    std::vector<int> gcoor;
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
@@ -328,7 +320,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    }
 | 
			
		||||
    return stream;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -45,90 +45,87 @@ namespace Grid {
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<class vfunctor,class lobj,class robj>  
 | 
			
		||||
    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<vInteger> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	  ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<vInteger> ret(rhs._grid);
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // compare lattice to scalar
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class vfunctor,class lobj,class robj> 
 | 
			
		||||
  template<class vfunctor,class lobj,class robj> 
 | 
			
		||||
    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<vInteger> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites(); ss++){
 | 
			
		||||
	  ret._odata[ss]=op(lhs._odata[ss],rhs);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<vInteger> ret(lhs._grid);
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
 | 
			
		||||
      ret._odata[ss]=op(lhs._odata[ss],rhs);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // compare scalar to lattice
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class vfunctor,class lobj,class robj> 
 | 
			
		||||
  template<class vfunctor,class lobj,class robj> 
 | 
			
		||||
    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<vInteger> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	  ret._odata[ss]=op(lhs._odata[ss],rhs);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<vInteger> ret(rhs._grid);
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      ret._odata[ss]=op(lhs._odata[ss],rhs);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Map to functors
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Less than
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return LLComparison(vlt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     return LSComparison(vlt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return SLComparison(vlt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
 | 
			
		||||
   // Less than equal
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return LLComparison(vle<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     return LSComparison(vle<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return SLComparison(vle<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
 | 
			
		||||
   // Greater than 
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return LLComparison(vgt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     return LSComparison(vgt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
  // Less than
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
    return LLComparison(vlt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
    return LSComparison(vlt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
    return SLComparison(vlt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Less than equal
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
    return LLComparison(vle<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
    return LSComparison(vle<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
    return SLComparison(vle<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // Greater than 
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
    return LLComparison(vgt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
    return LSComparison(vgt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
  }
 | 
			
		||||
  template<class lobj,class robj>
 | 
			
		||||
    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return SLComparison(vgt<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
   // Greater than equal
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  // Greater than equal
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return LLComparison(vge<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
@@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
     return LSComparison(vge<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return SLComparison(vge<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
 | 
			
		||||
   
 | 
			
		||||
   // equal
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return LLComparison(veq<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     return LSComparison(veq<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return SLComparison(veq<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
   
 | 
			
		||||
   
 | 
			
		||||
   // not equal
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return LLComparison(vne<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
 | 
			
		||||
     return LSComparison(vne<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
   template<class lobj,class robj>
 | 
			
		||||
   inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
 | 
			
		||||
     return SLComparison(vne<lobj,robj>(),lhs,rhs);
 | 
			
		||||
   }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
    /////////////////////////////////////////////////////
 | 
			
		||||
    // Non site, reduced locally reduced routines
 | 
			
		||||
    /////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
    // localNorm2,
 | 
			
		||||
    template<class vobj>
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  // Non site, reduced locally reduced routines
 | 
			
		||||
  /////////////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
  // localNorm2,
 | 
			
		||||
  template<class vobj>
 | 
			
		||||
    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    // localInnerProduct
 | 
			
		||||
    template<class vobj>
 | 
			
		||||
  
 | 
			
		||||
  // localInnerProduct
 | 
			
		||||
  template<class vobj>
 | 
			
		||||
    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    // outerProduct Scalar x Scalar -> Scalar
 | 
			
		||||
    //              Vector x Vector -> Matrix
 | 
			
		||||
    template<class ll,class rr>
 | 
			
		||||
  
 | 
			
		||||
  // outerProduct Scalar x Scalar -> Scalar
 | 
			
		||||
  //              Vector x Vector -> Matrix
 | 
			
		||||
  template<class ll,class rr>
 | 
			
		||||
    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
 | 
			
		||||
    {
 | 
			
		||||
        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
     }
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -37,8 +37,7 @@ namespace Grid {
 | 
			
		||||
  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<vobj> ret(r._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<r._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
 | 
			
		||||
      vstream(ret._odata[ss], -r._odata[ss]);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
@@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss]; 
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
	   //      ret._odata[ss]=lhs*rhs._odata[ss];
 | 
			
		||||
@@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	// operator + must add the operands: the declared type of tmp and the
	// commented-out reference form below both use lhs+rhs (was lhs-rhs).
	decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
 | 
			
		||||
	vstream(ret._odata[ss],tmp);
 | 
			
		||||
	//	ret._odata[ss]=lhs+rhs._odata[ss];
 | 
			
		||||
@@ -98,11 +95,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];  
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
      //      ret._odata[ss]=lhs-rhs._odata[ss];
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
@@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<lhs._grid->oSites(); ss++){
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
 | 
			
		||||
	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
 | 
			
		||||
	vstream(ret._odata[ss],tmp);
 | 
			
		||||
	//            ret._odata[ss]=lhs._odata[ss]*rhs;
 | 
			
		||||
@@ -122,8 +116,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
 | 
			
		||||
    {
 | 
			
		||||
        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; 
 | 
			
		||||
	  vstream(ret._odata[ss],tmp);
 | 
			
		||||
	  //	  ret._odata[ss]=lhs._odata[ss]+rhs;
 | 
			
		||||
@@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
 | 
			
		||||
	  vstream(ret._odata[ss],tmp);
 | 
			
		||||
	  //	ret._odata[ss]=lhs._odata[ss]-rhs;
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -44,22 +44,20 @@ namespace Grid {
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
 | 
			
		||||
      ret.checkerboard=lhs.checkerboard;
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    };
 | 
			
		||||
    template<int Index,class vobj>
 | 
			
		||||
       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
 | 
			
		||||
      auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
 | 
			
		||||
      ret.checkerboard=lhs.checkerboard;
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    template<int Index,class vobj> 
 | 
			
		||||
    void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
 | 
			
		||||
    {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
 | 
			
		||||
	}      
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
 | 
			
		||||
      }      
 | 
			
		||||
    }
 | 
			
		||||
    template<int Index,class vobj>
 | 
			
		||||
      void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
 | 
			
		||||
    {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
 | 
			
		||||
	}      
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
 | 
			
		||||
      }      
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////////
 | 
			
		||||
@@ -131,9 +127,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
      assert( l.checkerboard == l._grid->CheckerBoard(site));
 | 
			
		||||
 | 
			
		||||
      // FIXME
 | 
			
		||||
      //      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
 | 
			
		||||
 | 
			
		||||
      int rank,odx,idx;
 | 
			
		||||
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -40,8 +40,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 | 
			
		||||
        Lattice<vobj> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
            ret._odata[ss] = adj(lhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
@@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
 | 
			
		||||
        Lattice<vobj> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
            ret._odata[ss] = conjugate(lhs._odata[ss]);
 | 
			
		||||
	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	  ret._odata[ss] = conjugate(lhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -34,197 +34,191 @@ namespace Grid {
 | 
			
		||||
#ifdef GRID_WARN_SUBOPTIMAL
 | 
			
		||||
#warning "Optimisation alert all these reduction loops are NOT threaded "
 | 
			
		||||
#endif     
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Deterministic Reduction operations
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline RealD norm2(const Lattice<vobj> &arg) {
 | 
			
		||||
  ComplexD nrm = innerProduct(arg, arg);
 | 
			
		||||
  return std::real(nrm);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline ComplexD innerProduct(const Lattice<vobj> &left,
 | 
			
		||||
                             const Lattice<vobj> &right) {
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  scalar_type nrm;
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = left._grid;
 | 
			
		||||
 | 
			
		||||
  std::vector<vector_type, alignedAllocator<vector_type> > sumarray(
 | 
			
		||||
      grid->SumArraySize());
 | 
			
		||||
  for (int i = 0; i < grid->SumArraySize(); i++) {
 | 
			
		||||
    sumarray[i] = zero;
 | 
			
		||||
  
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Deterministic Reduction operations
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template <class vobj>
 | 
			
		||||
  inline RealD norm2(const Lattice<vobj> &arg) {
 | 
			
		||||
    ComplexD nrm = innerProduct(arg, arg);
 | 
			
		||||
    return std::real(nrm);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  PARALLEL_FOR_LOOP
 | 
			
		||||
  for (int thr = 0; thr < grid->SumArraySize(); thr++) {
 | 
			
		||||
    int nwork, mywork, myoff;
 | 
			
		||||
    GridThread::GetWork(left._grid->oSites(), thr, mywork, myoff);
 | 
			
		||||
 | 
			
		||||
    decltype(innerProduct(left._odata[0], right._odata[0])) vnrm =
 | 
			
		||||
        zero;  // private to thread; sub summation
 | 
			
		||||
    for (int ss = myoff; ss < mywork + myoff; ss++) {
 | 
			
		||||
      vnrm = vnrm + innerProduct(left._odata[ss], right._odata[ss]);
 | 
			
		||||
  
 | 
			
		||||
  template <class vobj>
 | 
			
		||||
  inline ComplexD innerProduct(const Lattice<vobj> &left,
 | 
			
		||||
			       const Lattice<vobj> &right) {
 | 
			
		||||
    typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
    typedef typename vobj::vector_type vector_type;
 | 
			
		||||
    scalar_type nrm;
 | 
			
		||||
    
 | 
			
		||||
    GridBase *grid = left._grid;
 | 
			
		||||
    
 | 
			
		||||
    std::vector<vector_type, alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
 | 
			
		||||
    for (int i = 0; i < grid->SumArraySize(); i++) {
 | 
			
		||||
      sumarray[i] = zero;
 | 
			
		||||
    }
 | 
			
		||||
    sumarray[thr] = TensorRemove(vnrm);
 | 
			
		||||
  }
 | 
			
		||||
   
 | 
			
		||||
 | 
			
		||||
  vector_type vvnrm;
 | 
			
		||||
  vvnrm = zero;  // sum across threads
 | 
			
		||||
  for (int i = 0; i < grid->SumArraySize(); i++) {
 | 
			
		||||
    vvnrm = vvnrm + sumarray[i];
 | 
			
		||||
  }
 | 
			
		||||
  nrm = Reduce(vvnrm);  // sum across simd
 | 
			
		||||
  right._grid->GlobalSum(nrm);
 | 
			
		||||
  return nrm;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Op, class T1>
 | 
			
		||||
inline auto sum(const LatticeUnaryExpression<Op, T1> &expr) ->
 | 
			
		||||
    typename decltype(
 | 
			
		||||
        expr.first.func(eval(0, std::get<0>(expr.second))))::scalar_object {
 | 
			
		||||
  return sum(closure(expr));
 | 
			
		||||
    
 | 
			
		||||
    parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 | 
			
		||||
      int nwork, mywork, myoff;
 | 
			
		||||
      GridThread::GetWork(left._grid->oSites(), thr, mywork, myoff);
 | 
			
		||||
      
 | 
			
		||||
      decltype(innerProduct(left._odata[0], right._odata[0])) vnrm=zero; // private to thread; sub summation
 | 
			
		||||
      for(int ss = myoff; ss<mywork + myoff; ss++){
 | 
			
		||||
	vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
 | 
			
		||||
      }
 | 
			
		||||
      sumarray[thr]=TensorRemove(vnrm) ;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    vector_type vvnrm;
 | 
			
		||||
    vvnrm=zero;  // sum across threads
 | 
			
		||||
    for(int i=0; i < grid->SumArraySize(); i++){
 | 
			
		||||
      vvnrm = vvnrm + sumarray[i];
 | 
			
		||||
    } 
 | 
			
		||||
    nrm = Reduce(vvnrm);// sum across simd
 | 
			
		||||
    right._grid->GlobalSum(nrm);
 | 
			
		||||
    return nrm;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class Op, class T1>
 | 
			
		||||
  inline auto sum(const LatticeUnaryExpression<Op, T1> &expr) ->
 | 
			
		||||
    typename decltype(expr.first.func(eval(0, std::get<0>(expr.second))))::scalar_object {
 | 
			
		||||
    return sum(closure(expr));
 | 
			
		||||
  }
 | 
			
		||||
						  
 | 
			
		||||
    template<class Op,class T1,class T2>
 | 
			
		||||
      inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
 | 
			
		||||
    inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
 | 
			
		||||
      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
 | 
			
		||||
    {
 | 
			
		||||
      return sum(closure(expr));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
						     
 | 
			
		||||
						     
 | 
			
		||||
    template<class Op,class T1,class T2,class T3>
 | 
			
		||||
      inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 | 
			
		||||
    inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 | 
			
		||||
      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 | 
			
		||||
         eval(0,std::get<1>(expr.second)),
 | 
			
		||||
         eval(0,std::get<2>(expr.second))
 | 
			
		||||
         ))::scalar_object
 | 
			
		||||
					  eval(0,std::get<1>(expr.second)),
 | 
			
		||||
					  eval(0,std::get<2>(expr.second))
 | 
			
		||||
					  ))::scalar_object
 | 
			
		||||
    {
 | 
			
		||||
      return sum(closure(expr));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
						     
 | 
			
		||||
    template<class vobj>
 | 
			
		||||
    inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      GridBase *grid=arg._grid;
 | 
			
		||||
      int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
 | 
			
		||||
      for(int i=0;i<grid->SumArraySize();i++){
 | 
			
		||||
  sumarray[i]=zero;
 | 
			
		||||
	sumarray[i]=zero;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int thr=0;thr<grid->SumArraySize();thr++){
 | 
			
		||||
  int nwork, mywork, myoff;
 | 
			
		||||
  GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
 | 
			
		||||
 | 
			
		||||
  vobj vvsum=zero;
 | 
			
		||||
      
 | 
			
		||||
      parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 | 
			
		||||
	int nwork, mywork, myoff;
 | 
			
		||||
	GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
 | 
			
		||||
	
 | 
			
		||||
	vobj vvsum=zero;
 | 
			
		||||
        for(int ss=myoff;ss<mywork+myoff; ss++){
 | 
			
		||||
    vvsum = vvsum + arg._odata[ss];
 | 
			
		||||
  }
 | 
			
		||||
  sumarray[thr]=vvsum;
 | 
			
		||||
	  vvsum = vvsum + arg._odata[ss];
 | 
			
		||||
	}
 | 
			
		||||
	sumarray[thr]=vvsum;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      vobj vsum=zero;  // sum across threads
 | 
			
		||||
      for(int i=0;i<grid->SumArraySize();i++){
 | 
			
		||||
  vsum = vsum+sumarray[i];
 | 
			
		||||
	vsum = vsum+sumarray[i];
 | 
			
		||||
      } 
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      typedef typename vobj::scalar_object sobj;
 | 
			
		||||
      sobj ssum=zero;
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      std::vector<sobj>               buf(Nsimd);
 | 
			
		||||
      extract(vsum,buf);
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
 | 
			
		||||
      arg._grid->GlobalSum(ssum);
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
      return ssum;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  GridBase  *grid = Data._grid;
 | 
			
		||||
  assert(grid!=NULL);
 | 
			
		||||
 | 
			
		||||
  // FIXME
 | 
			
		||||
  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert(orthogdim >= 0);
 | 
			
		||||
  assert(orthogdim < Nd);
 | 
			
		||||
 | 
			
		||||
  int fd=grid->_fdimensions[orthogdim];
 | 
			
		||||
  int ld=grid->_ldimensions[orthogdim];
 | 
			
		||||
  int rd=grid->_rdimensions[orthogdim];
 | 
			
		||||
 | 
			
		||||
  std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
 | 
			
		||||
  std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
 | 
			
		||||
  std::vector<sobj> extracted(Nsimd);     // splitting the SIMD
 | 
			
		||||
 | 
			
		||||
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
 | 
			
		||||
  for(int r=0;r<rd;r++){
 | 
			
		||||
    lvSum[r]=zero;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::vector<int>  coor(Nd);  
 | 
			
		||||
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
    int r = coor[orthogdim];
 | 
			
		||||
    lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  // Sum across simd lanes in the plane, breaking out orthog dir.
 | 
			
		||||
  std::vector<int> icoor(Nd);
 | 
			
		||||
 | 
			
		||||
  for(int rt=0;rt<rd;rt++){
 | 
			
		||||
 | 
			
		||||
    extract(lvSum[rt],extracted);
 | 
			
		||||
 | 
			
		||||
    for(int idx=0;idx<Nsimd;idx++){
 | 
			
		||||
 | 
			
		||||
      grid->iCoorFromIindex(icoor,idx);
 | 
			
		||||
 | 
			
		||||
      int ldx =rt+icoor[orthogdim]*rd;
 | 
			
		||||
 | 
			
		||||
      lsSum[ldx]=lsSum[ldx]+extracted[idx];
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 | 
			
		||||
  {
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
    GridBase  *grid = Data._grid;
 | 
			
		||||
    assert(grid!=NULL);
 | 
			
		||||
    
 | 
			
		||||
    // FIXME
 | 
			
		||||
    // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    const int    Nd = grid->_ndimension;
 | 
			
		||||
    const int Nsimd = grid->Nsimd();
 | 
			
		||||
    
 | 
			
		||||
    assert(orthogdim >= 0);
 | 
			
		||||
    assert(orthogdim < Nd);
 | 
			
		||||
    
 | 
			
		||||
    int fd=grid->_fdimensions[orthogdim];
 | 
			
		||||
    int ld=grid->_ldimensions[orthogdim];
 | 
			
		||||
    int rd=grid->_rdimensions[orthogdim];
 | 
			
		||||
    
 | 
			
		||||
    std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
 | 
			
		||||
    std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
 | 
			
		||||
    std::vector<sobj> extracted(Nsimd);     // splitting the SIMD
 | 
			
		||||
    
 | 
			
		||||
    result.resize(fd); // And then global sum to return the same vector to every node for IO to file
 | 
			
		||||
    for(int r=0;r<rd;r++){
 | 
			
		||||
      lvSum[r]=zero;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    std::vector<int>  coor(Nd);  
 | 
			
		||||
    
 | 
			
		||||
    // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
    
 | 
			
		||||
    for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
      Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
      int r = coor[orthogdim];
 | 
			
		||||
      lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
			
		||||
    }  
 | 
			
		||||
    
 | 
			
		||||
    // Sum across simd lanes in the plane, breaking out orthog dir.
 | 
			
		||||
    std::vector<int> icoor(Nd);
 | 
			
		||||
    
 | 
			
		||||
    for(int rt=0;rt<rd;rt++){
 | 
			
		||||
      
 | 
			
		||||
      extract(lvSum[rt],extracted);
 | 
			
		||||
      
 | 
			
		||||
      for(int idx=0;idx<Nsimd;idx++){
 | 
			
		||||
	
 | 
			
		||||
	grid->iCoorFromIindex(icoor,idx);
 | 
			
		||||
	
 | 
			
		||||
	int ldx =rt+icoor[orthogdim]*rd;
 | 
			
		||||
	
 | 
			
		||||
	lsSum[ldx]=lsSum[ldx]+extracted[idx];
 | 
			
		||||
	
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    // sum over nodes.
 | 
			
		||||
    sobj gsum;
 | 
			
		||||
    for(int t=0;t<fd;t++){
 | 
			
		||||
      int pt = t/ld; // processor plane
 | 
			
		||||
      int lt = t%ld;
 | 
			
		||||
      if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
			
		||||
	gsum=lsSum[lt];
 | 
			
		||||
      } else {
 | 
			
		||||
	gsum=zero;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      grid->GlobalSum(gsum);
 | 
			
		||||
      
 | 
			
		||||
      result[t]=gsum;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // sum over nodes.
 | 
			
		||||
  sobj gsum;
 | 
			
		||||
  for(int t=0;t<fd;t++){
 | 
			
		||||
    int pt = t/ld; // processor plane
 | 
			
		||||
    int lt = t%ld;
 | 
			
		||||
    if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
			
		||||
      gsum=lsSum[lt];
 | 
			
		||||
    } else {
 | 
			
		||||
      gsum=zero;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    grid->GlobalSum(gsum);
 | 
			
		||||
 | 
			
		||||
    result[t]=gsum;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -30,11 +30,19 @@
 | 
			
		||||
#define GRID_LATTICE_RNG_H
 | 
			
		||||
 | 
			
		||||
#include <random>
 | 
			
		||||
 | 
			
		||||
#ifdef RNG_SITMO
 | 
			
		||||
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
#if defined(RNG_SITMO)
 | 
			
		||||
#define RNG_FAST_DISCARD
 | 
			
		||||
#else 
 | 
			
		||||
#undef  RNG_FAST_DISCARD
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////
 | 
			
		||||
  // Allow the RNG state to be less dense than the fine grid
 | 
			
		||||
  //////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -64,16 +72,19 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; 
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return multiplicity;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
// merge of April 11 2017
 | 
			
		||||
//<<<<<<< HEAD
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
 | 
			
		||||
  // this function is necessary for the LS vectorised field
 | 
			
		||||
  inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
 | 
			
		||||
  {
 | 
			
		||||
    int rngdims = coarse->_ndimension;
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
 | 
			
		||||
    int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
 | 
			
		||||
    // assumes that the higher dimensions are not using more processors
 | 
			
		||||
@@ -92,6 +103,7 @@ namespace Grid {
 | 
			
		||||
    return fine->lSites() / coarse->lSites();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
  // Wrap seed_seq to give common interface with random_device
 | 
			
		||||
  class fixedSeed {
 | 
			
		||||
  public:
 | 
			
		||||
@@ -108,89 +120,140 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
=======
 | 
			
		||||
>>>>>>> develop
 | 
			
		||||
  */
 | 
			
		||||
  
 | 
			
		||||
  // real scalars are one component
 | 
			
		||||
  template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
 | 
			
		||||
  template<class scalar,class distribution,class generator> 
 | 
			
		||||
  void fillScalar(scalar &s,distribution &dist,generator & gen)
 | 
			
		||||
  {
 | 
			
		||||
    s=dist(gen);
 | 
			
		||||
  }
 | 
			
		||||
  template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
 | 
			
		||||
  template<class distribution,class generator> 
 | 
			
		||||
  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
 | 
			
		||||
  {
 | 
			
		||||
    s=ComplexF(dist(gen),dist(gen));
 | 
			
		||||
  }
 | 
			
		||||
  template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
 | 
			
		||||
  template<class distribution,class generator> 
 | 
			
		||||
  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
 | 
			
		||||
  {
 | 
			
		||||
    s=ComplexD(dist(gen),dist(gen));
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  class GridRNGbase {
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    int _seeded;
 | 
			
		||||
    // One generator per site.
 | 
			
		||||
    // Uniform and Gaussian distributions from these generators.
 | 
			
		||||
#ifdef RNG_RANLUX
 | 
			
		||||
    typedef uint64_t      RngStateType;
 | 
			
		||||
    typedef std::ranlux48 RngEngine;
 | 
			
		||||
    typedef uint64_t      RngStateType;
 | 
			
		||||
    static const int RngStateCount = 15;
 | 
			
		||||
#elif RNG_MT19937 
 | 
			
		||||
#endif 
 | 
			
		||||
#ifdef RNG_MT19937 
 | 
			
		||||
    typedef std::mt19937 RngEngine;
 | 
			
		||||
    typedef uint32_t     RngStateType;
 | 
			
		||||
    static const int     RngStateCount = std::mt19937::state_size;
 | 
			
		||||
#elif RNG_SITMO
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef RNG_SITMO
 | 
			
		||||
    typedef sitmo::prng_engine 	RngEngine;
 | 
			
		||||
    typedef uint64_t    	RngStateType;
 | 
			
		||||
    static const int    	RngStateCount = 4;
 | 
			
		||||
#endif
 | 
			
		||||
    std::vector<RngEngine>                             _generators;
 | 
			
		||||
    std::vector<std::uniform_real_distribution<RealD>> _uniform;
 | 
			
		||||
    std::vector<std::normal_distribution<RealD>>       _gaussian;
 | 
			
		||||
    std::vector<std::discrete_distribution<int32_t>>     _bernoulli;
 | 
			
		||||
 | 
			
		||||
    void GetState(std::vector<RngStateType> & saved,int gen) {
 | 
			
		||||
    std::vector<RngEngine>                             _generators;
 | 
			
		||||
    std::vector<std::uniform_real_distribution<RealD> > _uniform;
 | 
			
		||||
    std::vector<std::normal_distribution<RealD> >       _gaussian;
 | 
			
		||||
    std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
 | 
			
		||||
    std::vector<std::uniform_int_distribution<uint32_t> > _uid;
 | 
			
		||||
 | 
			
		||||
    ///////////////////////
 | 
			
		||||
    // support for parallel init
 | 
			
		||||
    ///////////////////////
 | 
			
		||||
#ifdef RNG_FAST_DISCARD
 | 
			
		||||
    /////////////////////////////////////////////////////////////////////////////////////
    // Advance eng by 2^40 (about 10^12) draws using the engine's discard().
    // Intended as the stride separating the streams of successive lattice sites.
    // Original rationale: quenched updating is unlikely to exceed ~1000 sweeps
    // per second, which leaves of order 10^9 seconds of skip-ahead; HMC consumes
    // random numbers far more slowly, so the margin is orders of magnitude.
    // If more were ever needed, Sitmo could be hacked to skip in the higher
    // order words of its state.
    /////////////////////////////////////////////////////////////////////////////////////
    static void Skip(RngEngine &eng)
    {
      const uint64_t stride = static_cast<uint64_t>(1) << 40;
      eng.discard(stride);
    }
 | 
			
		||||
#endif
 | 
			
		||||
    // Convenience overload: derive a new engine from draws taken out of `eng`,
    // using a throw-away seed buffer and a default-constructed 32-bit uniform
    // distribution. `eng` is advanced by the draws consumed.
    static RngEngine Reseed(RngEngine &eng)
    {
      std::uniform_int_distribution<uint32_t> draw32;
      std::vector<uint32_t> scratch;
      return Reseed(eng,scratch,draw32);
    }
 | 
			
		||||
    // Derive a new engine from `eng`.
    //   eng     : parent engine; advanced by the draws made here.
    //   newseed : caller-owned scratch buffer, resized to 4 words and overwritten
    //             with the drawn seed words (left filled on return).
    //   uid     : distribution used to turn engine output into 32-bit seed words.
    // The drawn words are fed through std::seed_seq to construct the returned engine.
    static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
                            std::uniform_int_distribution<uint32_t> &uid)
    {
      const int reseeds=4;

      newseed.resize(reseeds);
      for(uint32_t &word : newseed){
        word = uid(eng);
      }

      std::seed_seq sseq(newseed.begin(),newseed.end());
      return RngEngine(sseq);
    }
 | 
			
		||||
 | 
			
		||||
    void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
 | 
			
		||||
      saved.resize(RngStateCount);
 | 
			
		||||
      std::stringstream ss;
 | 
			
		||||
      ss<<_generators[gen];
 | 
			
		||||
      ss<<eng;
 | 
			
		||||
      ss.seekg(0,ss.beg);
 | 
			
		||||
      for(int i=0;i<RngStateCount;i++){
 | 
			
		||||
        ss>>saved[i];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    void SetState(std::vector<RngStateType> & saved,int gen){
 | 
			
		||||
    void GetState(std::vector<RngStateType> & saved,int gen) {
 | 
			
		||||
      GetState(saved,_generators[gen]);
 | 
			
		||||
    }
 | 
			
		||||
    void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
 | 
			
		||||
      assert(saved.size()==RngStateCount);
 | 
			
		||||
      std::stringstream ss;
 | 
			
		||||
      for(int i=0;i<RngStateCount;i++){
 | 
			
		||||
        ss<< saved[i]<<" ";
 | 
			
		||||
      }
 | 
			
		||||
      ss.seekg(0,ss.beg);
 | 
			
		||||
      ss>>_generators[gen];
 | 
			
		||||
      ss>>eng;
 | 
			
		||||
    }
 | 
			
		||||
    void SetState(std::vector<RngStateType> & saved,int gen){
 | 
			
		||||
      SetState(saved,_generators[gen]);
 | 
			
		||||
    }
 | 
			
		||||
    // Overwrite the generator stored at index `gen` with a copy of `Eng`.
    // No bounds check is performed on `gen`.
    void SetEngine(RngEngine &Eng, int gen){
      RngEngine &slot = _generators[gen];
      slot = Eng;
    }
 | 
			
		||||
    // Copy the generator stored at index `gen` into the caller's `Eng`.
    // No bounds check is performed on `gen`.
    void GetEngine(RngEngine &Eng, int gen){
      const RngEngine &slot = _generators[gen];
      Eng = slot;
    }
 | 
			
		||||
    template<class source> void Seed(source &src, int gen)
 | 
			
		||||
    {
 | 
			
		||||
      _generators[gen] = RngEngine(src);
 | 
			
		||||
    }    
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  class GridSerialRNG : public GridRNGbase {
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    // FIXME ... do we require lockstep draws of randoms 
 | 
			
		||||
    // from all nodes keeping seeds consistent.
 | 
			
		||||
    // place a barrier/broadcast in the fill routine
 | 
			
		||||
    template<class source> void Seed(source &src)
 | 
			
		||||
    {
 | 
			
		||||
      typename source::result_type init = src();
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&init,sizeof(init));
 | 
			
		||||
      _generators[0] = RngEngine(init);
 | 
			
		||||
      _seeded=1;
 | 
			
		||||
    }    
 | 
			
		||||
 | 
			
		||||
    GridSerialRNG() : GridRNGbase() {
 | 
			
		||||
      _generators.resize(1);
 | 
			
		||||
      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
 | 
			
		||||
      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
 | 
			
		||||
      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
 | 
			
		||||
      _seeded=0;
 | 
			
		||||
      _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
 | 
			
		||||
 | 
			
		||||
      typedef typename sobj::scalar_type scalar_type;
 | 
			
		||||
@@ -203,7 +266,7 @@ namespace Grid {
 | 
			
		||||
      for(int idx=0;idx<words;idx++){
 | 
			
		||||
  fillScalar(buf[idx],dist[0],_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
@@ -257,21 +320,16 @@ namespace Grid {
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      dist[0].reset();
 | 
			
		||||
      for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
  fillScalar(pointer[i],dist[0],_generators[0]);
 | 
			
		||||
	fillScalar(pointer[i],dist[0],_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    void SeedRandomDevice(void){
 | 
			
		||||
      std::random_device rd;
 | 
			
		||||
      Seed(rd);
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    void SeedFixedIntegers(const std::vector<int> &seeds){
 | 
			
		||||
      fixedSeed src(seeds);
 | 
			
		||||
      Seed(src);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
 | 
			
		||||
      std::seed_seq src(seeds.begin(),seeds.end());
 | 
			
		||||
      Seed(src,0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  class GridParallelRNG : public GridRNGbase {
 | 
			
		||||
@@ -279,7 +337,6 @@ namespace Grid {
 | 
			
		||||
    double _time_counter;
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    GridBase *_grid;
 | 
			
		||||
    unsigned int _vol;
 | 
			
		||||
 | 
			
		||||
@@ -295,61 +352,11 @@ namespace Grid {
 | 
			
		||||
      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
 | 
			
		||||
      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
 | 
			
		||||
      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
 | 
			
		||||
      _seeded = 0;
 | 
			
		||||
 | 
			
		||||
      _time_counter = 0.0;
 | 
			
		||||
 | 
			
		||||
      _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
 | 
			
		||||
 | 
			
		||||
    // This loop could be made faster to avoid the Amdahl by
 | 
			
		||||
    // i)  seed generators on each timeslice, for x=y=z=0;
 | 
			
		||||
    // ii) seed generators on each z for x=y=0
 | 
			
		||||
    // iii)seed generators on each y,z for x=0
 | 
			
		||||
    // iv) seed generators on each y,z,x 
 | 
			
		||||
    // made possible by physical indexing.
 | 
			
		||||
    template<class source> void Seed(source &src)
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
 | 
			
		||||
      int gsites = _grid->_gsites;
 | 
			
		||||
 | 
			
		||||
      typename source::result_type init = src();
 | 
			
		||||
      RngEngine pseeder(init);
 | 
			
		||||
      std::uniform_int_distribution<uint64_t> ui;
 | 
			
		||||
 | 
			
		||||
      for(int gidx=0;gidx<gsites;gidx++){
 | 
			
		||||
 | 
			
		||||
        int rank,o_idx,i_idx;
 | 
			
		||||
        _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 | 
			
		||||
        _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 | 
			
		||||
        
 | 
			
		||||
        int l_idx=generator_idx(o_idx,i_idx);
 | 
			
		||||
 | 
			
		||||
        const int num_rand_seed=16;
 | 
			
		||||
        std::vector<int> site_seeds(num_rand_seed);
 | 
			
		||||
        for(int i=0;i<site_seeds.size();i++){
 | 
			
		||||
          site_seeds[i]= ui(pseeder);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        _grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size());
 | 
			
		||||
 | 
			
		||||
        if( rank == _grid->ThisRank() ){
 | 
			
		||||
          fixedSeed ssrc(site_seeds);
 | 
			
		||||
          typename source::result_type sinit = ssrc();
 | 
			
		||||
          _generators[l_idx] = RngEngine(sinit);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      _seeded=1;
 | 
			
		||||
    }    
 | 
			
		||||
 | 
			
		||||
    //FIXME implement generic IO and create state save/restore
 | 
			
		||||
    //void SaveState(const std::string<char> &file);
 | 
			
		||||
    //void LoadState(const std::string<char> &file);
 | 
			
		||||
 | 
			
		||||
    template <class vobj, class distribution>
 | 
			
		||||
    inline void fill(Lattice<vobj> &l, std::vector<distribution> &dist) {
 | 
			
		||||
      typedef typename vobj::scalar_object scalar_object;
 | 
			
		||||
      typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
      typedef typename vobj::vector_type vector_type;
 | 
			
		||||
@@ -357,14 +364,11 @@ namespace Grid {
 | 
			
		||||
      double inner_time_counter = usecond();
 | 
			
		||||
 | 
			
		||||
      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
 | 
			
		||||
      int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
 | 
			
		||||
      int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
 | 
			
		||||
      int words  = sizeof(scalar_object) / sizeof(scalar_type);
 | 
			
		||||
 | 
			
		||||
      int Nsimd = _grid->Nsimd();// guaranteed to be the same for l._grid too
 | 
			
		||||
      int osites = _grid->oSites();// guaranteed to be <= l._grid->oSites() by a factor multiplicity
 | 
			
		||||
      int words = sizeof(scalar_object) / sizeof(scalar_type);
 | 
			
		||||
 | 
			
		||||
      PARALLEL_FOR_LOOP
 | 
			
		||||
      for (int ss = 0; ss < osites; ss++) {
 | 
			
		||||
 | 
			
		||||
      parallel_for(int ss=0;ss<osites;ss++){
 | 
			
		||||
        std::vector<scalar_object> buf(Nsimd);
 | 
			
		||||
        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
 | 
			
		||||
 | 
			
		||||
@@ -386,13 +390,79 @@ namespace Grid {
 | 
			
		||||
      _time_counter += usecond()- inner_time_counter;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    void SeedRandomDevice(void) {
 | 
			
		||||
      std::random_device rd;
 | 
			
		||||
      Seed(rd);
 | 
			
		||||
    }
 | 
			
		||||
    void SeedFixedIntegers(const std::vector<int> &seeds) {
 | 
			
		||||
      fixedSeed src(seeds);
 | 
			
		||||
      Seed(src);
 | 
			
		||||
    void SeedFixedIntegers(const std::vector<int> &seeds){
 | 
			
		||||
 | 
			
		||||
      // Everyone generates the same seed_seq based on input seeds
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
 | 
			
		||||
 | 
			
		||||
      std::seed_seq source(seeds.begin(),seeds.end());
 | 
			
		||||
 | 
			
		||||
      RngEngine master_engine(source);
 | 
			
		||||
 | 
			
		||||
#ifdef RNG_FAST_DISCARD
 | 
			
		||||
      ////////////////////////////////////////////////
 | 
			
		||||
      // Skip ahead through a single stream.
 | 
			
		||||
      // Applicable to SITMO and other hash based/crypto RNGs
 | 
			
		||||
      // Should be applicable to Mersenne Twister, but the C++11
 | 
			
		||||
      // MT implementation does not implement fast discard even though
 | 
			
		||||
      // in principle this is possible
 | 
			
		||||
      ////////////////////////////////////////////////
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
      int rank,o_idx,i_idx;
 | 
			
		||||
 | 
			
		||||
      // Everybody loops over global volume.
 | 
			
		||||
      for(int gidx=0;gidx<_grid->_gsites;gidx++){
 | 
			
		||||
 | 
			
		||||
	Skip(master_engine); // Skip to next RNG sequence
 | 
			
		||||
 | 
			
		||||
	// Where is it?
 | 
			
		||||
	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 | 
			
		||||
	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 | 
			
		||||
 | 
			
		||||
	// If this is one of mine we take it
 | 
			
		||||
	if( rank == _grid->ThisRank() ){
 | 
			
		||||
	  int l_idx=generator_idx(o_idx,i_idx);
 | 
			
		||||
	  _generators[l_idx] = master_engine;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
#else 
 | 
			
		||||
      ////////////////////////////////////////////////////////////////
 | 
			
		||||
      // Machine and thread decomposition dependent seeding is efficient
 | 
			
		||||
      // and maximally parallel; but NOT reproducible from machine to machine. 
 | 
			
		||||
      // Not ideal, but fastest way to reseed all nodes.
 | 
			
		||||
      ////////////////////////////////////////////////////////////////
 | 
			
		||||
      {
 | 
			
		||||
	// Obtain one Reseed per processor
 | 
			
		||||
	int Nproc = _grid->ProcessorCount();
 | 
			
		||||
	std::vector<RngEngine> seeders(Nproc);
 | 
			
		||||
	int me= _grid->ThisRank();
 | 
			
		||||
	for(int p=0;p<Nproc;p++){
 | 
			
		||||
	  seeders[p] = Reseed(master_engine);
 | 
			
		||||
	}
 | 
			
		||||
	master_engine = seeders[me];
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	// Obtain one reseeded generator per thread
 | 
			
		||||
	int Nthread = GridThread::GetThreads();
 | 
			
		||||
	std::vector<RngEngine> seeders(Nthread);
 | 
			
		||||
	for(int t=0;t<Nthread;t++){
 | 
			
		||||
	  seeders[t] = Reseed(master_engine);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	parallel_for(int t=0;t<Nthread;t++) {
 | 
			
		||||
	  // set up one per local site in threaded fashion
 | 
			
		||||
	  std::vector<uint32_t> newseeds;
 | 
			
		||||
	  std::uniform_int_distribution<uint32_t> uid;	
 | 
			
		||||
	  for(int l=0;l<_grid->lSites();l++) {
 | 
			
		||||
	    if ( (l%Nthread)==t ) {
 | 
			
		||||
	      _generators[l] = Reseed(seeders[t],newseeds,uid);
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void Report(){
 | 
			
		||||
@@ -400,31 +470,39 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Support for rigorous test of RNG's
 | 
			
		||||
    // Return uniform random uint32_t from requested site generator
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    uint32_t GlobalU01(int gsite){
 | 
			
		||||
 | 
			
		||||
      uint32_t the_number;
 | 
			
		||||
      // who
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
      int rank,o_idx,i_idx;
 | 
			
		||||
      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
 | 
			
		||||
      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 | 
			
		||||
 | 
			
		||||
      // draw
 | 
			
		||||
      int l_idx=generator_idx(o_idx,i_idx);
 | 
			
		||||
      if( rank == _grid->ThisRank() ){
 | 
			
		||||
	the_number = _uid[l_idx](_generators[l_idx]);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // share & return
 | 
			
		||||
      _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
 | 
			
		||||
      return the_number;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
 | 
			
		||||
    rng.fill(l,rng._uniform);
 | 
			
		||||
  }
 | 
			
		||||
  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
 | 
			
		||||
  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
 | 
			
		||||
  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
 | 
			
		||||
 | 
			
		||||
  template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l){
 | 
			
		||||
    rng.fill(l,rng._gaussian);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
 | 
			
		||||
    rng.fill(l,rng._bernoulli);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
 | 
			
		||||
    rng.fill(l,rng._uniform);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
 | 
			
		||||
    rng.fill(l,rng._gaussian);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
 | 
			
		||||
    rng.fill(l,rng._bernoulli);
 | 
			
		||||
  }
 | 
			
		||||
  template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
 | 
			
		||||
  template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
 | 
			
		||||
  template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -42,8 +42,7 @@ namespace Grid {
 | 
			
		||||
      -> Lattice<decltype(trace(lhs._odata[0]))>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
            ret._odata[ss] = trace(lhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
 
 | 
			
		||||
@@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
 | 
			
		||||
    half.checkerboard = cb;
 | 
			
		||||
    int ssh=0;
 | 
			
		||||
    //PARALLEL_FOR_LOOP
 | 
			
		||||
    //parallel_for
 | 
			
		||||
    for(int ss=0;ss<full._grid->oSites();ss++){
 | 
			
		||||
      std::vector<int> coor;
 | 
			
		||||
      int cbos;
 | 
			
		||||
@@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
 | 
			
		||||
    int cb = half.checkerboard;
 | 
			
		||||
    int ssh=0;
 | 
			
		||||
    //PARALLEL_FOR_LOOP
 | 
			
		||||
    //parallel_for
 | 
			
		||||
    for(int ss=0;ss<full._grid->oSites();ss++){
 | 
			
		||||
      std::vector<int> coor;
 | 
			
		||||
      int cbos;
 | 
			
		||||
@@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
 | 
			
		||||
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int sf=0;sf<fine->oSites();sf++){
 | 
			
		||||
  parallel_for(int sf=0;sf<fine->oSites();sf++){
 | 
			
		||||
    
 | 
			
		||||
    int sc;
 | 
			
		||||
    std::vector<int> coor_c(_ndimension);
 | 
			
		||||
@@ -186,8 +185,7 @@ template<class vobj,class CComplex>
 | 
			
		||||
 | 
			
		||||
  fine_inner = localInnerProduct(fineX,fineY);
 | 
			
		||||
  blockSum(coarse_inner,fine_inner);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<coarse->oSites();ss++){
 | 
			
		||||
  parallel_for(int ss=0;ss<coarse->oSites();ss++){
 | 
			
		||||
    CoarseInner._odata[ss] = coarse_inner._odata[ss];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -333,9 +331,6 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vvobj::scalar_object ssobj;
 | 
			
		||||
 | 
			
		||||
  sobj s;
 | 
			
		||||
  ssobj ss;
 | 
			
		||||
 | 
			
		||||
  GridBase *ig = in._grid;
 | 
			
		||||
  GridBase *og = out._grid;
 | 
			
		||||
 | 
			
		||||
@@ -347,10 +342,13 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 | 
			
		||||
  for(int d=0;d<no;d++){
 | 
			
		||||
    assert(ig->_processors[d]  == og->_processors[d]);
 | 
			
		||||
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
 | 
			
		||||
    assert(ig->lSites() == og->lSites());
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<ig->lSites();idx++){
 | 
			
		||||
  parallel_for(int idx=0;idx<ig->lSites();idx++){
 | 
			
		||||
    sobj s;
 | 
			
		||||
    ssobj ss;
 | 
			
		||||
 | 
			
		||||
    std::vector<int> lcoor(ni);
 | 
			
		||||
    ig->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    peekLocalSite(s,in,lcoor);
 | 
			
		||||
@@ -364,7 +362,6 @@ template<class vobj>
 | 
			
		||||
void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
@@ -386,17 +383,16 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  // Guido: check the threading here
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
  parallel_for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    sobj s;
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    dl=0;
 | 
			
		||||
    int ddl=0;
 | 
			
		||||
    hcoor[orthog] = slice;
 | 
			
		||||
    for(int d=0;d<nh;d++){
 | 
			
		||||
      if ( d!=orthog ) { 
 | 
			
		||||
	hcoor[d]=lcoor[dl++];
 | 
			
		||||
	hcoor[d]=lcoor[ddl++];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    peekLocalSite(s,lowDim,lcoor);
 | 
			
		||||
@@ -408,7 +404,6 @@ template<class vobj>
 | 
			
		||||
void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
@@ -429,16 +424,16 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
  parallel_for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    sobj s;
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    dl=0;
 | 
			
		||||
    int ddl=0;
 | 
			
		||||
    hcoor[orthog] = slice;
 | 
			
		||||
    for(int d=0;d<nh;d++){
 | 
			
		||||
      if ( d!=orthog ) { 
 | 
			
		||||
	hcoor[d]=lcoor[dl++];
 | 
			
		||||
	hcoor[d]=lcoor[ddl++];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    peekLocalSite(s,higherDim,hcoor);
 | 
			
		||||
@@ -452,7 +447,6 @@ template<class vobj>
 | 
			
		||||
void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
@@ -469,8 +463,8 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
  parallel_for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    sobj s;
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
@@ -488,7 +482,6 @@ template<class vobj>
 | 
			
		||||
void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj s;
 | 
			
		||||
 | 
			
		||||
  GridBase *lg = lowDim._grid;
 | 
			
		||||
  GridBase *hg = higherDim._grid;
 | 
			
		||||
@@ -505,8 +498,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  //PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
  parallel_for(int idx=0;idx<lg->lSites();idx++){
 | 
			
		||||
    sobj s;
 | 
			
		||||
    std::vector<int> lcoor(nl);
 | 
			
		||||
    std::vector<int> hcoor(nh);
 | 
			
		||||
    lg->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
@@ -574,8 +567,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
 | 
			
		||||
    in_grid->iCoorFromIindex(in_icoor[lane], lane);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
 | 
			
		||||
  parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
 | 
			
		||||
    //Assemble vector of pointers to output elements
 | 
			
		||||
    std::vector<sobj*> out_ptrs(in_nsimd);
 | 
			
		||||
 | 
			
		||||
@@ -623,8 +615,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 | 
			
		||||
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
 | 
			
		||||
  unvectorizeToLexOrdArray(in_slex_conv, in);
 | 
			
		||||
    
 | 
			
		||||
  PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
 | 
			
		||||
  parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
 | 
			
		||||
    std::vector<int> out_ocoor(ndim);
 | 
			
		||||
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
 | 
			
		||||
 | 
			
		||||
@@ -642,10 +633,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 | 
			
		||||
    merge(out._odata[out_oidx], ptrs, 0);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -40,27 +40,24 @@ namespace Grid {
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<class vobj>
 | 
			
		||||
    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
 | 
			
		||||
        Lattice<vobj> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
            ret._odata[ss] = transpose(lhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
    };
 | 
			
		||||
    Lattice<vobj> ret(lhs._grid);
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      ret._odata[ss] = transpose(lhs._odata[ss]);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  };
 | 
			
		||||
    
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Index level dependent transpose
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<int Index,class vobj>
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Index level dependent transpose
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<int Index,class vobj>
 | 
			
		||||
    inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
            ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
 | 
			
		||||
        }
 | 
			
		||||
        return ret;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 | 
			
		||||
    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
  };
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -37,8 +37,7 @@ namespace Grid {
 | 
			
		||||
    Lattice<obj> ret(rhs._grid);
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
      ret._odata[ss]=pow(rhs._odata[ss],y);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
@@ -47,8 +46,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    Lattice<obj> ret(rhs._grid);
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
      ret._odata[ss]=mod(rhs._odata[ss],y);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
@@ -58,8 +56,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    Lattice<obj> ret(rhs._grid);
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
      ret._odata[ss]=div(rhs._odata[ss],y);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
@@ -69,8 +66,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    Lattice<obj> ret(rhs._grid);
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
 | 
			
		||||
    }
 | 
			
		||||
    return ret;
 | 
			
		||||
 
 | 
			
		||||
@@ -56,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
 | 
			
		||||
  std::vector<scalar_object> truevals (Nsimd);
 | 
			
		||||
  std::vector<scalar_object> falsevals(Nsimd);
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<iftrue._grid->oSites(); ss++){
 | 
			
		||||
  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
 | 
			
		||||
 | 
			
		||||
    extract(iftrue._odata[ss]   ,truevals);
 | 
			
		||||
    extract(iffalse._odata[ss]  ,falsevals);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user