Grid/lib/Simd.h

    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/Simd.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H

////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar:   RealF, RealD, ComplexF, ComplexD
//
// Vector:  vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
// Immediate-operand builders.
//
// _MM_SELECT_FOUR_FOUR  : packs four 2-bit fields, A in bits 7:6 down to D in bits 1:0.
// _MM_SELECT_FOUR_FOUR_STRING : the same expression as a string literal (unevaluated text).
// _MM_SELECT_EIGHT_TWO  : packs eight 1-bit fields, A in bit 7 down to H in bit 0.
// _MM_SELECT_FOUR_TWO   : four 1-bit fields in the low nibble (upper four fields zero).
// _MM_SELECT_TWO_TWO    : two 1-bit fields in the low two bits.
//
// Every argument is parenthesised so that expressions such as (x|y) bind
// correctly against the shift operators.
//
// Fixes relative to the previous definitions:
//   * G is shifted by 1 (it was shifted by 4 and therefore collided with D).
//   * No whitespace between the macro name and '(' for FOUR_TWO / TWO_TWO;
//     with the space they were object-like macros and could not be invoked.
////////////////////////////////////////////////////////////////////////
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) (((A)<<6)|((B)<<4)|((C)<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) (((A)<<7)|((B)<<6)|((C)<<5)|((D)<<4)|((E)<<3)|((F)<<2)|((G)<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B)      _MM_SELECT_FOUR_TWO(0,0,A,B)

// Flag value 0x100, i.e. one bit above the 8-bit range produced by the
// selectors above. NOTE(review): presumably OR-ed into a permute code to
// request a rotate instead of a permute -- confirm against the users.
#define RotateBit (0x100)

namespace Grid {

  ////////////////////////////////////////////////////////////////////////////////
  // Scalar type names.
  // Real / Complex follow GRID_DEFAULT_PRECISION_DOUBLE: double precision when
  // the macro is defined, single precision otherwise.
  ////////////////////////////////////////////////////////////////////////////////
  typedef uint32_t Integer;

  typedef float  RealF;
  typedef double RealD;
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef RealD Real;
#else
  typedef RealF Real;
#endif

  typedef std::complex<RealF> ComplexF;
  typedef std::complex<RealD> ComplexD;
  typedef std::complex<Real>  Complex;

  ////////////////////////////////////////////////////////////////////////////////
  // adj / conjugate / real on real scalars all hand the argument back unchanged.
  ////////////////////////////////////////////////////////////////////////////////
  inline RealF adj(const RealF &r)       { return r; }
  inline RealF conjugate(const RealF &r) { return r; }
  inline RealF real(const RealF &r)      { return r; }

  inline RealD adj(const RealD &r)       { return r; }
  inline RealD conjugate(const RealD &r) { return r; }
  inline RealD real(const RealD &r)      { return r; }

  // Double precision square root, forwarded to std::sqrt.
  inline RealD sqrt(const RealD &r) { return std::sqrt(r); }

  ////////////////////////////////////////////////////////////////////////////////
  // On complex scalars conjugate and adj are both the complex conjugate.
  ////////////////////////////////////////////////////////////////////////////////
  inline ComplexD conjugate(const ComplexD &r) { return std::conj(r); }
  inline ComplexD adj(const ComplexD &r)       { return std::conj(r); }
  inline ComplexF conjugate(const ComplexF &r) { return std::conj(r); }
  inline ComplexF adj(const ComplexF &r)       { return std::conj(r); }

  ////////////////////////////////////////////////////////////////////////////////
  // innerProduct(l,r): conj(l)*r for complex arguments, l*r for real ones.
  ////////////////////////////////////////////////////////////////////////////////
  inline ComplexD innerProduct(const ComplexD &l, const ComplexD &r) { return std::conj(l) * r; }
  inline ComplexF innerProduct(const ComplexF &l, const ComplexF &r) { return std::conj(l) * r; }
  inline RealD    innerProduct(const RealD &l, const RealD &r)       { return l * r; }
  inline RealF    innerProduct(const RealF &l, const RealF &r)       { return l * r; }

  // Reduce of a scalar is the scalar itself.
  inline ComplexD Reduce(const ComplexD &r) { return r; }
  inline ComplexF Reduce(const ComplexF &r) { return r; }
  inline RealD    Reduce(const RealD &r)    { return r; }
  inline RealF    Reduce(const RealF &r)    { return r; }

  // toReal: real part of a complex scalar; a real scalar passes through.
  inline RealD toReal(const ComplexD &r) { return r.real(); }
  inline RealF toReal(const ComplexF &r) { return r.real(); }
  inline RealD toReal(const RealD &r)    { return r; }
  inline RealF toReal(const RealF &r)    { return r; }

  ////////////////////////////////////////////////////////////////////////////////
  // Pointer-based arithmetic kernels for the scalar types, in single and double
  // precision:
  //   mac  : *y = (*a)*(*x) + (*y)
  //   mult : *y = (*l)*(*r)
  //   sub  : *y = (*l)-(*r)
  //   add  : *y = (*l)+(*r)
  // All pointers are declared __restrict__.
  ////////////////////////////////////////////////////////////////////////////////
  inline void mac(ComplexD *__restrict__ y, const ComplexD *__restrict__ a, const ComplexD *__restrict__ x)
  {
    *y = (*a) * (*x) + (*y);
  }
  inline void mult(ComplexD *__restrict__ y, const ComplexD *__restrict__ l, const ComplexD *__restrict__ r)
  {
    *y = (*l) * (*r);
  }
  inline void sub(ComplexD *__restrict__ y, const ComplexD *__restrict__ l, const ComplexD *__restrict__ r)
  {
    *y = (*l) - (*r);
  }
  inline void add(ComplexD *__restrict__ y, const ComplexD *__restrict__ l, const ComplexD *__restrict__ r)
  {
    *y = (*l) + (*r);
  }

  inline void mac(ComplexF *__restrict__ y, const ComplexF *__restrict__ a, const ComplexF *__restrict__ x)
  {
    *y = (*a) * (*x) + (*y);
  }
  inline void mult(ComplexF *__restrict__ y, const ComplexF *__restrict__ l, const ComplexF *__restrict__ r)
  {
    *y = (*l) * (*r);
  }
  inline void sub(ComplexF *__restrict__ y, const ComplexF *__restrict__ l, const ComplexF *__restrict__ r)
  {
    *y = (*l) - (*r);
  }
  inline void add(ComplexF *__restrict__ y, const ComplexF *__restrict__ l, const ComplexF *__restrict__ r)
  {
    *y = (*l) + (*r);
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Multiplication by +i and -i, as a returned value and as an output argument.
  ////////////////////////////////////////////////////////////////////////////////
  inline ComplexF timesI(const ComplexF &r)      { return r * ComplexF(0.0, 1.0); }
  inline ComplexD timesI(const ComplexD &r)      { return r * ComplexD(0.0, 1.0); }
  inline ComplexF timesMinusI(const ComplexF &r) { return r * ComplexF(0.0, -1.0); }
  inline ComplexD timesMinusI(const ComplexD &r) { return r * ComplexD(0.0, -1.0); }

  inline void timesI(ComplexF &ret, const ComplexF &r)      { ret = timesI(r); }
  inline void timesI(ComplexD &ret, const ComplexD &r)      { ret = timesI(r); }
  inline void timesMinusI(ComplexF &ret, const ComplexF &r) { ret = timesMinusI(r); }
  inline void timesMinusI(ComplexD &ret, const ComplexD &r) { ret = timesMinusI(r); }

  inline void mac(RealD *__restrict__ y, const RealD *__restrict__ a, const RealD *__restrict__ x)
  {
    *y = (*a) * (*x) + (*y);
  }
  inline void mult(RealD *__restrict__ y, const RealD *__restrict__ l, const RealD *__restrict__ r)
  {
    *y = (*l) * (*r);
  }
  inline void sub(RealD *__restrict__ y, const RealD *__restrict__ l, const RealD *__restrict__ r)
  {
    *y = (*l) - (*r);
  }
  inline void add(RealD *__restrict__ y, const RealD *__restrict__ l, const RealD *__restrict__ r)
  {
    *y = (*l) + (*r);
  }

  inline void mac(RealF *__restrict__ y, const RealF *__restrict__ a, const RealF *__restrict__ x)
  {
    *y = (*a) * (*x) + (*y);
  }
  inline void mult(RealF *__restrict__ y, const RealF *__restrict__ l, const RealF *__restrict__ r)
  {
    *y = (*l) * (*r);
  }
  inline void sub(RealF *__restrict__ y, const RealF *__restrict__ l, const RealF *__restrict__ r)
  {
    *y = (*l) - (*r);
  }
  inline void add(RealF *__restrict__ y, const RealF *__restrict__ l, const RealF *__restrict__ r)
  {
    *y = (*l) + (*r);
  }

  // vstream on scalars is an ordinary assignment l = r.
  inline void vstream(ComplexF &l, const ComplexF &r) { l = r; }
  inline void vstream(ComplexD &l, const ComplexD &r) { l = r; }
  inline void vstream(RealF &l, const RealF &r)       { l = r; }
  inline void vstream(RealD &l, const RealD &r)       { l = r; }

  ////////////////////////////////////////////////////////////////////////////////
  // Zero is an empty tag class and `zero` its per-translation-unit instance.
  // zeroit(arg) assigns `zero` to arg in the generic case; the scalar
  // specialisations assign the literal 0 instead.
  ////////////////////////////////////////////////////////////////////////////////
  class Zero {};
  static Zero zero;

  template<class itype> inline void zeroit(itype &arg) { arg = zero; }
  template<> inline void zeroit(ComplexF &arg) { arg = 0; }
  template<> inline void zeroit(ComplexD &arg) { arg = 0; }
  template<> inline void zeroit(RealF &arg)    { arg = 0; }
  template<> inline void zeroit(RealD &arg)    { arg = 0; }

  //////////////////////////////////////////////////////////
  // Permute (declaration only; the definition lives in the
  // SIMD specialization files).
  //
  //   Permute 0 : ABCDEFGH -> BA DC FE HG
  //   Permute 1 : ABCDEFGH -> CD AB GH EF
  //   Permute 2 : ABCDEFGH -> EFGH ABCD
  //   Permute 3 : possible on longer iVector lengths (512bit = 8 double = 16 single)
  //   Permute 4 : possible on half precision @512bit vectors
  //////////////////////////////////////////////////////////
  template<class VectorSIMD>
  inline void Gpermute(VectorSIMD &y, const VectorSIMD &b, int perm);

} // namespace Grid

#include <simd/Grid_vector_types.h>
#include <simd/Grid_vector_unops.h>

namespace Grid {
  // Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef vRealD vReal;
  typedef vComplexD vComplex;
#else
  typedef vRealF vReal;
  typedef vComplexF vComplex;
#endif

 
  inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
    int nn=vComplexF::Nsimd();
    std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }
 
  inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
    int nn=vComplexD::Nsimd();
    std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
    int nn=vRealF::Nsimd();
    std::vector<RealF,alignedAllocator<RealF> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }

  inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){
    int nn=vRealD::Nsimd();
    std::vector<RealD,alignedAllocator<RealD> > buf(nn);
    vstore(o,&buf[0]);
    stream<<"<";
    for(int i=0;i<nn;i++){
      stream<<buf[i];
      if(i<nn-1) stream<<",";
    }
    stream<<">";
    return stream;
  }


}
#endif
Global edit adding copyright and license info to every source file. 2016-01-02 14:51:32 +00:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./lib/Simd.h`

			`Copyright (C) 2015`

			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`
			`Author: neo <cossu@post.kek.jp>`
			`Author: paboyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
Better organisation 2015-03-04 05:12:19 +00:00			`#ifndef GRID_SIMD_H`
			`#define GRID_SIMD_H`

			`////////////////////////////////////////////////////////////////////////`
			`// Define scalar and vector floating point types`
			`//`
			`// Scalar: RealF, RealD, ComplexF, ComplexD`
			`//`
			`// Vector: vRealF, vRealD, vComplexF, vComplexD`
			`//`
			`// Vector types are arch dependent`
			`////////////////////////////////////////////////////////////////////////`
ICPC and GCC5 fixes 2015-05-15 11:35:02 +01:00
Bringing in LatticeInteger with the idea of implemented predicated assignment, subsets etc. c.f the QDP++ "where" syntax 2015-04-06 06:30:48 +01:00
Updating to modify non-inlining permute routines and hopefully get better reg use and enhance performance. 2015-09-25 16:55:04 +01:00			`#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)\|(B<<4)\|(C<<2)\|(D))`
Avx512 changes for assembler kernels 2016-03-27 05:24:07 +01:00			`#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)\|(" #B "<<4)\|(" #C "<<2)\|(" #D "))"`
Updating to modify non-inlining permute routines and hopefully get better reg use and enhance performance. 2015-09-25 16:55:04 +01:00			`#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)\|(B<<6)\|(C<<5)\|(D<<4)\|(E<<3)\|(F<<2)\|(G<<4)\|(H))`
			`#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)`
			`#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)`

Rotate is a temporary hack. Would like to merge ALL permutes as rotates of length 2, and make any rotate active over any subset of lane bits. This is hard, and requires general permute; current intrinsics mean this is only really possible for specific case by case encodings as presently performed. Intel could produce a general permute.. would help. IBM did it in VMX. 2016-04-19 23:15:34 +01:00			`#define RotateBit (0x100)`

Renamed the namespace to Grid 2015-04-03 05:29:54 +01:00			`namespace Grid {`
Better organisation 2015-03-04 05:12:19 +00:00
Fix a regression failure on Mobius; chroma regression added 2015-12-10 22:55:00 +00:00			`typedef uint32_t Integer;`

Better organisation 2015-03-04 05:12:19 +00:00			`typedef float RealF;`
			`typedef double RealD;`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`#ifdef GRID_DEFAULT_PRECISION_DOUBLE`
			`typedef RealD Real;`
			`#else`
			`typedef RealF Real;`
			`#endif`

Better organisation 2015-03-04 05:12:19 +00:00			`typedef std::complex<RealF> ComplexF;`
			`typedef std::complex<RealD> ComplexD;`
Big updates with progress towards wilson matrix 2015-04-26 15:51:09 +01:00			`typedef std::complex<Real> Complex;`
Fixing the Checkerboarding cshift. Implemented "fake" communications in preparation for the leap to MPI. 2015-03-29 20:35:37 +01:00
Better organisation 2015-03-04 05:31:44 +00:00			`inline RealF adj(const RealF & r){ return r; }`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`inline RealF conjugate(const RealF & r){ return r; }`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`inline RealF real(const RealF & r){ return r; }`

			`inline RealD adj(const RealD & r){ return r; }`
Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`inline RealD conjugate(const RealD & r){ return r; }`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`inline RealD real(const RealD & r){ return r; }`

Conjugate residual algorithm; some more unary functions 2015-06-08 12:04:59 +01:00			`inline RealD sqrt(const RealD & r){ return std::sqrt(r); }`

Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. 2015-05-19 13:57:35 +01:00			`inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }`
			`inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }`
			`inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }`
			`inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }`

			`inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }`
			`inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }`
Reduce now going through MPI. 2015-04-14 22:40:40 +01:00			`inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }`
			`inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }`
Lattice matrix exponential ok 2015-06-17 12:41:07 +01:00
			`inline ComplexD Reduce(const ComplexD& r){ return r; }`
			`inline ComplexF Reduce(const ComplexF& r){ return r; }`
			`inline RealD Reduce(const RealD& r){ return r; }`
			`inline RealF Reduce(const RealF& r){ return r; }`

			`inline RealD toReal(const ComplexD& r){ return real(r); }`
			`inline RealF toReal(const ComplexF& r){ return real(r); }`
			`inline RealD toReal(const RealD& r){ return r; }`
			`inline RealF toReal(const RealF& r){ return r; }`


More cleanup of Grid_simd.h 2015-05-26 05:54:34 +01:00
			`////////////////////////////////////////////////////////////////////////////////`
			`//Provide support functions for basic real and complex data types required by Grid`
			`//Single and double precision versions. Should be able to template this once only.`
			`////////////////////////////////////////////////////////////////////////////////`
			`inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };`
			`inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}`
			`inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}`
			`inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}`
			`// conjugate already supported for complex`

			`inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }`
			`inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }`
			`inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }`
			`inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }`

			`//conjugate already supported for complex`

			`inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}`
			`inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}`
			`inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}`
			`inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}`
			`inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}`
			`inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}`
			`inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}`
			`inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}`

			`inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}`
			`inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}`
			`inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}`
			`inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}`

			`inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }`
			`inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }`
			`inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }`
			`inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }`

			`inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}`
			`inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}`
			`inline void vstream(RealF &l, const RealF &r){ l=r;}`
			`inline void vstream(RealD &l, const RealD &r){ l=r;}`


Better organisation 2015-03-04 05:12:19 +00:00			`class Zero{};`
			`static Zero zero;`
Improving the trace support to support any index tracing and simplifying implmentation in some ways 2015-04-16 14:47:28 +01:00			`template<class itype> inline void zeroit(itype &arg){ arg=zero;};`
			`template<> inline void zeroit(ComplexF &arg){ arg=0; };`
			`template<> inline void zeroit(ComplexD &arg){ arg=0; };`
			`template<> inline void zeroit(RealF &arg){ arg=0; };`
			`template<> inline void zeroit(RealD &arg){ arg=0; };`
More cleanup of Grid_simd.h 2015-05-26 05:54:34 +01:00
Included Gpermute in the new Grid_simd.h file style. Now tested for SSE4. OK 2015-05-27 04:11:44 +01:00
			`//////////////////////////////////////////////////////////`
			`// Permute`
			`// Permute 0 every ABCDEFGH -> BA DC FE HG`
			`// Permute 1 every ABCDEFGH -> CD AB GH EF`
			`// Permute 2 every ABCDEFGH -> EFGH ABCD`
			`// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)`
			`// Permute 4 possible on half precision @512bit vectors.`
			`//`
			`// Defined inside SIMD specialization files`
			`//////////////////////////////////////////////////////////`
			`template<class VectorSIMD>`
			`inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);`

Major rework of extract/merge/permute processing debugged and working. 2015-04-06 11:26:24 +01:00			`};`
Bringing in LatticeInteger with the idea of implemented predicated assignment, subsets etc. c.f the QDP++ "where" syntax 2015-04-06 06:30:48 +01:00
Completed implementation of new Grid_simd classes Tested performance for SSE4, Ok. AVX1/2, AVX512 yet untested 2015-05-22 09:33:15 +01:00			`#include <simd/Grid_vector_types.h>`
Conjugate residual algorithm; some more unary functions 2015-06-08 12:04:59 +01:00			`#include <simd/Grid_vector_unops.h>`
Better organisation 2015-03-04 05:12:19 +00:00
Stencil code pretty much shaken out. Beginning of inner product and norm2. 2015-04-14 20:22:04 +01:00			`namespace Grid {`
			`// Default precision`
Some bug fixes 2015-04-14 23:20:16 +01:00			`#ifdef GRID_DEFAULT_PRECISION_DOUBLE`
Stencil code pretty much shaken out. Beginning of inner product and norm2. 2015-04-14 20:22:04 +01:00			`typedef vRealD vReal;`
			`typedef vComplexD vComplex;`
Some bug fixes 2015-04-14 23:20:16 +01:00			`#else`
			`typedef vRealF vReal;`
			`typedef vComplexF vComplex;`
			`#endif`
cout IO for all types 2015-05-13 09:24:10 +01:00

			`inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){`
			`int nn=vComplexF::Nsimd();`
			`std::vector<ComplexF,alignedAllocator<ComplexF> > buf(nn);`
			`vstore(o,&buf[0]);`
			`stream<<"<";`
			`for(int i=0;i<nn;i++){`
			`stream<<buf[i];`
			`if(i<nn-1) stream<<",";`
			`}`
			`stream<<">";`
			`return stream;`
			`}`

			`inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){`
			`int nn=vComplexD::Nsimd();`
			`std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);`
			`vstore(o,&buf[0]);`
			`stream<<"<";`
			`for(int i=0;i<nn;i++){`
			`stream<<buf[i];`
			`if(i<nn-1) stream<<",";`
			`}`
			`stream<<">";`
			`return stream;`
			`}`

			`inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){`
			`int nn=vRealF::Nsimd();`
			`std::vector<RealF,alignedAllocator<RealF> > buf(nn);`
			`vstore(o,&buf[0]);`
			`stream<<"<";`
			`for(int i=0;i<nn;i++){`
			`stream<<buf[i];`
			`if(i<nn-1) stream<<",";`
			`}`
			`stream<<">";`
			`return stream;`
			`}`

			`inline std::ostream& operator<< (std::ostream& stream, const vRealD &o){`
			`int nn=vRealD::Nsimd();`
			`std::vector<RealD,alignedAllocator<RealD> > buf(nn);`
			`vstore(o,&buf[0]);`
			`stream<<"<";`
			`for(int i=0;i<nn;i++){`
			`stream<<buf[i];`
			`if(i<nn-1) stream<<",";`
			`}`
			`stream<<">";`
			`return stream;`
			`}`


Stencil code pretty much shaken out. Beginning of inner product and norm2. 2015-04-14 20:22:04 +01:00			`}`
Better organisation 2015-03-04 05:12:19 +00:00			`#endif`