mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Big updates with progress towards wilson matrix
This commit is contained in:
		
							
								
								
									
										18
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								TODO
									
									
									
									
									
								
							@@ -2,6 +2,10 @@
 | 
			
		||||
  - use protocol buffers? replace xmlReader/Writer ec..
 | 
			
		||||
  - Binary use htonll, htonl
 | 
			
		||||
 | 
			
		||||
* Reduce implemention is poor
 | 
			
		||||
* Bug in SeedFixedIntegers gives same output on each site.
 | 
			
		||||
* Bug in RNG with complex numbers ; only filling real values; need helper function -- DONE
 | 
			
		||||
 | 
			
		||||
* Stencil operator support                           -----Initial thoughts, trial implementation DONE.
 | 
			
		||||
                                                     -----some simple tests that Stencil matches Cshift.
 | 
			
		||||
                                                     -----do all permute in comms phase, so that copy permute
 | 
			
		||||
@@ -11,6 +15,7 @@
 | 
			
		||||
 | 
			
		||||
* CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
 | 
			
		||||
 | 
			
		||||
* Strong test for norm2, conj and all primitive types.
 | 
			
		||||
 | 
			
		||||
* Consider switch std::vector to boost arrays or something lighter weight
 | 
			
		||||
  boost::multi_array<type, 3> A()...    to replace multi1d, multi2d etc..
 | 
			
		||||
@@ -33,10 +38,21 @@
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
* Make the Tensor types and Complex etc... play more nicely.
 | 
			
		||||
* TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
 | 
			
		||||
 | 
			
		||||
  - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
 | 
			
		||||
  QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
 | 
			
		||||
  want to introduce a syntax that does not require this.
 | 
			
		||||
 | 
			
		||||
  - Reductions that contract indices on a site should always demote the tensor structure.
 | 
			
		||||
    norm2(), innerProduct.
 | 
			
		||||
 | 
			
		||||
  - Result of Sum(), SliceSum // spatial sums
 | 
			
		||||
              trace, traceIndex etc.. do not.
 | 
			
		||||
 | 
			
		||||
  - problem arises because "trace" returns Lattice<TComplex> moving everything down to Scalar,
 | 
			
		||||
    and then Sum and SliceSum to not remove the Scalars. This would be fixed if we 
 | 
			
		||||
    template specialize the scalar scalar scalar sum and SliceSum,  on the basis of being
 | 
			
		||||
    pure scalar.
 | 
			
		||||
 | 
			
		||||
* Optimise the extract/merge SIMD routines; Azusa??
 | 
			
		||||
 - I have collated into single location at least.
 | 
			
		||||
 
 | 
			
		||||
@@ -11,6 +11,7 @@
 | 
			
		||||
#define GRID_H
 | 
			
		||||
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
 | 
			
		||||
#include <complex>
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <iostream>
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
/* lib/Grid_config.h.in.  Generated from configure.ac by autoheader.  */
 | 
			
		||||
 | 
			
		||||
/* AVX */
 | 
			
		||||
#define AVX1 1
 | 
			
		||||
/* #undef AVX1 */
 | 
			
		||||
 | 
			
		||||
/* AVX2 */
 | 
			
		||||
/* #undef AVX2 */
 | 
			
		||||
@@ -77,7 +77,7 @@
 | 
			
		||||
#define PACKAGE_VERSION "1.0"
 | 
			
		||||
 | 
			
		||||
/* SSE4 */
 | 
			
		||||
/* #undef SSE4 */
 | 
			
		||||
#define SSE4 1
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the ANSI C header files. */
 | 
			
		||||
#define STDC_HEADERS 1
 | 
			
		||||
 
 | 
			
		||||
@@ -26,10 +26,15 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef  float  RealF;
 | 
			
		||||
  typedef  double RealD;
 | 
			
		||||
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 | 
			
		||||
  typedef RealD   Real;
 | 
			
		||||
#else
 | 
			
		||||
  typedef RealF  Real;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  typedef std::complex<RealF> ComplexF;
 | 
			
		||||
  typedef std::complex<RealD> ComplexD;
 | 
			
		||||
 | 
			
		||||
  typedef std::complex<Real>  Complex;
 | 
			
		||||
 | 
			
		||||
  inline RealF adj(const RealF  & r){ return r; }
 | 
			
		||||
  inline RealF conj(const RealF  & r){ return r; }
 | 
			
		||||
@@ -63,8 +68,8 @@ namespace Grid {
 | 
			
		||||
    //conj already supported for complex
 | 
			
		||||
 | 
			
		||||
    inline ComplexF timesI(const ComplexF r)     { return(r*ComplexF(0.0,1.0));}
 | 
			
		||||
    inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));}
 | 
			
		||||
    inline ComplexD timesI(const ComplexD r)     { return(r*ComplexD(0.0,1.0));}
 | 
			
		||||
    inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));}
 | 
			
		||||
    inline ComplexD timesMinusI(const ComplexD r){ return(r*ComplexD(0.0,-1.0));}
 | 
			
		||||
 | 
			
		||||
    inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){  *y = (*a) * (*x)+(*y);}
 | 
			
		||||
@@ -280,15 +285,11 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  // Default precision
 | 
			
		||||
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
 | 
			
		||||
  typedef RealD   Real;
 | 
			
		||||
  typedef vRealD vReal;
 | 
			
		||||
  typedef vComplexD vComplex;
 | 
			
		||||
  typedef std::complex<Real>  Complex;
 | 
			
		||||
#else
 | 
			
		||||
  typedef RealF  Real;
 | 
			
		||||
  typedef vRealF vReal;
 | 
			
		||||
  typedef vComplexF vComplex;
 | 
			
		||||
  typedef std::complex<Real>  Complex;
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -47,6 +47,101 @@ namespace Grid {
 | 
			
		||||
    int from_rank;
 | 
			
		||||
  } ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there is no need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj,class cobj,class compressor> void 
 | 
			
		||||
Gather_plane_simple (Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	buffer[bo++]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                      // offset in buffer
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
	if ( ocb &cbmask ) {
 | 
			
		||||
	  buffer[bo]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
	  bo++;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there *is* need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class cobj,class vobj,class compressor> void 
 | 
			
		||||
  Gather_plane_extract(Lattice<vobj> &rhs,std::vector<typename cobj::scalar_type *> pointers,int dimension,int plane,int cbmask,compressor &compress)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	cobj temp;
 | 
			
		||||
	temp=compress(rhs._odata[so+o+b]);
 | 
			
		||||
	extract(temp,pointers);
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                      // offset in buffer
 | 
			
		||||
    
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  cobj temp; 
 | 
			
		||||
	  temp =compress(rhs._odata[so+o+b]);
 | 
			
		||||
	  extract(temp,pointers);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
@@ -86,8 +181,8 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      // Could allow a functional munging of the halo to another type during the comms.
 | 
			
		||||
      // this could implement the 16bit/32bit/64bit compression.
 | 
			
		||||
      template<class vobj> void HaloExchange(Lattice<vobj> &source,
 | 
			
		||||
					     std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
 | 
			
		||||
      template<class vobj,class cobj, class compressor> void 
 | 
			
		||||
	HaloExchange(Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
 | 
			
		||||
      {
 | 
			
		||||
	// conformable(source._grid,_grid);
 | 
			
		||||
	assert(source._grid==_grid);
 | 
			
		||||
@@ -95,12 +190,10 @@ namespace Grid {
 | 
			
		||||
	int u_comm_offset=0;
 | 
			
		||||
 | 
			
		||||
	// Gather all comms buffers
 | 
			
		||||
	typedef typename vobj::vector_type vector_type;
 | 
			
		||||
	typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
 | 
			
		||||
	for(int point = 0 ; point < _npoints; point++) {
 | 
			
		||||
 | 
			
		||||
	  printf("Point %d \n",point);fflush(stdout);
 | 
			
		||||
	  compress.Point(point);
 | 
			
		||||
 | 
			
		||||
	  int dimension    = _directions[point];
 | 
			
		||||
	  int displacement = _distances[point];
 | 
			
		||||
	  
 | 
			
		||||
@@ -126,33 +219,30 @@ namespace Grid {
 | 
			
		||||
	    sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
 | 
			
		||||
	    if ( sshift[0] == sshift[1] ) {
 | 
			
		||||
	      if (splice_dim) {
 | 
			
		||||
		printf("splice 0x3 \n");fflush(stdout);
 | 
			
		||||
		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
 | 
			
		||||
		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
			
		||||
	      } else { 
 | 
			
		||||
		printf("NO splice 0x3 \n");fflush(stdout);
 | 
			
		||||
		GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
 | 
			
		||||
		GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
			
		||||
	      }
 | 
			
		||||
	    } else {
 | 
			
		||||
	      if(splice_dim){
 | 
			
		||||
		printf("splice 0x1,2 \n");fflush(stdout);
 | 
			
		||||
		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
 | 
			
		||||
		GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
 | 
			
		||||
		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
 | 
			
		||||
		GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
 | 
			
		||||
	      } else {
 | 
			
		||||
		printf("NO splice 0x1,2 \n");fflush(stdout);
 | 
			
		||||
		GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
 | 
			
		||||
		GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
 | 
			
		||||
		GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
 | 
			
		||||
		GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
 | 
			
		||||
	      }
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
			
		||||
						 std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
 | 
			
		||||
						 int &u_comm_offset)
 | 
			
		||||
      template<class vobj,class cobj, class compressor> 
 | 
			
		||||
        void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
			
		||||
			      std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
			
		||||
			      int &u_comm_offset,compressor & compress)
 | 
			
		||||
	{
 | 
			
		||||
	  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
	  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
	  typedef typename cobj::vector_type vector_type;
 | 
			
		||||
	  typedef typename cobj::scalar_type scalar_type;
 | 
			
		||||
	  
 | 
			
		||||
	  GridBase *grid=_grid;
 | 
			
		||||
	  assert(rhs._grid==_grid);
 | 
			
		||||
@@ -169,31 +259,26 @@ namespace Grid {
 | 
			
		||||
	  
 | 
			
		||||
	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
			
		||||
	  
 | 
			
		||||
	  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
 | 
			
		||||
	  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
 | 
			
		||||
	  std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
 | 
			
		||||
	  std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
 | 
			
		||||
	  
 | 
			
		||||
	  int cb= (cbmask==0x2)? 1 : 0;
 | 
			
		||||
	  int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
 | 
			
		||||
	  
 | 
			
		||||
	  for(int x=0;x<rd;x++){       
 | 
			
		||||
	    
 | 
			
		||||
	    printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
 | 
			
		||||
	    int offnode = ( x+sshift >= rd );
 | 
			
		||||
	    int sx        = (x+sshift)%rd;
 | 
			
		||||
	    int comm_proc = (x+sshift)/rd;
 | 
			
		||||
	    
 | 
			
		||||
	    if (offnode) {
 | 
			
		||||
	      
 | 
			
		||||
	      printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
 | 
			
		||||
	      int words = send_buf.size();
 | 
			
		||||
	      if (cbmask != 0x3) words=words>>1;
 | 
			
		||||
	    
 | 
			
		||||
	      int bytes = words * sizeof(vobj);
 | 
			
		||||
	      int bytes = words * sizeof(cobj);
 | 
			
		||||
 | 
			
		||||
	      printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
 | 
			
		||||
	      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
 | 
			
		||||
 | 
			
		||||
	      printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
 | 
			
		||||
	      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
 | 
			
		||||
 | 
			
		||||
	      int rank           = _grid->_processor;
 | 
			
		||||
	      int recv_from_rank;
 | 
			
		||||
@@ -219,14 +304,14 @@ namespace Grid {
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      template<class vobj>
 | 
			
		||||
      template<class vobj,class cobj, class compressor> 
 | 
			
		||||
	void  GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
			
		||||
				   std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
 | 
			
		||||
				   int &u_comm_offset)
 | 
			
		||||
				   std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
			
		||||
				   int &u_comm_offset,compressor &compress)
 | 
			
		||||
	{
 | 
			
		||||
	  const int Nsimd = _grid->Nsimd();
 | 
			
		||||
	  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
	  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
	  typedef typename cobj::vector_type vector_type;
 | 
			
		||||
	  typedef typename cobj::scalar_type scalar_type;
 | 
			
		||||
	  
 | 
			
		||||
	  int fd = _grid->_fdimensions[dimension];
 | 
			
		||||
	  int rd = _grid->_rdimensions[dimension];
 | 
			
		||||
@@ -245,7 +330,7 @@ namespace Grid {
 | 
			
		||||
	  // Simd direction uses an extract/merge pair
 | 
			
		||||
	  ///////////////////////////////////////////////
 | 
			
		||||
	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
			
		||||
	  int words = sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
	  int words = sizeof(cobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
	  /*   FIXME ALTERNATE BUFFER DETERMINATION */
 | 
			
		||||
	  std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) ); 
 | 
			
		||||
@@ -285,7 +370,7 @@ namespace Grid {
 | 
			
		||||
	      for(int i=0;i<Nsimd;i++){
 | 
			
		||||
		pointers[Nsimd-1-i] = (scalar_type *)&send_buf_extract[i][0];
 | 
			
		||||
	      }
 | 
			
		||||
	      Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
 | 
			
		||||
	      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
 | 
			
		||||
	      
 | 
			
		||||
	      for(int i=0;i<Nsimd;i++){
 | 
			
		||||
		
 | 
			
		||||
 
 | 
			
		||||
@@ -18,6 +18,7 @@ libGrid_a_SOURCES =\
 | 
			
		||||
	 Grid_init.cc\
 | 
			
		||||
	stencil/Grid_stencil_common.cc\
 | 
			
		||||
	qcd/Grid_qcd_dirac.cc\
 | 
			
		||||
	qcd/Grid_qcd_wilson_dop.cc\
 | 
			
		||||
	$(extra_sources)
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
 
 | 
			
		||||
@@ -79,18 +79,18 @@ namespace Grid {
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  template<class left,class right>
 | 
			
		||||
    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
 | 
			
		||||
    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
 | 
			
		||||
  {
 | 
			
		||||
    //NB mult performs conformable check. Do not reapply here for performance.
 | 
			
		||||
    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
    Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
    add(ret,lhs,rhs);
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  template<class left,class right>
 | 
			
		||||
    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
 | 
			
		||||
    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
 | 
			
		||||
  {
 | 
			
		||||
    //NB mult performs conformable check. Do not reapply here for performance.
 | 
			
		||||
    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
    Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
    sub(ret,lhs,rhs);
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
@@ -107,9 +107,9 @@ namespace Grid {
 | 
			
		||||
        return ret;
 | 
			
		||||
  }
 | 
			
		||||
  template<class left,class right>
 | 
			
		||||
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
 | 
			
		||||
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
      for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	ret._odata[ss]=lhs+rhs._odata[ss];
 | 
			
		||||
@@ -117,9 +117,9 @@ namespace Grid {
 | 
			
		||||
        return ret;
 | 
			
		||||
    }
 | 
			
		||||
  template<class left,class right>
 | 
			
		||||
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
 | 
			
		||||
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
 | 
			
		||||
  {
 | 
			
		||||
    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
      ret._odata[ss]=lhs-rhs._odata[ss];
 | 
			
		||||
@@ -137,9 +137,9 @@ namespace Grid {
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    template<class left,class right>
 | 
			
		||||
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
 | 
			
		||||
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
 | 
			
		||||
    {
 | 
			
		||||
        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
 | 
			
		||||
        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	  ret._odata[ss]=lhs._odata[ss]+rhs;
 | 
			
		||||
@@ -147,9 +147,9 @@ namespace Grid {
 | 
			
		||||
        return ret;
 | 
			
		||||
    }
 | 
			
		||||
    template<class left,class right>
 | 
			
		||||
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
 | 
			
		||||
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
 | 
			
		||||
    {
 | 
			
		||||
      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
 | 
			
		||||
      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
      for(int ss=0;ss<rhs._grid->oSites(); ss++){
 | 
			
		||||
	ret._odata[ss]=lhs._odata[ss]-rhs;
 | 
			
		||||
 
 | 
			
		||||
@@ -14,7 +14,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      typedef typename vobj::scalar_type scalar;
 | 
			
		||||
      typedef typename vobj::vector_type vector;
 | 
			
		||||
      decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
 | 
			
		||||
      decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm; 
 | 
			
		||||
      scalar nrm;
 | 
			
		||||
      //FIXME make this loop parallelisable
 | 
			
		||||
      vnrm=zero;
 | 
			
		||||
@@ -33,10 +33,11 @@ namespace Grid {
 | 
			
		||||
      //->decltype(innerProduct(left._odata[0],right._odata[0]))
 | 
			
		||||
    {
 | 
			
		||||
      typedef typename vobj::scalar_type scalar;
 | 
			
		||||
      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
 | 
			
		||||
      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm; 
 | 
			
		||||
 | 
			
		||||
      scalar nrm;
 | 
			
		||||
      //FIXME make this loop parallelisable
 | 
			
		||||
      vnrm=zero;
 | 
			
		||||
      for(int ss=0;ss<left._grid->oSites(); ss++){
 | 
			
		||||
	vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
 | 
			
		||||
      }
 | 
			
		||||
@@ -94,8 +95,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
			
		||||
  int ld=grid->_ldimensions[orthogdim];
 | 
			
		||||
  int rd=grid->_rdimensions[orthogdim];
 | 
			
		||||
 | 
			
		||||
  sobj szero; szero=zero;
 | 
			
		||||
 | 
			
		||||
  std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
 | 
			
		||||
  std::vector<sobj> lsSum(ld,sobj(zero)); // sum across these down to scalars
 | 
			
		||||
  std::vector<sobj> lsSum(ld,szero); // sum across these down to scalars
 | 
			
		||||
  std::vector<sobj> extracted(Nsimd);     // splitting the SIMD
 | 
			
		||||
 | 
			
		||||
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
 | 
			
		||||
 
 | 
			
		||||
@@ -26,6 +26,8 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  class GridRNGbase {
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
@@ -62,6 +64,21 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // real scalars are one component
 | 
			
		||||
    template<class scalar,class distribution> void fillScalar(scalar &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=dist(_generators[0]);
 | 
			
		||||
    }
 | 
			
		||||
    template<class distribution> void fillScalar(ComplexF &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=ComplexF(dist(_generators[0]),dist(_generators[0]));
 | 
			
		||||
    }
 | 
			
		||||
    template<class distribution> void fillScalar(ComplexD &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=ComplexD(dist(_generators[0]),dist(_generators[0]));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template <class sobj,class distribution> inline void fill(sobj &l,distribution &dist){
 | 
			
		||||
 | 
			
		||||
      typedef typename sobj::scalar_type scalar_type;
 | 
			
		||||
@@ -71,13 +88,60 @@ namespace Grid {
 | 
			
		||||
      scalar_type *buf = (scalar_type *) & l;
 | 
			
		||||
 | 
			
		||||
      for(int idx=0;idx<words;idx++){
 | 
			
		||||
	buf[idx] = dist(_generators[0]);
 | 
			
		||||
	fillScalar(buf[idx],dist);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template <class distribution>  inline void fill(ComplexF &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(ComplexD &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(RealF &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(RealD &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    // vector fill
 | 
			
		||||
    template <class distribution>  inline void fill(vComplexF &l,distribution &dist){
 | 
			
		||||
      RealF *pointer=(RealF *)&l;
 | 
			
		||||
      for(int i=0;i<2*vComplexF::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vComplexD &l,distribution &dist){
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      for(int i=0;i<2*vComplexD::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vRealF &l,distribution &dist){
 | 
			
		||||
      RealF *pointer=(RealF *)&l;
 | 
			
		||||
      for(int i=0;i<vRealF::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vRealD &l,distribution &dist){
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    void SeedRandomDevice(void){
 | 
			
		||||
      std::random_device rd;
 | 
			
		||||
      Seed(rd);
 | 
			
		||||
@@ -186,7 +250,6 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
 | 
			
		||||
    rng.fill(l,rng._uniform);
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -11,21 +11,21 @@ namespace Grid {
 | 
			
		||||
// multiplication by fundamental scalar type
 | 
			
		||||
template<class l,int N> inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iVector<l,N>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iVector<l,N>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (const typename iScalar<l>::scalar_type lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type &rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iMatrix<l,N>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iMatrix<l,N>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (const typename iScalar<l>::scalar_type & lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
@@ -35,24 +35,24 @@ template<class l,int N> inline iMatrix<l,N> operator * (const typename iScalar<l
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t; t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l> inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
@@ -62,24 +62,26 @@ template<class l,int N> inline iMatrix<l,N> operator * (double lhs,const iMatrix
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,ComplexD rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l> inline iScalar<l> operator * (ComplexD lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,ComplexD rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (ComplexD lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,ComplexD rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (ComplexD lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
@@ -89,24 +91,24 @@ template<class l,int N> inline iMatrix<l,N> operator * (ComplexD lhs,const iMatr
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;  t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l> inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs*srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
 | 
			
		||||
@@ -118,14 +120,14 @@ template<class l,int N> inline iMatrix<l,N> operator * (Integer lhs,const iMatri
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l,int N> inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs+srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iMatrix<l,N>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iMatrix<l,N>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs+srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }
 | 
			
		||||
@@ -135,16 +137,16 @@ template<class l,int N> inline iMatrix<l,N> operator + (const typename iScalar<l
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t; t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
 | 
			
		||||
  return lhs+srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l> inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs+srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }
 | 
			
		||||
@@ -155,8 +157,8 @@ template<class l,int N> inline iMatrix<l,N> operator + (double lhs,const iMatrix
 | 
			
		||||
 | 
			
		||||
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t; t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
 | 
			
		||||
  return lhs+srhs;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -164,8 +166,8 @@ template<class l> inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rh
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs+srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }
 | 
			
		||||
@@ -176,23 +178,23 @@ template<class l,int N> inline iMatrix<l,N> operator + (Integer lhs,const iMatri
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l,int N> inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs-srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs(lhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
 | 
			
		||||
  return slhs-rhs;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
 | 
			
		||||
  return lhs-srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs(lhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
 | 
			
		||||
  return slhs-rhs;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -201,27 +203,27 @@ template<class l,int N> inline iMatrix<l,N> operator - (const typename iScalar<l
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t; t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
 | 
			
		||||
  return lhs-srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l> inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(lhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs(t);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
 | 
			
		||||
  return slhs-rhs;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs-srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(lhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs(t);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
 | 
			
		||||
  return slhs-rhs;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -230,26 +232,26 @@ template<class l,int N> inline iMatrix<l,N> operator - (double lhs,const iMatrix
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t; t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
 | 
			
		||||
  return lhs-srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l> inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(lhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=lhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
 | 
			
		||||
  return slhs-rhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(rhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=rhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
 | 
			
		||||
  return lhs-srhs;
 | 
			
		||||
}
 | 
			
		||||
template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs) 
 | 
			
		||||
{
 | 
			
		||||
  typename iScalar<l>::scalar_type t(lhs);
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs(t);
 | 
			
		||||
  typename iScalar<l>::scalar_type t;t=lhs;
 | 
			
		||||
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
 | 
			
		||||
  return slhs-rhs;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -17,7 +17,8 @@ namespace Grid {
 | 
			
		||||
    auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
 | 
			
		||||
    {
 | 
			
		||||
        typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
 | 
			
		||||
        iScalar<ret_t> ret=zero;
 | 
			
		||||
        iScalar<ret_t> ret;
 | 
			
		||||
	ret=zero;
 | 
			
		||||
        for(int c1=0;c1<N;c1++){
 | 
			
		||||
            ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
 | 
			
		||||
        }
 | 
			
		||||
@@ -27,8 +28,9 @@ namespace Grid {
 | 
			
		||||
    auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
 | 
			
		||||
    {
 | 
			
		||||
        typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
 | 
			
		||||
        iScalar<ret_t> ret=zero;
 | 
			
		||||
        iScalar<ret_t> ret;
 | 
			
		||||
        iScalar<ret_t> tmp;
 | 
			
		||||
	ret=zero;
 | 
			
		||||
        for(int c1=0;c1<N;c1++){
 | 
			
		||||
        for(int c2=0;c2<N;c2++){
 | 
			
		||||
	  ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
 | 
			
		||||
 
 | 
			
		||||
@@ -8,6 +8,16 @@ namespace Grid {
 | 
			
		||||
// These can be composed to form tensor products of internal indices.
 | 
			
		||||
///////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
// It is useful to NOT have any constructors
 | 
			
		||||
// so that these classes assert "is_pod<class> == true"
 | 
			
		||||
// because then the standard C++ valarray container eliminates fill overhead on new allocation and 
 | 
			
		||||
// non-move copying.
 | 
			
		||||
//
 | 
			
		||||
// However note that doing this eliminates some syntactical sugar such as 
 | 
			
		||||
// calling the constructor explicitly or implicitly
 | 
			
		||||
//
 | 
			
		||||
#define TENSOR_IS_POD
 | 
			
		||||
 | 
			
		||||
template<class vtype> class iScalar
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
@@ -25,16 +35,22 @@ public:
 | 
			
		||||
  // Scalar no action
 | 
			
		||||
  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
 | 
			
		||||
 | 
			
		||||
    iScalar(){};
 | 
			
		||||
    
 | 
			
		||||
#ifndef TENSOR_IS_POD
 | 
			
		||||
  iScalar(){;};
 | 
			
		||||
  iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
 | 
			
		||||
 | 
			
		||||
  iScalar(const Zero &z){ *this = zero; };
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    iScalar<vtype> & operator= (const Zero &hero){
 | 
			
		||||
      zeroit(*this);
 | 
			
		||||
      return *this;
 | 
			
		||||
    }
 | 
			
		||||
    iScalar<vtype> & operator= (const scalar_type s){
 | 
			
		||||
      _internal=s;
 | 
			
		||||
      return *this;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    friend void zeroit(iScalar<vtype> &that){
 | 
			
		||||
        zeroit(that._internal);
 | 
			
		||||
    }
 | 
			
		||||
@@ -114,8 +130,10 @@ public:
 | 
			
		||||
 | 
			
		||||
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
 | 
			
		||||
 | 
			
		||||
#ifndef TENSOR_IS_POD
 | 
			
		||||
  iVector(const Zero &z){ *this = zero; };
 | 
			
		||||
  iVector() {};// Empty constructure
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    iVector<vtype,N> & operator= (const Zero &hero){
 | 
			
		||||
        zeroit(*this);
 | 
			
		||||
@@ -185,8 +203,11 @@ public:
 | 
			
		||||
 | 
			
		||||
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
 | 
			
		||||
 | 
			
		||||
#ifndef TENSOR_IS_POD
 | 
			
		||||
  iMatrix(const Zero &z){ *this = zero; };
 | 
			
		||||
  iMatrix() {};
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  iMatrix<vtype,N> & operator= (const Zero &hero){
 | 
			
		||||
    zeroit(*this);
 | 
			
		||||
    return *this;
 | 
			
		||||
 
 | 
			
		||||
@@ -8,6 +8,7 @@ namespace QCD {
 | 
			
		||||
    static const int Ns=4;
 | 
			
		||||
    static const int Nd=4;
 | 
			
		||||
    static const int Nhs=2; // half spinor
 | 
			
		||||
    static const int Nds=8; // double stored gauge field
 | 
			
		||||
 | 
			
		||||
    static const int CbRed  =0;
 | 
			
		||||
    static const int CbBlack=1;
 | 
			
		||||
@@ -28,79 +29,216 @@ namespace QCD {
 | 
			
		||||
    //
 | 
			
		||||
    // That probably makes for GridRedBlack4dCartesian grid.
 | 
			
		||||
 | 
			
		||||
    // s,sp,c,spc,lc
 | 
			
		||||
    template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
 | 
			
		||||
    template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
 | 
			
		||||
    template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 | 
			
		||||
    template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 | 
			
		||||
    template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 | 
			
		||||
    template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 | 
			
		||||
    template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 | 
			
		||||
    template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
 | 
			
		||||
    template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
 | 
			
		||||
 | 
			
		||||
    template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
 | 
			
		||||
    template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
 | 
			
		||||
 | 
			
		||||
    // Spin matrix
 | 
			
		||||
    typedef iSpinMatrix<Complex  >          SpinMatrix;
 | 
			
		||||
    typedef iSpinMatrix<ComplexF >          SpinMatrixF;
 | 
			
		||||
    typedef iSpinMatrix<ComplexD >          SpinMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef iSpinMatrix<vComplex >          vSpinMatrix;
 | 
			
		||||
    typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
 | 
			
		||||
    typedef iSpinMatrix<vComplexD>          vSpinMatrixD;
 | 
			
		||||
 | 
			
		||||
    // Colour Matrix
 | 
			
		||||
    typedef iColourMatrix<Complex  >        ColourMatrix;
 | 
			
		||||
    typedef iColourMatrix<ComplexF >        ColourMatrixF;
 | 
			
		||||
    typedef iColourMatrix<ComplexD >        ColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef iColourMatrix<vComplex >        vColourMatrix;
 | 
			
		||||
    typedef iColourMatrix<vComplexF>        vColourMatrixF;
 | 
			
		||||
    typedef iColourMatrix<vComplexD>        vColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    // SpinColour matrix
 | 
			
		||||
    typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
 | 
			
		||||
    typedef iSpinColourMatrix<ComplexF >    SpinColourMatrixF;
 | 
			
		||||
    typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 | 
			
		||||
    typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 | 
			
		||||
    typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    // LorentzColour
 | 
			
		||||
    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 | 
			
		||||
    typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 | 
			
		||||
    typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef iSpinVector<Complex >           SpinVector;
 | 
			
		||||
    typedef iColourVector<Complex >         ColourVector;
 | 
			
		||||
    typedef iSpinColourVector<Complex >     SpinColourVector;
 | 
			
		||||
    typedef iHalfSpinVector<Complex >       HalfSpinVector;
 | 
			
		||||
    typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    typedef iSpinMatrix<vComplex >          vSpinMatrix;
 | 
			
		||||
    typedef iColourMatrix<vComplex >        vColourMatrix;
 | 
			
		||||
    typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 | 
			
		||||
    typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
 | 
			
		||||
    typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
 | 
			
		||||
    typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    // DoubleStored gauge field
 | 
			
		||||
    typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
 | 
			
		||||
    typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
 | 
			
		||||
    typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 | 
			
		||||
    typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 | 
			
		||||
    typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    // Spin vector
 | 
			
		||||
    typedef iSpinVector<Complex >           SpinVector;
 | 
			
		||||
    typedef iSpinVector<ComplexF>           SpinVectorF;
 | 
			
		||||
    typedef iSpinVector<ComplexD>           SpinVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef iSpinVector<vComplex >           vSpinVector;
 | 
			
		||||
    typedef iColourVector<vComplex >         vColourVector;
 | 
			
		||||
    typedef iSpinColourVector<vComplex >     vSpinColourVector;
 | 
			
		||||
    typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
 | 
			
		||||
    typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 | 
			
		||||
    typedef iSpinVector<vComplexF>           vSpinVectorF;
 | 
			
		||||
    typedef iSpinVector<vComplexD>           vSpinVectorD;
 | 
			
		||||
 | 
			
		||||
    // Colour vector
 | 
			
		||||
    typedef iColourVector<Complex >         ColourVector;
 | 
			
		||||
    typedef iColourVector<ComplexF>         ColourVectorF;
 | 
			
		||||
    typedef iColourVector<ComplexD>         ColourVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef iColourVector<vComplex >         vColourVector;
 | 
			
		||||
    typedef iColourVector<vComplexF>         vColourVectorF;
 | 
			
		||||
    typedef iColourVector<vComplexD>         vColourVectorD;
 | 
			
		||||
 | 
			
		||||
    // SpinColourVector
 | 
			
		||||
    typedef iSpinColourVector<Complex >     SpinColourVector;
 | 
			
		||||
    typedef iSpinColourVector<ComplexF>     SpinColourVectorF;
 | 
			
		||||
    typedef iSpinColourVector<ComplexD>     SpinColourVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef iSpinColourVector<vComplex >     vSpinColourVector;
 | 
			
		||||
    typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
 | 
			
		||||
    typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;
 | 
			
		||||
 | 
			
		||||
    // HalfSpin vector
 | 
			
		||||
    typedef iHalfSpinVector<Complex >       HalfSpinVector;
 | 
			
		||||
    typedef iHalfSpinVector<ComplexF>       HalfSpinVectorF;
 | 
			
		||||
    typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
 | 
			
		||||
    typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
 | 
			
		||||
    typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;
 | 
			
		||||
 | 
			
		||||
    // HalfSpinColour vector
 | 
			
		||||
    typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
 | 
			
		||||
    typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
 | 
			
		||||
    typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 | 
			
		||||
    
 | 
			
		||||
    typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 | 
			
		||||
    typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 | 
			
		||||
    typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 | 
			
		||||
    
 | 
			
		||||
    // singlets
 | 
			
		||||
    typedef iSinglet<Complex >         TComplex;    // FIXME This is painful. Tensor singlet complex type.
 | 
			
		||||
    typedef iSinglet<vComplex >        vTComplex;   // what if we don't know the tensor structure
 | 
			
		||||
    typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
 | 
			
		||||
    typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tensor singlet complex type.
 | 
			
		||||
 | 
			
		||||
    typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
 | 
			
		||||
    typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
 | 
			
		||||
    typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure
 | 
			
		||||
 | 
			
		||||
    typedef iSinglet<Real >            TReal;       // Shouldn't need these; can I make it work without?
 | 
			
		||||
    typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
 | 
			
		||||
    typedef iSinglet<RealD>            TRealD;       // Shouldn't need these; can I make it work without?
 | 
			
		||||
 | 
			
		||||
    typedef iSinglet<vReal >           vTReal;      
 | 
			
		||||
    typedef iSinglet<vInteger >        vTInteger;
 | 
			
		||||
    typedef iSinglet<vRealF>           vTRealF;      
 | 
			
		||||
    typedef iSinglet<vRealD>           vTRealD;      
 | 
			
		||||
 | 
			
		||||
    typedef iSinglet<vInteger>         vTInteger;
 | 
			
		||||
    typedef iSinglet<Integer >         TInteger;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vTReal>            LatticeReal;
 | 
			
		||||
    typedef Lattice<vTComplex>         LatticeComplex;
 | 
			
		||||
    typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
 | 
			
		||||
 | 
			
		||||
    // Lattices of these
 | 
			
		||||
    typedef Lattice<vColourMatrix>          LatticeColourMatrix;
 | 
			
		||||
    typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
 | 
			
		||||
    typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
 | 
			
		||||
    typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
 | 
			
		||||
    typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
 | 
			
		||||
    typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
 | 
			
		||||
    typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
 | 
			
		||||
    typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
 | 
			
		||||
    typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    // DoubleStored gauge field
 | 
			
		||||
    typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
 | 
			
		||||
    typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
 | 
			
		||||
    typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vSpinVector>            LatticeSpinVector;
 | 
			
		||||
    typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
 | 
			
		||||
    typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vColourVector>          LatticeColourVector;
 | 
			
		||||
    typedef Lattice<vColourVectorF>         LatticeColourVectorF;
 | 
			
		||||
    typedef Lattice<vColourVectorD>         LatticeColourVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
 | 
			
		||||
    typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
 | 
			
		||||
    typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
 | 
			
		||||
    typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
 | 
			
		||||
    typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
 | 
			
		||||
    typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
 | 
			
		||||
    typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vTReal>            LatticeReal;
 | 
			
		||||
    typedef Lattice<vTRealF>           LatticeRealF;
 | 
			
		||||
    typedef Lattice<vTRealD>           LatticeRealD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vTComplex>         LatticeComplex;
 | 
			
		||||
    typedef Lattice<vTComplexF>        LatticeComplexF;
 | 
			
		||||
    typedef Lattice<vTComplexD>        LatticeComplexD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    // Physical names for things
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    typedef Lattice<vHalfSpinColourVector> LatticeHalfFermion;
 | 
			
		||||
    typedef Lattice<vSpinColourVector>     LatticeFermion;
 | 
			
		||||
    typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
 | 
			
		||||
    typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
 | 
			
		||||
    typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vSpinColourMatrix>     LatticePropagator;
 | 
			
		||||
    typedef Lattice<vLorentzColourMatrix>  LatticeGaugeField;
 | 
			
		||||
    typedef LatticeSpinColourVector      LatticeFermion;
 | 
			
		||||
    typedef LatticeSpinColourVectorF     LatticeFermionF;
 | 
			
		||||
    typedef LatticeSpinColourVectorD     LatticeFermionD;
 | 
			
		||||
 | 
			
		||||
    typedef LatticeSpinColourMatrix                LatticePropagator;
 | 
			
		||||
    typedef LatticeSpinColourMatrixF               LatticePropagatorF;
 | 
			
		||||
    typedef LatticeSpinColourMatrixD               LatticePropagatorD;
 | 
			
		||||
 | 
			
		||||
    typedef LatticeLorentzColourMatrix             LatticeGaugeField;
 | 
			
		||||
    typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
 | 
			
		||||
    typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;
 | 
			
		||||
 | 
			
		||||
    typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
 | 
			
		||||
    typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
 | 
			
		||||
    typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;
 | 
			
		||||
 | 
			
		||||
    // Uhgg... typing this hurt  ;)
 | 
			
		||||
    // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
 | 
			
		||||
    typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
 | 
			
		||||
    typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
 | 
			
		||||
    typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
 | 
			
		||||
    typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
 | 
			
		||||
    typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Peek and Poke named after physics attributes
 | 
			
		||||
@@ -157,11 +295,14 @@ namespace QCD {
 | 
			
		||||
      return peekIndex<LorentzIndex>(rhs,i,j);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // FIXME transpose Colour, transpose Spin, traceColour traceSpin
 | 
			
		||||
 | 
			
		||||
}   //namespace QCD
 | 
			
		||||
} // Grid
 | 
			
		||||
 | 
			
		||||
#include <qcd/Grid_qcd_dirac.h>
 | 
			
		||||
#include <qcd/Grid_qcd_2spinor.h>
 | 
			
		||||
//#include <qcd/Grid_qcd_pauli.h>
 | 
			
		||||
#include <qcd/Grid_qcd_wilson_dop.h>
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -17,8 +17,14 @@ namespace QCD {
 | 
			
		||||
      GammaZ, 
 | 
			
		||||
      GammaT,  
 | 
			
		||||
      Gamma5,
 | 
			
		||||
      //      GammaXGamma5, 
 | 
			
		||||
      //      GammaYGamma5, 
 | 
			
		||||
      MinusIdentity,                        
 | 
			
		||||
      MinusGammaX, 
 | 
			
		||||
      MinusGammaY, 
 | 
			
		||||
      MinusGammaZ, 
 | 
			
		||||
      MinusGammaT,  
 | 
			
		||||
      MinusGamma5
 | 
			
		||||
      //      GammaXGamma5, // Rest are composite (willing to take hit for two calls sequentially)
 | 
			
		||||
      //      GammaYGamma5, // as they are less commonly used.
 | 
			
		||||
      //      GammaZGamma5, 
 | 
			
		||||
      //      GammaTGamma5,  
 | 
			
		||||
      //      SigmaXY,
 | 
			
		||||
@@ -27,12 +33,6 @@ namespace QCD {
 | 
			
		||||
      //      SigmaXT,
 | 
			
		||||
      //      SigmaYT,
 | 
			
		||||
      //      SigmaZT,
 | 
			
		||||
      MinusIdentity,                        
 | 
			
		||||
      MinusGammaX, 
 | 
			
		||||
      MinusGammaY, 
 | 
			
		||||
      MinusGammaZ, 
 | 
			
		||||
      MinusGammaT,  
 | 
			
		||||
      MinusGamma5
 | 
			
		||||
      //      MinusGammaXGamma5, easiest to form by composition
 | 
			
		||||
      //      MinusGammaYGamma5, as performance is not critical for these
 | 
			
		||||
      //      MinusGammaZGamma5, 
 | 
			
		||||
@@ -54,7 +54,6 @@ namespace QCD {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    /* Gx
 | 
			
		||||
     *  0 0  0  i    
 | 
			
		||||
     *  0 0  i  0    
 | 
			
		||||
 
 | 
			
		||||
@@ -1,157 +1,220 @@
 | 
			
		||||
#ifnfdef GRID_QCD_WILSON_DOP_H
 | 
			
		||||
#define  GRID_QCD_WILSON_DOP_H
 | 
			
		||||
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3,0});
 | 
			
		||||
const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1,0});
 | 
			
		||||
 | 
			
		||||
  // Should be in header?
 | 
			
		||||
static const int WilsonMatrix::Xp = 0;
 | 
			
		||||
static const int WilsonMatrix::Yp = 1;
 | 
			
		||||
static const int WilsonMatrix::Zp = 2;
 | 
			
		||||
static const int WilsonMatrix::Tp = 3;
 | 
			
		||||
static const int WilsonMatrix::Xm = 4;
 | 
			
		||||
static const int WilsonMatrix::Ym = 5;
 | 
			
		||||
static const int WilsonMatrix::Zm = 6;
 | 
			
		||||
static const int WilsonMatrix::Tm = 7;
 | 
			
		||||
static const int WilsonMatrix::X0 = 8;
 | 
			
		||||
static const int WilsonMatrix::npoint=9;
 | 
			
		||||
const int WilsonMatrix::Xp = 0;
 | 
			
		||||
const int WilsonMatrix::Yp = 1;
 | 
			
		||||
const int WilsonMatrix::Zp = 2;
 | 
			
		||||
const int WilsonMatrix::Tp = 3;
 | 
			
		||||
const int WilsonMatrix::Xm = 4;
 | 
			
		||||
const int WilsonMatrix::Ym = 5;
 | 
			
		||||
const int WilsonMatrix::Zm = 6;
 | 
			
		||||
const int WilsonMatrix::Tm = 7;
 | 
			
		||||
  //const int WilsonMatrix::X0 = 8;
 | 
			
		||||
 | 
			
		||||
  class WilsonCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
    int mu;
 | 
			
		||||
    
 | 
			
		||||
WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,int _mass)
 | 
			
		||||
  : Stencil((&Umu._grid,npoint,0,directions,displacements),
 | 
			
		||||
    void Point(int p) { mu=p;};
 | 
			
		||||
    vHalfSpinColourVector operator () (vSpinColourVector &in)
 | 
			
		||||
    {
 | 
			
		||||
      vHalfSpinColourVector ret;
 | 
			
		||||
      switch(mu) {
 | 
			
		||||
      case WilsonMatrix::Xp:
 | 
			
		||||
	spProjXp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Yp:
 | 
			
		||||
	spProjYp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Zp:
 | 
			
		||||
	spProjZp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Tp:
 | 
			
		||||
	spProjTp(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Xm:
 | 
			
		||||
	spProjXm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Ym:
 | 
			
		||||
	spProjYm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Zm:
 | 
			
		||||
	spProjZm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      case WilsonMatrix::Tm:
 | 
			
		||||
	spProjTm(ret,in);
 | 
			
		||||
	break;
 | 
			
		||||
      default: 
 | 
			
		||||
	assert(0);
 | 
			
		||||
	break;
 | 
			
		||||
      }
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,double _mass)
 | 
			
		||||
  : Stencil(Umu._grid,npoint,0,directions,displacements),
 | 
			
		||||
    mass(_mass),
 | 
			
		||||
    Umu(_Umu)
 | 
			
		||||
    Umu(_Umu._grid)
 | 
			
		||||
{
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  grid = _Umu._grid;
 | 
			
		||||
  comm_buf.resize(Stencil._unified_buffer_size);
 | 
			
		||||
  DoubleStore(Umu,_Umu);
 | 
			
		||||
}
 | 
			
		||||
      
 | 
			
		||||
void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
 | 
			
		||||
{
 | 
			
		||||
  LatticeColourMatrix U(grid);
 | 
			
		||||
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U = peekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
    pokeIndex<LorentzIndex>(Uds,U,mu);
 | 
			
		||||
    U = adj(Cshift(U,mu,-1));
 | 
			
		||||
    pokeIndex<LorentzIndex>(Uds,U,mu+4);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::multiply(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  
 | 
			
		||||
  Dhop(in,out);
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Stencil.HaloExchange(in,comm_buf);
 | 
			
		||||
  //  Stencil.HaloExchange(in,comm_buf);
 | 
			
		||||
 | 
			
		||||
  for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
 | 
			
		||||
    int offset,local;
 | 
			
		||||
 | 
			
		||||
    vSpinColourVector result;
 | 
			
		||||
    vHalfSpinColourVector UChi;
 | 
			
		||||
 | 
			
		||||
    vHalfSpinColourVector  chi;    
 | 
			
		||||
    vHalfSpinColourVector Uchi;
 | 
			
		||||
    vHalfSpinColourVector *chi_p;
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = Stencil._offsets [Xp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xp][ss];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjXp(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
    }
 | 
			
		||||
    result = ReconXp(Uchi);
 | 
			
		||||
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = Stencil._offsets [Yp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Yp][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjYp(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconYp(Uchi);
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Yp)),&(*chi_p)());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = Stencil._offsets [Zp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zp][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjZp(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconZp(Uchi);
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)() );
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = Stencil._offsets [Tp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tp][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjTp(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconTp(Uchi);
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = Stencil._offsets [Xm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xm][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjXm(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconXm(Uchi);
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = Stencil._offsets [Ym][ss];
 | 
			
		||||
    local  = Stencil._is_local[Ym][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjYm(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconYm(Uchi);
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Ym)),&(*chi_p)());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = Stencil._offsets [Zm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zm][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjZm(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconZm(Uchi);
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Zm)),&(*chi_p)());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = Stencil._offsets [Tm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tm][ss];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      Uchi = U[]*spProjTm(in._odata[offset]);
 | 
			
		||||
    } else {
 | 
			
		||||
      Uchi = U[]*comm_buf._odata[offset]
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = χ
 | 
			
		||||
    } 
 | 
			
		||||
    result+= ReconTm(Uchi);
 | 
			
		||||
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
#endif
 | 
			
		||||
    out._odata[ss] = result;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MpcDag   (const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::Mpc      (const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MpcDagMpc(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MDagM    (const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
#ifnfdef GRID_QCD_WILSON_DOP_H
 | 
			
		||||
#ifndef  GRID_QCD_WILSON_DOP_H
 | 
			
		||||
#define  GRID_QCD_WILSON_DOP_H
 | 
			
		||||
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
@@ -21,21 +21,23 @@ namespace Grid {
 | 
			
		||||
      GridBase                     *grid;
 | 
			
		||||
 | 
			
		||||
      // Copy of the gauge field 
 | 
			
		||||
      LatticeGaugeField             Umu;
 | 
			
		||||
      LatticeDoubledGaugeField             Umu;
 | 
			
		||||
 | 
			
		||||
      //Defines the stencil
 | 
			
		||||
      CartesianStencil              Stencil; 
 | 
			
		||||
      static const int npoint=9;
 | 
			
		||||
      static const std::vector<int> directions   ;
 | 
			
		||||
      static const std::vector<int> displacements;
 | 
			
		||||
 | 
			
		||||
      static const int Xp,Xm,Yp,Ym,Zp,Zm,Tp,Tm;
 | 
			
		||||
 | 
			
		||||
      // Comms buffer
 | 
			
		||||
      std::vector<vSpinColourVector,alignedAllocator<vSpinColourVector> >  comm_buf;
 | 
			
		||||
      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
 | 
			
		||||
 | 
			
		||||
      // Constructor
 | 
			
		||||
      WilsonMatrix(LatticeGaugeField &Umu,int mass);
 | 
			
		||||
      WilsonMatrix(LatticeGaugeField &Umu,double mass);
 | 
			
		||||
 | 
			
		||||
      // DoubleStore
 | 
			
		||||
      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
 | 
			
		||||
 | 
			
		||||
      // override multiply
 | 
			
		||||
      void multiply(const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 
 | 
			
		||||
@@ -252,12 +252,12 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
            vComplexD ret ; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	    //	    addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
 | 
			
		||||
	    //            __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
 | 
			
		||||
	    //             ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
 | 
			
		||||
            ret.v = _mm256_addsub_pd(ret.v,in.v);
 | 
			
		||||
	    zvec tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
 | 
			
		||||
	    ret.v    =_mm256_shuffle_pd(tmp,tmp,0x5);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_addsub_pd(ret.v,in.v);
 | 
			
		||||
	    zvec tmp = _mm_addsub_pd(ret.v,_mm_shuffle_pd(in.v,in.v,0x1));
 | 
			
		||||
	    ret.v    = _mm_shuffle_pd(tmp,tmp,0x1);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
	    ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);             
 | 
			
		||||
@@ -268,48 +268,41 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
            return ret;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        friend inline vComplexD timesI(const vComplexD &in){
 | 
			
		||||
        friend inline vComplexD timesMinusI(const vComplexD &in){
 | 
			
		||||
	  vComplexD ret; vzero(ret);
 | 
			
		||||
	  vComplexD tmp;
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
	  /*
 | 
			
		||||
             IF IMM0[0] = 0
 | 
			
		||||
             THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
 | 
			
		||||
             IF IMM0[1] = 0
 | 
			
		||||
             THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
 | 
			
		||||
             IF IMM0[2] = 0
 | 
			
		||||
             THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
 | 
			
		||||
             IF IMM0[3] = 0
 | 
			
		||||
             THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
 | 
			
		||||
	  */
 | 
			
		||||
          ret.v    =_mm256_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
	  tmp.v    =_mm256_addsub_pd(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v    =_mm256_shuffle_pd(tmp.v,tmp.v,0x5);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v    =_mm_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
	  tmp.v    =_mm_addsub_pd(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v    =_mm_shuffle_pd(tmp.v,tmp.v,0x1);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
          ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag 
 | 
			
		||||
	  ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
          ret.v = _mm512_mask_sub_pd(in.v,0xaaaa,ret.v,in.v); // real -imag 
 | 
			
		||||
	  ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
            assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
	  return ret;
 | 
			
		||||
	}
 | 
			
		||||
        friend inline vComplexD timesMinusI(const vComplexD &in){
 | 
			
		||||
 | 
			
		||||
        friend inline vComplexD timesI(const vComplexD &in){
 | 
			
		||||
	  vComplexD ret; vzero(ret);
 | 
			
		||||
	  vComplexD tmp;
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
          ret.v    =_mm256_addsub_ps(ret.v,tmp); // i,-r
 | 
			
		||||
	  tmp.v    =_mm256_shuffle_pd(in.v,in.v,0x5);
 | 
			
		||||
          ret.v    =_mm256_addsub_pd(ret.v,tmp.v); // i,-r
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
          ret.v    =_mm_addsub_ps(ret.v,tmp); // r,-i
 | 
			
		||||
	  tmp.v    =_mm_shuffle_pd(in.v,in.v,0x1);
 | 
			
		||||
          ret.v    =_mm_addsub_pd(ret.v,tmp.v); // r,-i
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
          cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
	  ret.v    = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp); // real -imag 
 | 
			
		||||
          tmp.v    = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
	  ret.v    = _mm512_mask_sub_pd(tmp.v,0xaaaa,ret.v,tmp.v); // real -imag 
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
            assert(0);
 | 
			
		||||
 
 | 
			
		||||
@@ -214,10 +214,10 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
       {
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	   union {
 | 
			
		||||
	     __m128 v1;    // SSE 4 x float vector
 | 
			
		||||
	     cvec v1;    // SSE 4 x float vector
 | 
			
		||||
	     float f[4];  // scalar array of 4 floats
 | 
			
		||||
	   } u128;
 | 
			
		||||
	   u128.v1= _mm_add_ps(v, _mm_shuffle_ps(v, v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
 | 
			
		||||
	   u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
 | 
			
		||||
	   return ComplexF(u128.f[0], u128.f[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX1
 | 
			
		||||
@@ -329,13 +329,15 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
        friend inline vComplexF conj(const vComplexF &in){
 | 
			
		||||
            vComplexF ret ; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	    //             cvec tmp;
 | 
			
		||||
	    //             tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    //             ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
	    ret.v = _mm256_addsub_ps(ret.v,in.v);
 | 
			
		||||
	    cvec tmp;
 | 
			
		||||
	    tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_addsub_ps(ret.v,in.v);
 | 
			
		||||
	    cvec tmp;
 | 
			
		||||
	    tmp = _mm_addsub_ps(ret.v,_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    ret.v=_mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag 
 | 
			
		||||
@@ -345,15 +347,16 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
#endif
 | 
			
		||||
            return ret;
 | 
			
		||||
        }
 | 
			
		||||
        friend inline vComplexF timesI(const vComplexF &in){
 | 
			
		||||
	  vComplexF ret; vzero(ret);
 | 
			
		||||
        friend inline vComplexF timesMinusI(const vComplexF &in){
 | 
			
		||||
	  vComplexF ret; 
 | 
			
		||||
	  vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v = _mm256_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
          ret.v = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v = _mm_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
          ret.v = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
          ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag 
 | 
			
		||||
@@ -364,14 +367,14 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
#endif
 | 
			
		||||
	  return ret;
 | 
			
		||||
	}
 | 
			
		||||
        friend inline vComplexF timesMinusI(const vComplexF &in){
 | 
			
		||||
        friend inline vComplexF timesI(const vComplexF &in){
 | 
			
		||||
	  vComplexF ret; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
          ret.v = _mm256_addsub_ps(ret.v,tmp); // i,-r
 | 
			
		||||
	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));//i,r
 | 
			
		||||
          ret.v    =_mm256_addsub_ps(ret.v,tmp);     //i,-r
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
	  cvec tmp =_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
          ret.v = _mm_addsub_ps(ret.v,tmp); // r,-i
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
@@ -443,5 +446,8 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
    inline vComplexF trace(const vComplexF &arg){
 | 
			
		||||
        return arg;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -146,7 +146,7 @@ namespace Grid {
 | 
			
		||||
            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_set_pd(a[0],a[1]);
 | 
			
		||||
            ret.v = _mm_set_pd(a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
			
		||||
@@ -186,6 +186,15 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
       friend inline RealD Reduce(const vRealD & in)
 | 
			
		||||
       {
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
	 // FIXME Hack
 | 
			
		||||
	 const RealD * ptr =(const RealD *)  ∈
 | 
			
		||||
	 RealD ret = 0; 
 | 
			
		||||
	 for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
	   ret = ret+ptr[i];
 | 
			
		||||
	 }
 | 
			
		||||
	 return ret;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
	 typedef union  {
 | 
			
		||||
	   uint64_t l;
 | 
			
		||||
 
 | 
			
		||||
@@ -175,7 +175,7 @@ namespace Grid {
 | 
			
		||||
            ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
 | 
			
		||||
            ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
 | 
			
		||||
@@ -220,6 +220,15 @@ friend inline void vstore(const vRealF &ret, float *a){
 | 
			
		||||
        }
 | 
			
		||||
       friend inline RealF Reduce(const vRealF & in)
 | 
			
		||||
       {
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
	 // FIXME Hack
 | 
			
		||||
	 const RealF * ptr = (const RealF *) ∈
 | 
			
		||||
	 RealF ret = 0; 
 | 
			
		||||
	 for(int i=0;i<vRealF::Nsimd();i++){
 | 
			
		||||
	   ret = ret+ptr[i];
 | 
			
		||||
	 }
 | 
			
		||||
	 return ret;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
            __attribute__ ((aligned(32))) float c_[16];
 | 
			
		||||
            __m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
 | 
			
		||||
 
 | 
			
		||||
@@ -121,8 +121,9 @@ namespace Grid {
 | 
			
		||||
      int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
      int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
      
 | 
			
		||||
      assert(simd_layout==1);
 | 
			
		||||
      //      assert(simd_layout==1); // Why?
 | 
			
		||||
      assert(comm_dim==1);
 | 
			
		||||
      shift = (shift + fd) %fd;
 | 
			
		||||
      assert(shift>=0);
 | 
			
		||||
      assert(shift<fd);
 | 
			
		||||
      
 | 
			
		||||
 
 | 
			
		||||
@@ -5,6 +5,11 @@ using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
@@ -22,16 +27,27 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridSerialRNG            sRNG;
 | 
			
		||||
  sRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  SpinMatrix ident=zero;
 | 
			
		||||
  SpinMatrix ident; ident=zero;
 | 
			
		||||
  SpinMatrix rnd  ; random(sRNG,rnd);
 | 
			
		||||
 | 
			
		||||
  SpinMatrix ll=zero;
 | 
			
		||||
  SpinMatrix rr=zero;
 | 
			
		||||
  SpinMatrix ll; ll=zero;
 | 
			
		||||
  SpinMatrix rr; rr=zero;
 | 
			
		||||
  SpinMatrix result;
 | 
			
		||||
 | 
			
		||||
  SpinVector lv; random(sRNG,lv);
 | 
			
		||||
  SpinVector rv; random(sRNG,rv);
 | 
			
		||||
 | 
			
		||||
  std::cout << " Is pod " << std::is_pod<SpinVector>::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod double   " << std::is_pod<double>::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod ComplexF " << std::is_pod<ComplexF>::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod scal<double> " << std::is_pod<scal<double> >::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod Scalar<double> " << std::is_pod<iScalar<double> >::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod Scalar<ComplexF> " << std::is_pod<iScalar<ComplexF> >::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod Scalar<vComplexF> " << std::is_pod<iScalar<vComplexF> >::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod Scalar<vComplexD> " << std::is_pod<iScalar<vComplexD> >::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod Scalar<vRealF> " << std::is_pod<iScalar<vRealF> >::value  << std::endl;
 | 
			
		||||
  std::cout << " Is pod Scalar<vRealD> " << std::is_pod<iScalar<vRealD> >::value  << std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int a=0;a<Ns;a++){
 | 
			
		||||
    ident()(a,a) = 1.0;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -50,7 +50,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  double vol = Fine.gSites();
 | 
			
		||||
  Complex PlaqScale(1.0/vol/6.0/3.0);
 | 
			
		||||
 | 
			
		||||
@@ -58,7 +57,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  sliceSum(Plaq,Plaq_T,Nd-1);
 | 
			
		||||
  int Nt = Plaq_T.size();
 | 
			
		||||
 | 
			
		||||
  TComplex Plaq_T_sum=zero;
 | 
			
		||||
  TComplex Plaq_T_sum; 
 | 
			
		||||
  Plaq_T_sum=zero;
 | 
			
		||||
  for(int t=0;t<Nt;t++){
 | 
			
		||||
    Plaq_T_sum = Plaq_T_sum+Plaq_T[t];
 | 
			
		||||
    Complex Pt=TensorRemove(Plaq_T[t]);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										166
									
								
								tests/Grid_simd.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										166
									
								
								tests/Grid_simd.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,166 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <parallelIO/GridNerscIO.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
class funcPlus {
 | 
			
		||||
public:
 | 
			
		||||
  funcPlus() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1+i2;}
 | 
			
		||||
  std::string name(void) const { return std::string("Plus"); }
 | 
			
		||||
};
 | 
			
		||||
class funcMinus {
 | 
			
		||||
public:
 | 
			
		||||
  funcMinus() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1-i2;}
 | 
			
		||||
  std::string name(void) const { return std::string("Minus"); }
 | 
			
		||||
};
 | 
			
		||||
class funcTimes {
 | 
			
		||||
public:
 | 
			
		||||
  funcTimes() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
 | 
			
		||||
  std::string name(void) const { return std::string("Times"); }
 | 
			
		||||
};
 | 
			
		||||
class funcConj {
 | 
			
		||||
public:
 | 
			
		||||
  funcConj() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);}
 | 
			
		||||
  std::string name(void) const { return std::string("Conj"); }
 | 
			
		||||
};
 | 
			
		||||
class funcAdj {
 | 
			
		||||
public:
 | 
			
		||||
  funcAdj() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = adj(i1);}
 | 
			
		||||
  std::string name(void) const { return std::string("Adj"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class funcTimesI {
 | 
			
		||||
public:
 | 
			
		||||
  funcTimesI() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesI(i1);}
 | 
			
		||||
  std::string name(void) const { return std::string("timesI"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class funcTimesMinusI {
 | 
			
		||||
public:
 | 
			
		||||
  funcTimesMinusI() {};
 | 
			
		||||
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);}
 | 
			
		||||
  std::string name(void) const { return std::string("timesMinusI"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class scal, class vec,class functor > 
 | 
			
		||||
void Tester(const functor &func)
 | 
			
		||||
{
 | 
			
		||||
  GridSerialRNG          sRNG;
 | 
			
		||||
  sRNG.SeedRandomDevice();
 | 
			
		||||
  
 | 
			
		||||
  int Nsimd = vec::Nsimd();
 | 
			
		||||
 | 
			
		||||
  std::vector<scal> input1(Nsimd);
 | 
			
		||||
  std::vector<scal> input2(Nsimd);
 | 
			
		||||
  std::vector<scal> result(Nsimd);
 | 
			
		||||
  std::vector<scal> reference(Nsimd);
 | 
			
		||||
 | 
			
		||||
  std::vector<vec,alignedAllocator<vec> > buf(3);
 | 
			
		||||
  vec & v_input1 = buf[0];
 | 
			
		||||
  vec & v_input2 = buf[1];
 | 
			
		||||
  vec & v_result = buf[2];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    random(sRNG,input1[i]);
 | 
			
		||||
    random(sRNG,input2[i]);
 | 
			
		||||
    random(sRNG,result[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Gmerge(v_input1,input1);
 | 
			
		||||
  Gmerge(v_input2,input2);
 | 
			
		||||
  Gmerge(v_result,result);
 | 
			
		||||
 | 
			
		||||
  func(v_result,v_input1,v_input2);
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++) {
 | 
			
		||||
    func(reference[i],input1[i],input2[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Gextract(v_result,result);
 | 
			
		||||
  std::cout << " " << func.name()<<std::endl;
 | 
			
		||||
 | 
			
		||||
  int ok=0;
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    if ( abs(reference[i]-result[i])>0){
 | 
			
		||||
      std::cout<< "*****" << std::endl;
 | 
			
		||||
      std::cout<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
 | 
			
		||||
      ok++;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( ok==0 ) std::cout << " OK!" <<std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout({1,1,2,2});
 | 
			
		||||
  std::vector<int> mpi_layout ({1,1,1,1});
 | 
			
		||||
  std::vector<int> latt_size  ({8,8,8,8});
 | 
			
		||||
    
 | 
			
		||||
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
 | 
			
		||||
  // Insist that operations on random scalars gives
 | 
			
		||||
  // identical results to on vectors.
 | 
			
		||||
 | 
			
		||||
  Tester<RealD,vRealD>(funcPlus());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vComplexF "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcTimesI());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcTimesMinusI());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcPlus());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcMinus());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcTimes());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcConj());
 | 
			
		||||
  Tester<ComplexF,vComplexF>(funcAdj());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vComplexD "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcTimesI());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcTimesMinusI());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcPlus());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcMinus());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcTimes());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcConj());
 | 
			
		||||
  Tester<ComplexD,vComplexD>(funcAdj());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealF "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Tester<RealF,vRealF>(funcPlus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcMinus());
 | 
			
		||||
  Tester<RealF,vRealF>(funcTimes());
 | 
			
		||||
  Tester<RealF,vRealF>(funcAdj());
 | 
			
		||||
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout << "Testing vRealD "<<std::endl;
 | 
			
		||||
  std::cout << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
  Tester<RealD,vRealD>(funcPlus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcMinus());
 | 
			
		||||
  Tester<RealD,vRealD>(funcTimes());
 | 
			
		||||
  Tester<RealD,vRealD>(funcAdj());
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -4,48 +4,25 @@ using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
class SimpleCompressor {
 | 
			
		||||
public:
 | 
			
		||||
  void Point(int) {};
 | 
			
		||||
 | 
			
		||||
  vobj operator() (vobj &arg) {
 | 
			
		||||
    return arg;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size  (4);
 | 
			
		||||
  std::vector<int> simd_layout(4);
 | 
			
		||||
  std::vector<int> mpi_layout (4);
 | 
			
		||||
  std::vector<int> simd_layout({1,1,2,2});
 | 
			
		||||
  std::vector<int> mpi_layout ({2,2,2,2});
 | 
			
		||||
  std::vector<int> latt_size  ({8,8,8,8});
 | 
			
		||||
 | 
			
		||||
  int omp=1;
 | 
			
		||||
  int lat=8;
 | 
			
		||||
 | 
			
		||||
  mpi_layout[0]=1;
 | 
			
		||||
  mpi_layout[1]=1;
 | 
			
		||||
  mpi_layout[2]=1;
 | 
			
		||||
  mpi_layout[3]=1;
 | 
			
		||||
 | 
			
		||||
    latt_size[0] = lat;
 | 
			
		||||
    latt_size[1] = lat;
 | 
			
		||||
    latt_size[2] = lat;
 | 
			
		||||
    latt_size[3] = lat;
 | 
			
		||||
  double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
    
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
    simd_layout[0] = 1;
 | 
			
		||||
    simd_layout[1] = 2;
 | 
			
		||||
    simd_layout[2] = 2;
 | 
			
		||||
    simd_layout[3] = 2;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
    simd_layout[0] = 1;
 | 
			
		||||
    simd_layout[1] = 1;
 | 
			
		||||
    simd_layout[2] = 2;
 | 
			
		||||
    simd_layout[3] = 2;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (SSE2)
 | 
			
		||||
    simd_layout[0] = 1;
 | 
			
		||||
    simd_layout[1] = 1;
 | 
			
		||||
    simd_layout[2] = 1;
 | 
			
		||||
    simd_layout[3] = 2;
 | 
			
		||||
#endif
 | 
			
		||||
    
 | 
			
		||||
  GridCartesian Fine(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridParallelRNG       fRNG(&Fine);
 | 
			
		||||
@@ -86,7 +63,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
	fflush(stdout);
 | 
			
		||||
	std::vector<vColourMatrix,alignedAllocator<vColourMatrix> >  comm_buf(myStencil._unified_buffer_size);
 | 
			
		||||
	printf("calling halo exchange\n");fflush(stdout);
 | 
			
		||||
	myStencil.HaloExchange(Foo,comm_buf);
 | 
			
		||||
	SimpleCompressor<vColourMatrix> compress;
 | 
			
		||||
	myStencil.HaloExchange(Foo,comm_buf,compress);
 | 
			
		||||
 | 
			
		||||
	Bar = Cshift(Foo,dir,disp);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										69
									
								
								tests/Grid_wilson.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								tests/Grid_wilson.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,69 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <parallelIO/GridNerscIO.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout({1,1,2,2});
 | 
			
		||||
  std::vector<int> mpi_layout ({1,1,1,1});
 | 
			
		||||
  std::vector<int> latt_size  ({8,8,8,8});
 | 
			
		||||
    
 | 
			
		||||
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);
 | 
			
		||||
  //  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src(&Grid); random(pRNG,src);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeFermion    ref(&Grid);    ref=zero;
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,&Grid);
 | 
			
		||||
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = 1.0;
 | 
			
		||||
    pokeIndex<3>(Umu,U[mu],mu);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  {
 | 
			
		||||
    int mu=0;
 | 
			
		||||
    //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
 | 
			
		||||
    ref =  src;
 | 
			
		||||
    ref = U[0]*Cshift(ref,0,1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  WilsonMatrix Dw(Umu,mass);
 | 
			
		||||
  std::cout << "Calling Dw"<<std::endl;
 | 
			
		||||
  Dw.multiply(src,result);
 | 
			
		||||
  std::cout << "Called Dw"<<std::endl;
 | 
			
		||||
  std::cout << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int ss=0;ss<10;ss++ ){
 | 
			
		||||
    for(int i=0;i<Ns;i++){
 | 
			
		||||
      for(int j=0;j<Nc;j++){
 | 
			
		||||
	ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
 | 
			
		||||
	ComplexF * res_p = (ComplexF *)&result._odata[ss]()(i)(j);
 | 
			
		||||
	std::cout << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ref = ref -result;
 | 
			
		||||
  std::cout << "norm diff   "<< norm2(ref)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_srcdir)/lib
 | 
			
		||||
#
 | 
			
		||||
# Test code
 | 
			
		||||
#
 | 
			
		||||
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma
 | 
			
		||||
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_wilson Grid_simd
 | 
			
		||||
 | 
			
		||||
Grid_main_SOURCES = Grid_main.cc
 | 
			
		||||
Grid_main_LDADD = -lGrid
 | 
			
		||||
@@ -21,3 +21,9 @@ Grid_gamma_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_stencil_SOURCES = Grid_stencil.cc
 | 
			
		||||
Grid_stencil_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_wilson_SOURCES = Grid_wilson.cc
 | 
			
		||||
Grid_wilson_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_simd_SOURCES = Grid_simd.cc
 | 
			
		||||
Grid_simd_LDADD = -lGrid
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user