mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Shaken out stencil to the point where I think wilson dslash is correct.
Need to audit code carefully, consolidate between stencil and cshift, and then benchmark and optimise.
This commit is contained in:
		@@ -101,12 +101,12 @@ namespace Grid {
 | 
			
		||||
      std::vector<Integer> vlhs(vInteger::Nsimd());   // Use functors to reduce this to single implementation
 | 
			
		||||
      std::vector<Integer> vrhs(vInteger::Nsimd());
 | 
			
		||||
      vInteger ret;
 | 
			
		||||
      extract(lhs,vlhs);
 | 
			
		||||
      extract(rhs,vrhs);
 | 
			
		||||
      extract<vInteger,Integer>(lhs,vlhs);
 | 
			
		||||
      extract<vInteger,Integer>(rhs,vrhs);
 | 
			
		||||
      for(int s=0;s<vInteger::Nsimd();s++){
 | 
			
		||||
	vlhs[s] = sop(vlhs[s],vrhs[s]);
 | 
			
		||||
      }
 | 
			
		||||
      merge(ret,vlhs);
 | 
			
		||||
      merge<vInteger,Integer>(ret,vlhs);
 | 
			
		||||
      return ret;
 | 
			
		||||
    }
 | 
			
		||||
    inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										193
									
								
								lib/Grid_extract.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										193
									
								
								lib/Grid_extract.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,193 @@
 | 
			
		||||
#ifndef GRID_EXTRACT_H
 | 
			
		||||
#define GRID_EXTRACT_H
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
// Generic extract/merge/permute
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
namespace Grid{
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Extract/merge a fundamental vector type, to pointer array with offset
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void extract(typename std::enable_if<isGridTensor<vsimd>::notvalue, const vsimd >::type * y, 
 | 
			
		||||
		    std::vector<scalar *> &extracted,int offset){
 | 
			
		||||
  // FIXME: bounce off memory is painful
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  scalar*buf = (scalar *)y;
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    extracted[i][offset] = buf[i*s];
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Merge a SIMD vector from an array of scalar pointers, read at a given offset
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void merge(typename std::enable_if<isGridTensor<vsimd>::notvalue, vsimd >::type * y, 
 | 
			
		||||
		  std::vector<scalar *> &extracted,int offset){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  scalar *buf =(scalar *) y;
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    for(int ii=0;ii<s;ii++){
 | 
			
		||||
      buf[i*s+ii]=extracted[i][offset];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Extract a fundamental vector type to scalar array 
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void extract(typename std::enable_if<isGridTensor<vsimd>::notvalue, const vsimd >::type  &y,std::vector<scalar> &extracted){
 | 
			
		||||
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  scalar *buf = (scalar *)&y;
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    for(int ii=0;ii<s;ii++){
 | 
			
		||||
      extracted[i]=buf[i*s+ii];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Merge simd vector from array of scalars
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void merge(typename std::enable_if<isGridTensor<vsimd>::notvalue, vsimd >::type  &y,std::vector<scalar> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
  scalar *buf = (scalar *)&y;
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    for(int ii=0;ii<s;ii++){
 | 
			
		||||
      buf[i*s+ii]=extracted[i];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void AmergeA(typename std::enable_if<isGridTensor<vsimd>::notvalue, vsimd >::type  &y,std::vector<scalar> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  scalar *buf = (scalar *)&y;
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    for(int ii=0;ii<s;ii++){
 | 
			
		||||
      buf[i*s+ii]=extracted[i];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Extract to contiguous array scalar object
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Unpack a SIMD object into one scalar object per SIMD lane.
//   vec       : SIMD object, treated as sizeof(vobj)/sizeof(vector_type)
//               consecutive SIMD words
//   extracted : resized to Nsimd; element l receives lane l of every word
template<class vobj> inline void extract(const vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
{
  typedef typename vobj::vector_type vector_type ;
  typedef typename vobj::scalar_type scalar_type ;

  const int lanes   = vector_type::Nsimd();
  const int n_words = sizeof(vobj)/sizeof(vector_type);

  extracted.resize(lanes);

  // View every destination object as a flat array of scalar_type.
  std::vector<scalar_type *> lane_base;
  lane_base.reserve(lanes);
  for(int l=0;l<lanes;l++){
    lane_base.push_back((scalar_type *)&extracted[l]);
  }

  // Word w of the SIMD object lands at scalar index w of each destination.
  const vector_type *word = (const vector_type *)&vec;
  for(int w=0;w<n_words;w++,word++){
    extract<vector_type,scalar_type>(word,lane_base,w);
  }
}
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Extract to a bunch of scalar object pointers, with offset
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Unpack a SIMD object into scalar objects reached through per-lane pointers.
//   vec       : SIMD object, treated as sizeof(vobj)/sizeof(vector_type)
//               consecutive SIMD words
//   extracted : one base pointer per SIMD lane; size must equal Nsimd (asserted)
//   offset    : element index applied to every base pointer to select the
//               destination scalar object
template<class vobj> inline 
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
{
  typedef typename vobj::vector_type vector_type ;
  typedef typename vobj::scalar_type scalar_type ;

  const int lanes   = vector_type::Nsimd();
  const int n_words = sizeof(vobj)/sizeof(vector_type);

  assert(extracted.size()==lanes);

  // View each selected destination object as a flat array of scalar_type.
  std::vector<scalar_type *> dest(lanes);
  for(int l=0;l<lanes;l++){
    dest[l] = (scalar_type *)(extracted[l]+offset);
  }

  // Word w of the SIMD object lands at scalar index w of each destination.
  const vector_type *words = (const vector_type *)&vec;
  for(int w=0;w<n_words;w++){
    extract<vector_type,scalar_type>(words+w,dest,w);
  }
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Merge a contiguous array of scalar objects
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Pack one scalar object per SIMD lane into a SIMD object.
//   vec       : SIMD object to fill, treated as sizeof(vobj)/sizeof(vector_type)
//               consecutive SIMD words
//   extracted : source objects; size must equal Nsimd (asserted)
template<class vobj> inline 
void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
{
  typedef typename vobj::vector_type vector_type ;
  typedef typename vobj::scalar_type scalar_type ;

  const int lanes   = vector_type::Nsimd();
  const int n_words = sizeof(vobj)/sizeof(vector_type);

  assert(extracted.size()==lanes);

  // View every source object as a flat array of scalar_type.
  std::vector<scalar_type *> lane_base;
  lane_base.reserve(lanes);
  for(int l=0;l<lanes;l++){
    lane_base.push_back((scalar_type *)&extracted[l]);
  }

  // Word w of the SIMD object is assembled from scalar index w of each source.
  vector_type *word = (vector_type *)&vec;
  for(int w=0;w<n_words;w++,word++){
    merge<vector_type,scalar_type>(word,lane_base,w);
  }
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Merge a bunch of different scalar object pointers, with offset
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Pack scalar objects reached through per-lane pointers into a SIMD object.
//   vec       : SIMD object to fill, treated as sizeof(vobj)/sizeof(vector_type)
//               consecutive SIMD words
//   extracted : one base pointer per SIMD lane; size must equal Nsimd (asserted)
//   offset    : element index applied to every base pointer to select the
//               source scalar object
template<class vobj> inline 
void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::vector_type vector_type ;
  typedef typename vobj::scalar_type scalar_type ;

  const int lanes   = vector_type::Nsimd();
  const int n_words = sizeof(vobj)/sizeof(vector_type);

  assert(extracted.size()==lanes);

  // View each selected source object as a flat array of scalar_type.
  std::vector<scalar_type *> source(lanes);
  for(int l=0;l<lanes;l++){
    source[l] = (scalar_type *)(extracted[l]+offset);
  }

  vector_type *words = (vector_type *)&vec;
  assert((void *)words!=NULL);

  // Word w of the SIMD object is assembled from scalar index w of each source.
  for(int w=0;w<n_words;w++){
    merge<vector_type,scalar_type>(words+w,source,w);
  }
}
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
@@ -95,6 +95,7 @@ public:
 | 
			
		||||
#include <lattice/Grid_lattice_reduction.h>
 | 
			
		||||
#include <lattice/Grid_lattice_peekpoke.h>
 | 
			
		||||
#include <lattice/Grid_lattice_reality.h>
 | 
			
		||||
#include <Grid_extract.h>
 | 
			
		||||
#include <lattice/Grid_lattice_coordinate.h>
 | 
			
		||||
#include <lattice/Grid_lattice_rng.h>
 | 
			
		||||
#include <lattice/Grid_lattice_transfer.h>
 | 
			
		||||
 
 | 
			
		||||
@@ -133,68 +133,6 @@ namespace Grid {
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
// Generic extract/merge/permute
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
 | 
			
		||||
  // FIXME: bounce off memory is painful
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
 | 
			
		||||
 | 
			
		||||
  vstore(y,&buf[0]);
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    *extracted[i] = buf[i*s];
 | 
			
		||||
    extracted[i]++;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
 | 
			
		||||
 | 
			
		||||
  vstore(y,&buf[0]);
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    extracted[i] = buf[i*s];
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
// Assemble a SIMD word from scalars read through a set of pointers, advancing
// each pointer by one element.
//   y         : SIMD word to set; the scratch buffer is passed to vset
//   extracted : source cursors; with N = extracted.size() the value under
//               cursor s is replicated into Nsimd/N consecutive scratch slots,
//               after which the cursor is incremented
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
  const int n_in   = extracted.size();
  const int lanes  = vsimd::Nsimd();
  const int repeat = lanes/n_in;           // slots filled per source

  std::vector<scalar> scratch(lanes); 
  int slot = 0;                            // next scratch slot to write
  for(int s=0;s<n_in;s++){
    for(int r=0;r<repeat;r++){
      scratch[slot++] = *extracted[s];
    }
    extracted[s]++;
  }
  vset(y,&scratch[0]); 
}
 | 
			
		||||
// Assemble a SIMD word from a vector of scalars.
//   y         : SIMD word to set; the scratch buffer is passed to vset
//   extracted : source values; with N = extracted.size() each value is
//               replicated into Nsimd/N consecutive scratch slots
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
  const int n_in   = extracted.size();
  const int lanes  = vsimd::Nsimd();
  const int group  = lanes/n_in;           // slots per source value
  const int filled = n_in*group;           // total slots written

  std::vector<scalar> scratch(lanes); 
  for(int slot=0;slot<filled;slot++){
    // slot/group is the index of the source value owning this slot
    scratch[slot] = extracted[slot/group];
  }
  vset(y,&scratch[0]); 
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////
 | 
			
		||||
// Permute
 | 
			
		||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
 | 
			
		||||
 
 | 
			
		||||
@@ -48,100 +48,6 @@ namespace Grid {
 | 
			
		||||
  } ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there is no need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj,class cobj,class compressor> void 
 | 
			
		||||
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	buffer[bo++]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                      // offset in buffer
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
	if ( ocb &cbmask ) {
 | 
			
		||||
	  buffer[bo]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
	  bo++;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there *is* need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class cobj,class vobj,class compressor> void 
 | 
			
		||||
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_type *> pointers,int dimension,int plane,int cbmask,compressor &compress)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	cobj temp;
 | 
			
		||||
	temp=compress(rhs._odata[so+o+b]);
 | 
			
		||||
	extract(temp,pointers);
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                      // offset in buffer
 | 
			
		||||
    
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  cobj temp; 
 | 
			
		||||
	  temp =compress(rhs._odata[so+o+b]);
 | 
			
		||||
	  extract(temp,pointers);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
@@ -184,6 +90,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
      template<class vobj,class cobj, class compressor> void 
 | 
			
		||||
	HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
 | 
			
		||||
      {
 | 
			
		||||
	std::cout<< "HaloExchange comm_buf.size()="<< u_comm_buf.size()<<" unified_buffer_size"<< _unified_buffer_size<< std::endl;
 | 
			
		||||
	// conformable(source._grid,_grid);
 | 
			
		||||
	assert(source._grid==_grid);
 | 
			
		||||
	if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
 | 
			
		||||
@@ -234,6 +141,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	std::cout<< "HaloExchange complete"<< std::endl;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      template<class vobj,class cobj, class compressor> 
 | 
			
		||||
@@ -318,6 +226,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
 | 
			
		||||
	  typedef typename cobj::vector_type vector_type;
 | 
			
		||||
	  typedef typename cobj::scalar_type scalar_type;
 | 
			
		||||
	  typedef typename cobj::scalar_object scalar_object;
 | 
			
		||||
	  
 | 
			
		||||
	  int fd = _grid->_fdimensions[dimension];
 | 
			
		||||
	  int rd = _grid->_rdimensions[dimension];
 | 
			
		||||
@@ -340,12 +249,12 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
	  int words = sizeof(cobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
	  /*   FIXME ALTERNATE BUFFER DETERMINATION */
 | 
			
		||||
	  std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) ); 
 | 
			
		||||
	  std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
 | 
			
		||||
	  int bytes = buffer_size*words*sizeof(scalar_type);
 | 
			
		||||
	  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) ); 
 | 
			
		||||
	  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
			
		||||
	  int bytes = buffer_size*sizeof(scalar_object);
 | 
			
		||||
 | 
			
		||||
	  std::vector<scalar_type *> pointers(Nsimd);  //
 | 
			
		||||
	  std::vector<scalar_type *> rpointers(Nsimd); // received pointers
 | 
			
		||||
	  std::vector<scalar_object *> pointers(Nsimd);  //
 | 
			
		||||
	  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
 | 
			
		||||
	  
 | 
			
		||||
	  ///////////////////////////////////////////
 | 
			
		||||
	  // Work out what to send where
 | 
			
		||||
@@ -357,9 +266,12 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
	  // loop over outer coord planes orthog to dim
 | 
			
		||||
	  for(int x=0;x<rd;x++){       
 | 
			
		||||
 | 
			
		||||
	    int any_offnode = ( ((x+sshift)%fd) >= rd );
 | 
			
		||||
	    std::cout<<"any_offnode ="<<any_offnode<<std::endl;
 | 
			
		||||
	    if ( any_offnode ) {
 | 
			
		||||
	      // FIXME call local permute copy if none are offnode.
 | 
			
		||||
	      for(int i=0;i<Nsimd;i++){       
 | 
			
		||||
	      pointers[i] = (scalar_type *)&send_buf_extract[i][0];
 | 
			
		||||
		pointers[i] = &send_buf_extract[i][0];
 | 
			
		||||
	      }
 | 
			
		||||
	      int sx   = (x+sshift)%rd;
 | 
			
		||||
	      
 | 
			
		||||
@@ -368,15 +280,19 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
	      std::cout<< "Gathered "<<std::endl;
 | 
			
		||||
	      for(int i=0;i<Nsimd;i++){
 | 
			
		||||
		
 | 
			
		||||
		std::vector<int> icoor;
 | 
			
		||||
		_grid->iCoorFromIindex(icoor,i);
 | 
			
		||||
 | 
			
		||||
		int inner_bit = (Nsimd>>(permute_type+1));
 | 
			
		||||
		int ic= (i&inner_bit)? 1:0;
 | 
			
		||||
		assert(ic==icoor[dimension]);
 | 
			
		||||
 | 
			
		||||
		int my_coor          = rd*ic + x;
 | 
			
		||||
		int nbr_coor         = my_coor+sshift;
 | 
			
		||||
		int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
 | 
			
		||||
 | 
			
		||||
	      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
 | 
			
		||||
	      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
 | 
			
		||||
		int nbr_lcoor= (nbr_coor%ld);
 | 
			
		||||
		int nbr_ic   = (nbr_lcoor)/rd;    // inner coord of peer
 | 
			
		||||
		int nbr_ox   = (nbr_lcoor%rd);    // outer coord of peer
 | 
			
		||||
		int nbr_lane = (i&(~inner_bit));
 | 
			
		||||
		
 | 
			
		||||
		int recv_from_rank;
 | 
			
		||||
@@ -385,6 +301,9 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
		if (nbr_ic) nbr_lane|=inner_bit;
 | 
			
		||||
		assert (sx == nbr_ox);
 | 
			
		||||
 | 
			
		||||
   std::cout<<"nbr_proc "<<nbr_proc<< " x "<<x<<" nbr_x "<<nbr_ox << " lane "<<i << " nbr_lane "<<nbr_lane
 | 
			
		||||
	    << " nbr_ic "<<nbr_ic  << " mycoor "<< my_coor<< " nbr_coor "<<nbr_coor<<std::endl;
 | 
			
		||||
		
 | 
			
		||||
		if(nbr_proc){
 | 
			
		||||
		  
 | 
			
		||||
		  std::cout<< "MPI sending "<<std::endl;
 | 
			
		||||
@@ -397,18 +316,23 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
					bytes);
 | 
			
		||||
		  std::cout<< "MPI complete "<<std::endl;
 | 
			
		||||
		  
 | 
			
		||||
		rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
 | 
			
		||||
		  rpointers[i] = &recv_buf_extract[i][0];
 | 
			
		||||
		  std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
 | 
			
		||||
		} else { 
 | 
			
		||||
		rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0];
 | 
			
		||||
		  rpointers[i] = &send_buf_extract[nbr_lane][0];
 | 
			
		||||
		  std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
 | 
			
		||||
		}
 | 
			
		||||
	      }
 | 
			
		||||
 | 
			
		||||
	      // Here we don't want to scatter, just place into a buffer.
 | 
			
		||||
	    std::cout<< "merging "<<std::endl;
 | 
			
		||||
	    for(int i=0;i<buffer_size;i++){
 | 
			
		||||
	      merge(u_comm_buf[u_comm_offset+i],rpointers);
 | 
			
		||||
	    }
 | 
			
		||||
	      std::cout<< "merging u_comm_offset "<< u_comm_offset<<" comm_buf_size" << u_comm_buf.size() <<std::endl;
 | 
			
		||||
 | 
			
		||||
	      for(int i=0;i<buffer_size;i++){
 | 
			
		||||
		assert(u_comm_offset+i<_unified_buffer_size);
 | 
			
		||||
		merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 | 
			
		||||
	      }
 | 
			
		||||
	      u_comm_offset+=buffer_size;
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
  };
 | 
			
		||||
 
 | 
			
		||||
@@ -15,6 +15,7 @@ inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj
 | 
			
		||||
  conformable(iftrue,ret);
 | 
			
		||||
 | 
			
		||||
  GridBase *grid=iftrue._grid;
 | 
			
		||||
  typedef typename vobj::scalar_object scalar_object;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  typedef typename iobj::vector_type mask_type;
 | 
			
		||||
@@ -23,27 +24,21 @@ inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj
 | 
			
		||||
  const int words = sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  std::vector<Integer> mask(Nsimd);
 | 
			
		||||
  std::vector<std::vector<scalar_type> > truevals (Nsimd,std::vector<scalar_type>(words) );
 | 
			
		||||
  std::vector<std::vector<scalar_type> > falsevals(Nsimd,std::vector<scalar_type>(words) );
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nsimd);
 | 
			
		||||
  std::vector<scalar_object> truevals (Nsimd);
 | 
			
		||||
  std::vector<scalar_object> falsevals(Nsimd);
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
  for(int ss=0;ss<iftrue._grid->oSites(); ss++){
 | 
			
		||||
 | 
			
		||||
    for(int s=0;s<Nsimd;s++) pointers[s] = & truevals[s][0];
 | 
			
		||||
    extract(iftrue._odata[ss]   ,pointers);
 | 
			
		||||
 | 
			
		||||
    for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
 | 
			
		||||
    extract(iffalse._odata[ss]  ,pointers);
 | 
			
		||||
 | 
			
		||||
    extract(TensorRemove(predicate._odata[ss]),mask);
 | 
			
		||||
    extract(iftrue._odata[ss]   ,truevals);
 | 
			
		||||
    extract(iffalse._odata[ss]  ,falsevals);
 | 
			
		||||
    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
 | 
			
		||||
 | 
			
		||||
    for(int s=0;s<Nsimd;s++){
 | 
			
		||||
      if (mask[s]) pointers[s]=&truevals[s][0];
 | 
			
		||||
      else         pointers[s]=&falsevals[s][0];
 | 
			
		||||
      if (mask[s]) falsevals[s]=truevals[s];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    merge(ret._odata[ss],pointers);
 | 
			
		||||
    merge(ret._odata[ss],falsevals);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -2,91 +2,97 @@
 | 
			
		||||
#define _GRID_CSHIFT_COMMON_H_
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
// Compressor that performs no compression: the call operator hands back a
// copy of its argument, and Point() accepts an int and does nothing.
template<class vobj>
class SimpleCompressor {
public:
  // Identity transform: the result is a copy of arg.
  vobj operator() (const vobj &arg) {
    vobj same(arg);
    return same;
  }

  // Accepts and ignores its argument.
  void Point(int) {}
};
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there is no need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj,class cobj,class compressor> void 
 | 
			
		||||
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
    cbmask = 0x3;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
  int bo  = 0;                                      // offset in buffer
 | 
			
		||||
  
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
      if ( ocb &cbmask ) {
 | 
			
		||||
	buffer[bo]=compress(rhs._odata[so+o+b]);
 | 
			
		||||
	bo++;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there *is* need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class cobj,class vobj,class compressor> void 
 | 
			
		||||
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
    cbmask = 0x3;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
  int bo  = 0;                                      // offset in buffer
 | 
			
		||||
    
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
      int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
 | 
			
		||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
      if ( ocb & cbmask ) {
 | 
			
		||||
	cobj temp; 
 | 
			
		||||
	temp =compress(rhs._odata[so+o+b]);
 | 
			
		||||
	extract<cobj>(temp,pointers,offset);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
    o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there is no need to SIMD split
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	buffer[bo++]=rhs._odata[so+o+b];
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                      // offset in buffer
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
	if ( ocb &cbmask ) {
 | 
			
		||||
	  buffer[bo]=rhs._odata[so+o+b];
 | 
			
		||||
	  bo++;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  SimpleCompressor<vobj> dontcompress;
 | 
			
		||||
  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there *is* need to SIMD split
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
 template<class vobj,class scalar_type> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
 | 
			
		||||
template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	extract(rhs._odata[so+o+b],pointers);
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                      // offset in buffer
 | 
			
		||||
    
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  extract(rhs._odata[so+o+b],pointers);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  SimpleCompressor<vobj> dontcompress;
 | 
			
		||||
  Gather_plane_extract<vobj,vobj,decltype(dontcompress)>(rhs,pointers,dimension,plane,cbmask,dontcompress);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
@@ -97,21 +103,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	rhs._odata[so+o+b]=buffer[bo++];
 | 
			
		||||
    cbmask=0x3;
 | 
			
		||||
  }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
@@ -130,31 +123,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
 | 
			
		||||
    o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// Scatter for when there *is* need to SIMD split
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
 template<class vobj,class scalar_type> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
 | 
			
		||||
 template<class vobj,class cobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<cobj *> pointers,int dimension,int plane,int cbmask)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int o   = 0;                                    // relative offset to base within plane
 | 
			
		||||
    int bo  = 0;                                    // offset in buffer
 | 
			
		||||
 | 
			
		||||
    // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	merge(rhs._odata[so+o+b],pointers);
 | 
			
		||||
    cbmask=0x3;
 | 
			
		||||
  }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int o   = 0;                                      // relative offset to base within plane
 | 
			
		||||
@@ -164,16 +143,16 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
 | 
			
		||||
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
 | 
			
		||||
      int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
			
		||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
      if ( ocb&cbmask ) {
 | 
			
		||||
	  merge(rhs._odata[so+o+b],pointers);
 | 
			
		||||
	merge(rhs._odata[so+o+b],pointers,offset);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
    o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// local to node block strided copies
 | 
			
		||||
@@ -183,21 +162,9 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int o   = 0;                                     // relative offset to base within plane
 | 
			
		||||
    int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int lo  = lplane*lhs._grid->_ostride[dimension]; // offset in buffer
 | 
			
		||||
 | 
			
		||||
  // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
 | 
			
		||||
      }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    cbmask=0x3;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  } else {
 | 
			
		||||
 | 
			
		||||
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
@@ -216,31 +183,15 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
 | 
			
		||||
    }
 | 
			
		||||
    o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
 | 
			
		||||
    int o   = 0;                                     // relative offset to base within plane
 | 
			
		||||
    int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
    int lo  = lplane*rhs._grid->_ostride[dimension]; // offset in buffer
 | 
			
		||||
 | 
			
		||||
  // Simple block stride gather of SIMD objects
 | 
			
		||||
#pragma omp parallel for collapse(2)
 | 
			
		||||
    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
 | 
			
		||||
      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
 | 
			
		||||
    cbmask=0x3;
 | 
			
		||||
  }
 | 
			
		||||
      o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  } else {
 | 
			
		||||
 | 
			
		||||
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
@@ -251,7 +202,6 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
 | 
			
		||||
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 | 
			
		||||
      
 | 
			
		||||
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
 | 
			
		||||
      if ( ocb&cbmask ) {
 | 
			
		||||
	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
 | 
			
		||||
      }
 | 
			
		||||
@@ -259,8 +209,6 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &r
 | 
			
		||||
    }
 | 
			
		||||
    o +=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -133,6 +133,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
 | 
			
		||||
  GridBase *grid=rhs._grid;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  typedef typename vobj::scalar_object scalar_object;
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
   
 | 
			
		||||
  int fd = grid->_fdimensions[dimension];
 | 
			
		||||
@@ -155,12 +156,12 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
 | 
			
		||||
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
 | 
			
		||||
  int words = sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
 | 
			
		||||
  std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
 | 
			
		||||
  std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
 | 
			
		||||
  int bytes = buffer_size*words*sizeof(scalar_type);
 | 
			
		||||
  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
			
		||||
  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
			
		||||
  int bytes = buffer_size*sizeof(scalar_object);
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nsimd);  // 
 | 
			
		||||
  std::vector<scalar_type *> rpointers(Nsimd); // received pointers
 | 
			
		||||
  std::vector<scalar_object *>  pointers(Nsimd);  // 
 | 
			
		||||
  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  // Work out what to send where
 | 
			
		||||
@@ -172,9 +173,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
 | 
			
		||||
  for(int x=0;x<rd;x++){       
 | 
			
		||||
 | 
			
		||||
    // FIXME call local permute copy if none are offnode.
 | 
			
		||||
 | 
			
		||||
    for(int i=0;i<Nsimd;i++){       
 | 
			
		||||
      pointers[i] = (scalar_type *)&send_buf_extract[i][0];
 | 
			
		||||
      pointers[i] = &send_buf_extract[i][0];
 | 
			
		||||
    }
 | 
			
		||||
    int sx   = (x+sshift)%rd;
 | 
			
		||||
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
 | 
			
		||||
@@ -208,9 +208,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &r
 | 
			
		||||
			     recv_from_rank,
 | 
			
		||||
			     bytes);
 | 
			
		||||
 | 
			
		||||
	rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
 | 
			
		||||
	rpointers[i] = &recv_buf_extract[i][0];
 | 
			
		||||
      } else { 
 | 
			
		||||
	rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0];
 | 
			
		||||
	rpointers[i] = &send_buf_extract[nbr_lane][0];
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -5,21 +5,23 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
 | 
			
		||||
    {
 | 
			
		||||
      typedef typename iobj::scalar_object scalar_object;
 | 
			
		||||
      typedef typename iobj::scalar_type scalar_type;
 | 
			
		||||
      typedef typename iobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
      GridBase *grid = l._grid;
 | 
			
		||||
      int Nsimd = grid->iSites();
 | 
			
		||||
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
      std::vector<scalar_type> mergebuf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> mergeptr(Nsimd);
 | 
			
		||||
 | 
			
		||||
      vector_type vI;
 | 
			
		||||
      for(int o=0;o<grid->oSites();o++){
 | 
			
		||||
	for(int i=0;i<grid->iSites();i++){
 | 
			
		||||
	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
 | 
			
		||||
	  mergebuf[i]=gcoor[mu];
 | 
			
		||||
	  mergeptr[i]=&mergebuf[i];
 | 
			
		||||
	  mergebuf[i]=(Integer)gcoor[mu];
 | 
			
		||||
	}
 | 
			
		||||
	merge(vI,mergeptr);
 | 
			
		||||
	AmergeA<vector_type,scalar_type>(vI,mergebuf);
 | 
			
		||||
	l._odata[o]=vI;
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 
 | 
			
		||||
@@ -94,15 +94,12 @@ namespace Grid {
 | 
			
		||||
      grid->Broadcast(grid->BossRank(),s);
 | 
			
		||||
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
 | 
			
		||||
      // extract-modify-merge cycle is easiest way and this is not perf critical
 | 
			
		||||
      if ( rank == grid->ThisRank() ) {
 | 
			
		||||
	for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
	extract(l._odata[odx],pointers);
 | 
			
		||||
	extract(l._odata[odx],buf);
 | 
			
		||||
	buf[idx] = s;
 | 
			
		||||
	for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
	merge(l._odata[odx],pointers);
 | 
			
		||||
	merge(l._odata[odx],buf);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      return;
 | 
			
		||||
@@ -127,13 +124,12 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      int rank,odx,idx;
 | 
			
		||||
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
 | 
			
		||||
      extract(l._odata[odx],pointers);
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      extract(l._odata[odx],buf);
 | 
			
		||||
 | 
			
		||||
      s = buf[idx];
 | 
			
		||||
 | 
			
		||||
      grid->Broadcast(rank,s);
 | 
			
		||||
 | 
			
		||||
      return;
 | 
			
		||||
@@ -160,10 +156,8 @@ namespace Grid {
 | 
			
		||||
      odx= grid->oIndex(site);
 | 
			
		||||
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
 | 
			
		||||
      extract(l._odata[odx],pointers);
 | 
			
		||||
      extract(l._odata[odx],buf);
 | 
			
		||||
      
 | 
			
		||||
      s = buf[idx];
 | 
			
		||||
 | 
			
		||||
@@ -188,16 +182,13 @@ namespace Grid {
 | 
			
		||||
      odx= grid->oIndex(site);
 | 
			
		||||
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
 | 
			
		||||
      // extract-modify-merge cycle is easiest way and this is not perf critical
 | 
			
		||||
      extract(l._odata[odx],pointers);
 | 
			
		||||
      extract(l._odata[odx],buf);
 | 
			
		||||
      
 | 
			
		||||
      buf[idx] = s;
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
      merge(l._odata[odx],pointers);
 | 
			
		||||
      merge(l._odata[odx],buf);
 | 
			
		||||
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
 
 | 
			
		||||
@@ -66,9 +66,7 @@ namespace Grid {
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      std::vector<sobj>               buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
      extract(vsum,pointers);
 | 
			
		||||
      extract(vsum,buf);
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -27,6 +27,19 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // real scalars are one component
 | 
			
		||||
  template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
 | 
			
		||||
  {
 | 
			
		||||
    s=dist(gen);
 | 
			
		||||
  }
 | 
			
		||||
  template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
 | 
			
		||||
  {
 | 
			
		||||
    s=ComplexF(dist(gen),dist(gen));
 | 
			
		||||
  }
 | 
			
		||||
  template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
 | 
			
		||||
  {
 | 
			
		||||
    s=ComplexD(dist(gen),dist(gen));
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  class GridRNGbase {
 | 
			
		||||
 | 
			
		||||
@@ -64,20 +77,6 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // real scalars are one component
 | 
			
		||||
    template<class scalar,class distribution> void fillScalar(scalar &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=dist(_generators[0]);
 | 
			
		||||
    }
 | 
			
		||||
    template<class distribution> void fillScalar(ComplexF &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=ComplexF(dist(_generators[0]),dist(_generators[0]));
 | 
			
		||||
    }
 | 
			
		||||
    template<class distribution> void fillScalar(ComplexD &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=ComplexD(dist(_generators[0]),dist(_generators[0]));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template <class sobj,class distribution> inline void fill(sobj &l,distribution &dist){
 | 
			
		||||
 | 
			
		||||
@@ -88,7 +87,7 @@ namespace Grid {
 | 
			
		||||
      scalar_type *buf = (scalar_type *) & l;
 | 
			
		||||
 | 
			
		||||
      for(int idx=0;idx<words;idx++){
 | 
			
		||||
	fillScalar(buf[idx],dist);
 | 
			
		||||
	fillScalar(buf[idx],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
@@ -96,47 +95,47 @@ namespace Grid {
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template <class distribution>  inline void fill(ComplexF &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(ComplexD &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(RealF &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(RealD &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    // vector fill
 | 
			
		||||
    template <class distribution>  inline void fill(vComplexF &l,distribution &dist){
 | 
			
		||||
      RealF *pointer=(RealF *)&l;
 | 
			
		||||
      for(int i=0;i<2*vComplexF::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vComplexD &l,distribution &dist){
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      for(int i=0;i<2*vComplexD::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vRealF &l,distribution &dist){
 | 
			
		||||
      RealF *pointer=(RealF *)&l;
 | 
			
		||||
      for(int i=0;i<vRealF::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vRealD &l,distribution &dist){
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
@@ -187,18 +186,31 @@ namespace Grid {
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
 | 
			
		||||
      for(int gidx=0;gidx<_grid->_gsites;gidx++){
 | 
			
		||||
      int gsites = _grid->_gsites;
 | 
			
		||||
 | 
			
		||||
      typename source::result_type init = src();
 | 
			
		||||
      std::ranlux48 pseeder(init);
 | 
			
		||||
      std::uniform_int_distribution<uint64_t> ui;
 | 
			
		||||
 | 
			
		||||
      for(int gidx=0;gidx<gsites;gidx++){
 | 
			
		||||
 | 
			
		||||
	int rank,o_idx,i_idx;
 | 
			
		||||
	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 | 
			
		||||
	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 | 
			
		||||
 | 
			
		||||
	int l_idx=generator_idx(o_idx,i_idx);
 | 
			
		||||
	
 | 
			
		||||
	typename source::result_type init = src();
 | 
			
		||||
	std::vector<int> site_seeds(4);
 | 
			
		||||
	for(int i=0;i<4;i++){
 | 
			
		||||
	  site_seeds[i]= ui(pseeder);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	_grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size());
 | 
			
		||||
 | 
			
		||||
	_grid->Broadcast(0,(void *)&init,sizeof(init));
 | 
			
		||||
	if( rank == _grid->ThisRank() ){
 | 
			
		||||
	  _generators[l_idx] = std::ranlux48(init);
 | 
			
		||||
	  fixedSeed ssrc(site_seeds);
 | 
			
		||||
	  typename source::result_type sinit = ssrc();
 | 
			
		||||
	  _generators[l_idx] = std::ranlux48(sinit);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      _seeded=1;
 | 
			
		||||
@@ -210,6 +222,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,distribution &dist){
 | 
			
		||||
 | 
			
		||||
      typedef typename vobj::scalar_object scalar_object;
 | 
			
		||||
      typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
      typedef typename vobj::vector_type vector_type;
 | 
			
		||||
      
 | 
			
		||||
@@ -217,25 +230,22 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      int     Nsimd =_grid->Nsimd();
 | 
			
		||||
      int     osites=_grid->oSites();
 | 
			
		||||
      int words=sizeof(scalar_object)/sizeof(scalar_type);
 | 
			
		||||
 | 
			
		||||
      int words = sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
      std::vector<std::vector<scalar_type> > buf(Nsimd,std::vector<scalar_type>(words));
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      std::vector<scalar_object> buf(Nsimd);
 | 
			
		||||
      
 | 
			
		||||
      for(int ss=0;ss<osites;ss++){
 | 
			
		||||
 | 
			
		||||
	for(int si=0;si<Nsimd;si++){
 | 
			
		||||
 | 
			
		||||
	  int gdx = generator_idx(ss,si); // index of generator state
 | 
			
		||||
 | 
			
		||||
	  pointers[si] = (scalar_type *)&buf[si][0];
 | 
			
		||||
	  scalar_type *pointer = (scalar_type *)&buf[si];
 | 
			
		||||
	  for(int idx=0;idx<words;idx++){
 | 
			
		||||
	    pointers[si][idx] = dist(_generators[gdx]);
 | 
			
		||||
	    fillScalar(pointer[idx],dist,_generators[gdx]);
 | 
			
		||||
	  }
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
	// merge into SIMD lanes
 | 
			
		||||
	merge(l._odata[ss],pointers);
 | 
			
		||||
	merge(l._odata[ss],buf);
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -57,12 +57,6 @@ public:
 | 
			
		||||
    friend void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
 | 
			
		||||
      permute(out._internal,in._internal,permutetype);
 | 
			
		||||
    }
 | 
			
		||||
    friend void extract(const iScalar<vtype> &in,std::vector<scalar_type *> &out){
 | 
			
		||||
      extract(in._internal,out); // extract advances the pointers in out
 | 
			
		||||
    }
 | 
			
		||||
    friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
 | 
			
		||||
      merge(in._internal,out); // extract advances the pointers in out
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Unary negation
 | 
			
		||||
    friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
 | 
			
		||||
@@ -149,16 +143,6 @@ public:
 | 
			
		||||
	permute(out._internal[i],in._internal[i],permutetype);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    friend void extract(const iVector<vtype,N> &in,std::vector<scalar_type *> &out){
 | 
			
		||||
      for(int i=0;i<N;i++){
 | 
			
		||||
	extract(in._internal[i],out);// extract advances pointers in out
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
 | 
			
		||||
      for(int i=0;i<N;i++){
 | 
			
		||||
	merge(in._internal[i],out);// extract advances pointers in out
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    // Unary negation
 | 
			
		||||
    friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
 | 
			
		||||
        iVector<vtype,N> ret;
 | 
			
		||||
@@ -232,18 +216,6 @@ public:
 | 
			
		||||
	permute(out._internal[i][j],in._internal[i][j],permutetype);
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
  friend void extract(const iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      for(int j=0;j<N;j++){
 | 
			
		||||
	extract(in._internal[i][j],out);// extract advances pointers in out
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
  friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      for(int j=0;j<N;j++){
 | 
			
		||||
	merge(in._internal[i][j],out);// extract advances pointers in out
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
  // Unary negation
 | 
			
		||||
  friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
 | 
			
		||||
    iMatrix<vtype,N> ret;
 | 
			
		||||
@@ -285,37 +257,6 @@ public:
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline 
 | 
			
		||||
void extract(const vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
 | 
			
		||||
  int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  
 | 
			
		||||
  extracted.resize(Nsimd);
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nsimd);
 | 
			
		||||
  for(int i=0;i<Nsimd;i++) 
 | 
			
		||||
    pointers[i] =(scalar_type *)& extracted[i];
 | 
			
		||||
  
 | 
			
		||||
  extract(vec,pointers);
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> inline 
 | 
			
		||||
void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  assert(extracted.size()==Nsimd);
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nsimd);
 | 
			
		||||
  for(int i=0;i<Nsimd;i++) 
 | 
			
		||||
    pointers[i] =(scalar_type *)& extracted[i];
 | 
			
		||||
  
 | 
			
		||||
  merge(vec,pointers);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -64,6 +64,14 @@ namespace Grid {
 | 
			
		||||
    typedef ComplexD scalar_object;
 | 
			
		||||
    enum { TensorLevel = 0 };
 | 
			
		||||
  };
 | 
			
		||||
  template<> class GridTypeMapper<Integer> {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef Integer scalar_type;
 | 
			
		||||
    typedef Integer vector_type;
 | 
			
		||||
    typedef Integer tensor_reduced;
 | 
			
		||||
    typedef Integer scalar_object;
 | 
			
		||||
    enum { TensorLevel = 0 };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template<> class GridTypeMapper<vRealF> {
 | 
			
		||||
  public:
 | 
			
		||||
 
 | 
			
		||||
@@ -99,23 +99,29 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
 | 
			
		||||
    int offset,local;
 | 
			
		||||
    int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
    vSpinColourVector result;
 | 
			
		||||
    vHalfSpinColourVector  chi;    
 | 
			
		||||
    vHalfSpinColourVector  tmp;    
 | 
			
		||||
    vHalfSpinColourVector Uchi;
 | 
			
		||||
    vHalfSpinColourVector *chi_p;
 | 
			
		||||
 | 
			
		||||
    result=zero;
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = Stencil._offsets [Xp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Xp];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
@@ -123,10 +129,16 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = Stencil._offsets [Yp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Yp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Yp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Yp];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Yp)),&(*chi_p)());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
@@ -134,10 +146,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = Stencil._offsets [Zp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zp];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)());
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
@@ -145,34 +164,54 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = Stencil._offsets [Tp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tp];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = Stencil._offsets [Xm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Xm];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    std::cout<<"Xm for site  "<<ss<<" l "<<local<<" p "<<perm<<" chi "<<Reduce(TensorRemove(innerProduct(*chi_p,*chi_p)))<<std::endl;
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
#if 0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = Stencil._offsets [Ym][ss];
 | 
			
		||||
    local  = Stencil._is_local[Ym][ss];
 | 
			
		||||
    perm   = Stencil._permute[Ym][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Ym];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Ym)),&(*chi_p)());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
@@ -180,10 +219,17 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = Stencil._offsets [Zm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zm];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Zm)),&(*chi_p)());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
@@ -191,14 +237,21 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = Stencil._offsets [Tm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tm];
 | 
			
		||||
    chi_p  = &comm_buf[offset];
 | 
			
		||||
 | 
			
		||||
    if ( local ) {
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
      chi_p = &chi;
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
      if ( perm ) {
 | 
			
		||||
	permute(tmp,chi,ptype);
 | 
			
		||||
	chi_p = &tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    out._odata[ss] = result;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -163,6 +163,7 @@ namespace Grid {
 | 
			
		||||
	// all subtypes; may not be a good assumption, but could
 | 
			
		||||
	// add the vector width as a template param for BG/Q for example
 | 
			
		||||
	////////////////////////////////////////////////////////////////////
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void permute(vComplexD &y,vComplexD b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vComplexD>(y,b,perm);
 | 
			
		||||
@@ -183,6 +184,7 @@ namespace Grid {
 | 
			
		||||
	{
 | 
			
		||||
	  Gextract<vComplexD,ComplexD>(y,extracted);
 | 
			
		||||
	}
 | 
			
		||||
	*/
 | 
			
		||||
 | 
			
		||||
        ///////////////////////
 | 
			
		||||
        // Splat
 | 
			
		||||
 
 | 
			
		||||
@@ -412,6 +412,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
      {
 | 
			
		||||
	Gpermute<vComplexF>(y,b,perm);
 | 
			
		||||
      }
 | 
			
		||||
      /*
 | 
			
		||||
      friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
 | 
			
		||||
      {
 | 
			
		||||
	Gmerge<vComplexF,ComplexF >(y,extracted);
 | 
			
		||||
@@ -428,7 +429,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
      {
 | 
			
		||||
	Gextract<vComplexF,ComplexF>(y,extracted);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      */
 | 
			
		||||
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -221,6 +221,7 @@ namespace Grid {
 | 
			
		||||
      {
 | 
			
		||||
	Gpermute<vInteger>(y,b,perm);
 | 
			
		||||
      }
 | 
			
		||||
      /*
 | 
			
		||||
      friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
 | 
			
		||||
      {
 | 
			
		||||
	Gmerge<vInteger,Integer>(y,extracted);
 | 
			
		||||
@@ -237,7 +238,7 @@ namespace Grid {
 | 
			
		||||
      {
 | 
			
		||||
	Gextract<vInteger,Integer>(y,extracted);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      */
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
        static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
 | 
			
		||||
 
 | 
			
		||||
@@ -105,6 +105,7 @@ namespace Grid {
 | 
			
		||||
	// all subtypes; may not be a good assumption, but could
 | 
			
		||||
	// add the vector width as a template param for BG/Q for example
 | 
			
		||||
	////////////////////////////////////////////////////////////////////
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void permute(vRealD &y,vRealD b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vRealD>(y,b,perm);
 | 
			
		||||
@@ -125,7 +126,7 @@ namespace Grid {
 | 
			
		||||
	{
 | 
			
		||||
	  Gextract<vRealD,RealD>(y,extracted);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	*/
 | 
			
		||||
        
 | 
			
		||||
        friend inline void vsplat(vRealD &ret,double a){
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
 
 | 
			
		||||
@@ -127,6 +127,7 @@ namespace Grid {
 | 
			
		||||
	// all subtypes; may not be a good assumption, but could
 | 
			
		||||
	// add the vector width as a template param for BG/Q for example
 | 
			
		||||
	////////////////////////////////////////////////////////////////////
 | 
			
		||||
	/*
 | 
			
		||||
	friend inline void permute(vRealF &y,vRealF b,int perm)
 | 
			
		||||
	{
 | 
			
		||||
	  Gpermute<vRealF>(y,b,perm);
 | 
			
		||||
@@ -147,7 +148,7 @@ namespace Grid {
 | 
			
		||||
	{
 | 
			
		||||
	  Gextract<vRealF,RealF>(y,extracted);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	*/
 | 
			
		||||
 | 
			
		||||
        
 | 
			
		||||
        /////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -118,6 +118,7 @@ namespace Grid {
 | 
			
		||||
      
 | 
			
		||||
      int fd              = _grid->_fdimensions[dimension];
 | 
			
		||||
      int rd              = _grid->_rdimensions[dimension];
 | 
			
		||||
      int pd              = _grid->_processors[dimension];
 | 
			
		||||
      int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
      int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
      
 | 
			
		||||
@@ -136,9 +137,9 @@ namespace Grid {
 | 
			
		||||
      
 | 
			
		||||
      for(int x=0;x<rd;x++){       
 | 
			
		||||
	
 | 
			
		||||
	int offnode = ( x+sshift >= rd );
 | 
			
		||||
	int comm_proc = ((x+sshift)/rd)%pd;
 | 
			
		||||
	int offnode =   (comm_proc!=0);
 | 
			
		||||
	int sx        = (x+sshift)%rd;
 | 
			
		||||
	int comm_proc = (x+sshift)/rd;
 | 
			
		||||
 | 
			
		||||
	if (!offnode) {
 | 
			
		||||
	  
 | 
			
		||||
 
 | 
			
		||||
@@ -75,9 +75,9 @@ void Tester(const functor &func)
 | 
			
		||||
    random(sRNG,result[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Gmerge(v_input1,input1);
 | 
			
		||||
  Gmerge(v_input2,input2);
 | 
			
		||||
  Gmerge(v_result,result);
 | 
			
		||||
  merge<vec,scal>(v_input1,input1);
 | 
			
		||||
  merge<vec,scal>(v_input2,input2);
 | 
			
		||||
  merge<vec,scal>(v_result,result);
 | 
			
		||||
 | 
			
		||||
  func(v_result,v_input1,v_input2);
 | 
			
		||||
 | 
			
		||||
@@ -85,7 +85,7 @@ void Tester(const functor &func)
 | 
			
		||||
    func(reference[i],input1[i],input2[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Gextract(v_result,result);
 | 
			
		||||
  extract<vec,scal>(v_result,result);
 | 
			
		||||
  std::cout << " " << func.name()<<std::endl;
 | 
			
		||||
 | 
			
		||||
  int ok=0;
 | 
			
		||||
 
 | 
			
		||||
@@ -4,21 +4,12 @@ using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
class SimpleCompressor {
 | 
			
		||||
public:
 | 
			
		||||
  void Point(int) {};
 | 
			
		||||
 | 
			
		||||
  vobj operator() (const vobj &arg) {
 | 
			
		||||
    return arg;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout({1,1,2,2});
 | 
			
		||||
  std::vector<int> mpi_layout ({2,2,2,2});
 | 
			
		||||
  std::vector<int> mpi_layout ({2,2,1,2});
 | 
			
		||||
  std::vector<int> latt_size  ({8,8,8,8});
 | 
			
		||||
 | 
			
		||||
  double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
@@ -26,7 +17,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridCartesian Fine(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridParallelRNG       fRNG(&Fine);
 | 
			
		||||
  fRNG.SeedRandomDevice();
 | 
			
		||||
  //  fRNG.SeedRandomDevice();
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  fRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  
 | 
			
		||||
  LatticeColourMatrix Foo(&Fine);
 | 
			
		||||
  LatticeColourMatrix Bar(&Fine);
 | 
			
		||||
@@ -38,8 +31,9 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    for(int dir=0;dir<4;dir++){
 | 
			
		||||
      for(int disp=0;disp<Fine._rdimensions[dir];disp++){
 | 
			
		||||
      for(int disp=0;disp<Fine._fdimensions[dir];disp++){
 | 
			
		||||
 | 
			
		||||
	std::cout << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
 | 
			
		||||
	// start to test the Cartesian npoint stencil infrastructure
 | 
			
		||||
	int npoint=1;
 | 
			
		||||
	std::vector<int> directions(npoint,dir);
 | 
			
		||||
@@ -47,22 +41,13 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
	CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
 | 
			
		||||
 | 
			
		||||
	printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
 | 
			
		||||
	std::vector<int> ocoor(4);
 | 
			
		||||
	for(int o=0;o<Fine.oSites();o++){
 | 
			
		||||
	  Fine.oCoorFromOindex(ocoor,o);
 | 
			
		||||
	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
			
		||||
	  int nbr = Fine.oIndexReduced(ocoor);
 | 
			
		||||
	  int stcl= myStencil._offsets[0][o];
 | 
			
		||||
	  if(nbr!=stcl){
 | 
			
		||||
	    printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	printf("allocating %d buffers\n",myStencil._unified_buffer_size);
 | 
			
		||||
	fflush(stdout);
 | 
			
		||||
	std::vector<vColourMatrix,alignedAllocator<vColourMatrix> >  comm_buf(myStencil._unified_buffer_size);
 | 
			
		||||
	printf("calling halo exchange\n");fflush(stdout);
 | 
			
		||||
	SimpleCompressor<vColourMatrix> compress;
 | 
			
		||||
	myStencil.HaloExchange(Foo,comm_buf,compress);
 | 
			
		||||
 | 
			
		||||
@@ -81,14 +66,12 @@ int main (int argc, char ** argv)
 | 
			
		||||
	    Check._odata[i] = Foo._odata[offset];
 | 
			
		||||
	  else 
 | 
			
		||||
	    Check._odata[i] = comm_buf[offset];
 | 
			
		||||
	  
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	Real nrmC = norm2(Check);
 | 
			
		||||
	Real nrmB = norm2(Bar);
 | 
			
		||||
	Real nrm  = norm2(Check-Bar);
 | 
			
		||||
	printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout);
 | 
			
		||||
	std::cout<<"N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
 | 
			
		||||
 | 
			
		||||
	Real snrmC =0;
 | 
			
		||||
	Real snrmB =0;
 | 
			
		||||
@@ -110,10 +93,11 @@ int main (int argc, char ** argv)
 | 
			
		||||
            diff =check()()(r,c)-bar()()(r,c);
 | 
			
		||||
            double nn=real(conj(diff)*diff);
 | 
			
		||||
            if ( nn > 0){
 | 
			
		||||
	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
 | 
			
		||||
	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
 | 
			
		||||
		     coor[0],coor[1],coor[2],coor[3],r,c,
 | 
			
		||||
		     nn,
 | 
			
		||||
		     real(check()()(r,c)),
 | 
			
		||||
		     imag(check()()(r,c)),
 | 
			
		||||
		     real(bar()()(r,c))
 | 
			
		||||
		     );
 | 
			
		||||
	    }
 | 
			
		||||
@@ -124,7 +108,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
	 
 | 
			
		||||
	}}}}
 | 
			
		||||
 | 
			
		||||
	printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout);
 | 
			
		||||
	std::cout<<"scalar N2diff = "<<snrm<<" " <<snrmC<<" "<<snrmB<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);
 | 
			
		||||
  //  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  //  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
@@ -44,7 +45,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
    U[mu] = peekIndex<3>(Umu,mu);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> mask({0,0,0,0,1,0,0,0});
 | 
			
		||||
  std::vector<int> mask({1,1,1,1,1,1,1,1});
 | 
			
		||||
  { // Naive wilson implementation
 | 
			
		||||
    ref = zero;
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user