mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Major rework of extract/merge/permute processing debugged and working.
This commit is contained in:
parent
9e597ac50a
commit
982274e5a0
3
Grid.h
3
Grid.h
@ -42,11 +42,10 @@
|
||||
#endif
|
||||
|
||||
|
||||
#include <Grid_aligned_allocator.h>
|
||||
#include <Grid_simd.h>
|
||||
#include <Grid_math_types.h>
|
||||
#include <Grid_Cartesian.h>
|
||||
#include <Grid_aligned_allocator.h>
|
||||
#include <Grid_aligned_allocator.h>
|
||||
#include <Grid_Lattice.h>
|
||||
#include <Grid_QCD.h>
|
||||
|
||||
|
@ -8,48 +8,6 @@ namespace Grid{
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Grid Support.
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Cartesian grid inheritance
|
||||
// Grid::GridBase
|
||||
// |
|
||||
// __________|___________
|
||||
// | |
|
||||
// Grid::GridCartesian Grid::GridCartesianRedBlack
|
||||
//
|
||||
// TODO: document the following as an API guaranteed public interface
|
||||
|
||||
/*
|
||||
* Rough map of functionality against QDP++ Layout
|
||||
*
|
||||
* Param | Grid | QDP++
|
||||
* -----------------------------------------
|
||||
* | |
|
||||
* void | oSites, iSites, lSites | sitesOnNode
|
||||
* void | gSites | vol
|
||||
* | |
|
||||
* gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP
|
||||
* lcoor | |
|
||||
*
|
||||
* void | CheckerBoarded | - // No checkerboarding in QDP
|
||||
* void | FullDimensions | lattSize
|
||||
* void | GlobalDimensions | lattSize // No checkerboarding in QDP
|
||||
* void | LocalDimensions | subgridLattSize
|
||||
* void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP
|
||||
* | |
|
||||
* int x 3 | oiSiteRankToGlobal | siteCoords
|
||||
* | ProcessorCoorLocalCoorToGlobalCoor |
|
||||
* | |
|
||||
* vector<int> | GlobalCoorToRankIndex | nodeNumber(coord)
|
||||
* vector<int> | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord)
|
||||
* | |
|
||||
* void | Processors | logicalSize // returns cart array shape
|
||||
* void | ThisRank | nodeNumber(); // returns this node rank
|
||||
* void | ThisProcessorCoor | // returns this node coor
|
||||
* void | isBoss(void) | primaryNode();
|
||||
* | |
|
||||
* | RankFromProcessorCoor | getNodeNumberFrom(logical_coord)
|
||||
* | ProcessorCoorFromRank | getLogicalCoorFrom(node)
|
||||
*/
|
||||
|
||||
class GridBase : public CartesianCommunicator {
|
||||
public:
|
||||
@ -60,7 +18,8 @@ public:
|
||||
GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
||||
|
||||
|
||||
//protected:
|
||||
//FIXME
|
||||
// protected:
|
||||
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
||||
// and one generator per site.
|
||||
// std::default_random_engine generator;
|
||||
@ -165,7 +124,16 @@ public:
|
||||
lane = lane / _simd_layout[d];
|
||||
}
|
||||
}
|
||||
|
||||
inline int PermuteDim(int dimension){
|
||||
return _simd_layout[dimension]>1;
|
||||
}
|
||||
inline int PermuteType(int dimension){
|
||||
int permute_type=0;
|
||||
for(int d=_ndimension-1;d>dimension;d--){
|
||||
if (_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
return permute_type;
|
||||
}
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Array sizing queries
|
||||
////////////////////////////////////////////////////////////////
|
||||
@ -399,8 +367,6 @@ public:
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// subplane information
|
||||
// It may be worth the investment of generating a more general subplane "iterator",
|
||||
// and providing support for threads grabbing a unit of allocation.
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
_slice_block.resize(_ndimension);
|
||||
_slice_stride.resize(_ndimension);
|
||||
|
@ -4,17 +4,9 @@
|
||||
#include "Grid.h"
|
||||
|
||||
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// Permute the pointers 32bitx16 = 512
|
||||
static int permute_map[4][16] = {
|
||||
{ 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
|
||||
{ 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
|
||||
{ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
|
||||
{ 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
|
||||
};
|
||||
|
||||
extern int GridCshiftPermuteMap[4][16];
|
||||
|
||||
template<class vobj>
|
||||
class Lattice
|
||||
@ -37,11 +29,10 @@ public:
|
||||
|
||||
#include <Grid_cshift.h>
|
||||
|
||||
// overloading Grid::conformable but no conformable in Grid ...?
|
||||
template<class obj1,class obj2>
|
||||
friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
|
||||
|
||||
// Performance difference between operator * and mult is troubling.
|
||||
// FIXME Performance difference between operator * and mult is troubling.
|
||||
// Auto move constructor seems to lose surprisingly much.
|
||||
|
||||
// Site wise binary operations
|
||||
@ -182,23 +173,20 @@ public:
|
||||
}}
|
||||
};
|
||||
|
||||
// FIXME Implement a consistent seed management strategy
|
||||
friend void gaussian(Lattice<vobj> &l){
|
||||
// Zero mean, unit variance.
|
||||
std::normal_distribution<double> distribution(0.0,1.0);
|
||||
Real *v_ptr = (Real *)&l._odata[0];
|
||||
size_t v_len = l._grid->oSites()*sizeof(vobj);
|
||||
size_t d_len = v_len/sizeof(Real);
|
||||
|
||||
// Not a parallel RNG. Could make up some seed per 4d site, seed
|
||||
// per hypercube type scheme.
|
||||
|
||||
for(int i=0;i<d_len;i++){
|
||||
v_ptr[i]= drand48();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Unary functions and Unops
|
||||
// Unary negation
|
||||
friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
|
||||
Lattice<vobj> ret(r._grid);
|
||||
#pragma omp parallel for
|
||||
|
12
Grid_QCD.h
12
Grid_QCD.h
@ -24,10 +24,7 @@ namespace QCD {
|
||||
typedef iSinglet<Real > TReal; // This is painful. Tensor singlet complex type.
|
||||
|
||||
|
||||
typedef iSinglet<vIntegerF > vTIntegerF;
|
||||
typedef iSinglet<vIntegerD > vTIntegerD;
|
||||
typedef iSinglet<vIntegerC > vTIntegerC;
|
||||
typedef iSinglet<vIntegerZ > vTIntegerZ;
|
||||
typedef iSinglet<vInteger > vTInteger;
|
||||
|
||||
typedef iSpinMatrix<Complex > SpinMatrix;
|
||||
typedef iColourMatrix<Complex > ColourMatrix;
|
||||
@ -46,12 +43,9 @@ namespace QCD {
|
||||
typedef iColourVector<vComplex > vColourVector;
|
||||
typedef iSpinColourVector<vComplex > vSpinColourVector;
|
||||
|
||||
typedef Lattice<vTComplex> LatticeComplex;
|
||||
typedef Lattice<vTComplex> LatticeComplex;
|
||||
|
||||
typedef Lattice<vTIntegerF> LatticeIntegerF; // Predicates for "where"
|
||||
typedef Lattice<vTIntegerD> LatticeIntegerD;
|
||||
typedef Lattice<vTIntegerC> LatticeIntegerC;
|
||||
typedef Lattice<vTIntegerZ> LatticeIntegerZ;
|
||||
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
|
||||
|
||||
typedef Lattice<vColourMatrix> LatticeColourMatrix;
|
||||
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
|
||||
|
@ -1,5 +1,8 @@
|
||||
#ifndef GRID_ALIGNED_ALLOCATOR_H
|
||||
#define GRID_ALIGNED_ALLOCATOR_H
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
@ -10,9 +10,6 @@
|
||||
/* AVX512 */
|
||||
/* #undef AVX512 */
|
||||
|
||||
/* GRID_COMMS_FAKE */
|
||||
/* #undef GRID_COMMS_FAKE */
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
#define GRID_COMMS_MPI 1
|
||||
|
||||
|
@ -9,9 +9,6 @@
|
||||
/* AVX512 */
|
||||
#undef AVX512
|
||||
|
||||
/* GRID_COMMS_FAKE */
|
||||
#undef GRID_COMMS_FAKE
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
#undef GRID_COMMS_MPI
|
||||
|
||||
|
@ -1,17 +1,5 @@
|
||||
#ifndef _GRID_CSHIFT_COMMON_H_
|
||||
#define _GRID_CSHIFT_COMMON_H_
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||
// additional stencil support.
|
||||
//
|
||||
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||
//
|
||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||
//
|
||||
// Grid could create a neighbour index table for a given stencil.
|
||||
// Could also implement CovariantCshift.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
@ -57,7 +45,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there *is* need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
@ -101,8 +88,6 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
@ -146,7 +131,6 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there *is* need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
@ -190,11 +174,9 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// local to node block strided copies
|
||||
//////////////////////////////////////////////////////
|
||||
// if lhs is odd, rhs even??
|
||||
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
@ -284,40 +266,6 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
|
||||
// Local to node Cshift
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
// Work out whether to permute
|
||||
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||
//
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
|
||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||
|
||||
// Suppose 4way simd in one dim.
|
||||
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||
|
||||
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||
|
||||
// Generalisation to 8 way simd, 16 way simd required.
|
||||
//
|
||||
// Need log2 Nway masks. consisting of
|
||||
// 1 bit 256 bit granule
|
||||
// 2 bit 128 bit granule
|
||||
// 4 bits 64 bit granule
|
||||
// 8 bits 32 bit granules
|
||||
//
|
||||
// 15 bits....
|
||||
|
||||
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
@ -333,35 +281,31 @@ friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_ldimensions[dimension];
|
||||
int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
GridBase *grid = rhs._grid;
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int gd = grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
int permute_dim =grid->PermuteDim(dimension);
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * rhs._grid->_ostride[dimension];
|
||||
int bo = x * grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int permute_slice=0;
|
||||
|
@ -146,10 +146,7 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
@ -236,9 +233,12 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
int toggle_bit = (Nsimd>>(permute_type+1));
|
||||
int PermuteMap;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( permute_slice ) {
|
||||
pointers[i] = rpointers[permute_map[permute_type][i]];
|
||||
PermuteMap=i^toggle_bit;
|
||||
pointers[i] = rpointers[PermuteMap];
|
||||
} else {
|
||||
pointers[i] = rpointers[i];
|
||||
}
|
||||
@ -260,8 +260,4 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -20,8 +20,8 @@ int main (int argc, char ** argv)
|
||||
|
||||
std::vector<int> mpi_layout(4);
|
||||
mpi_layout[0]=2;
|
||||
mpi_layout[1]=1;
|
||||
mpi_layout[2]=1;
|
||||
mpi_layout[1]=2;
|
||||
mpi_layout[2]=2;
|
||||
mpi_layout[3]=2;
|
||||
|
||||
#ifdef AVX512
|
||||
|
74
Grid_simd.h
74
Grid_simd.h
@ -10,32 +10,6 @@
|
||||
//
|
||||
// Vector types are arch dependent
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// TODO
|
||||
//
|
||||
// Base class to share common code between vRealF, VComplexF etc...
|
||||
//
|
||||
// lattice Broad cast assignment
|
||||
//
|
||||
// where() support
|
||||
// implement with masks, and/or? Type of the mask & boolean support?
|
||||
//
|
||||
// Unary functions
|
||||
// cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
|
||||
// exp, log, sqrt, fabs
|
||||
//
|
||||
// transposeColor, transposeSpin,
|
||||
// adjColor, adjSpin,
|
||||
// traceColor, traceSpin.
|
||||
// peekColor, peekSpin + pokeColor PokeSpin
|
||||
//
|
||||
// copyMask.
|
||||
//
|
||||
// localMaxAbs
|
||||
//
|
||||
// norm2,
|
||||
// sumMulti equivalent.
|
||||
// Fourier transform equivalent.
|
||||
//
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// SIMD Alignment controls
|
||||
@ -71,9 +45,6 @@ namespace Grid {
|
||||
typedef std::complex<RealD> ComplexD;
|
||||
typedef std::complex<Real> Complex;
|
||||
|
||||
|
||||
|
||||
|
||||
inline RealF adj(const RealF & r){ return r; }
|
||||
inline RealF conj(const RealF & r){ return r; }
|
||||
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||
@ -122,7 +93,6 @@ namespace Grid {
|
||||
template<> inline void ZeroIt(RealD &arg){ arg=0; };
|
||||
|
||||
|
||||
|
||||
#if defined (SSE2)
|
||||
typedef __m128 fvec;
|
||||
typedef __m128d dvec;
|
||||
@ -162,31 +132,46 @@ namespace Grid {
|
||||
inline void v_prefetch0(int size, const char *ptr){};
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Generic extract/merge/permute
|
||||
/////////////////////////////////////////////////////////////////
|
||||
template<class vsimd,class scalar,int Nsimd>
|
||||
template<class vsimd,class scalar>
|
||||
inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
scalar buf[Nsimd];
|
||||
vstore(y,buf);
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
#if 1
|
||||
// FIXME: bounce off stack is painful
|
||||
// temporary hack while I figure out better way.
|
||||
// There are intrinsics to do this work without the storage.
|
||||
int Nsimd = extracted.size();
|
||||
{
|
||||
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
|
||||
vstore(y,&buf[0]);
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
}
|
||||
#else
|
||||
int NSo = extracted.size();
|
||||
int NSv = vsimd::Nsimd();
|
||||
int sparse= NSv/NSo;
|
||||
for(int i=0;i<NSv;i+=sparse){
|
||||
|
||||
}
|
||||
#endif
|
||||
};
|
||||
template<class vsimd,class scalar,int Nsimd>
|
||||
template<class vsimd,class scalar>
|
||||
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
||||
scalar buf[Nsimd];
|
||||
#if 1
|
||||
int Nsimd = extracted.size();
|
||||
std::vector<scalar> buf(Nsimd);
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,buf);
|
||||
vset(y,&buf[0]);
|
||||
#else
|
||||
#endif
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
@ -197,8 +182,6 @@ inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||
// Permute 4 possible on half precision @512bit vectors.
|
||||
//////////////////////////////////////////////////////////
|
||||
// Should be able to make the permute/extract/merge independent of the
|
||||
// vector subtype and reduce the volume of code.
|
||||
template<class vsimd>
|
||||
inline void Gpermute(vsimd &y,vsimd b,int perm){
|
||||
switch (perm){
|
||||
@ -229,6 +212,7 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){
|
||||
default: assert(0); break;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
#include <Grid_vRealF.h>
|
||||
#include <Grid_vRealD.h>
|
||||
|
12
Grid_stencil.h
Normal file
12
Grid_stencil.h
Normal file
@ -0,0 +1,12 @@
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||
// additional stencil support.
|
||||
//
|
||||
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||
//
|
||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||
//
|
||||
// Grid could create a neighbour index table for a given stencil.
|
||||
// Could also implement CovariantCshift.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
@ -5,7 +5,7 @@
|
||||
|
||||
namespace Grid {
|
||||
class vComplexD {
|
||||
protected:
|
||||
public:
|
||||
zvec v;
|
||||
public:
|
||||
typedef zvec vector_type;
|
||||
@ -154,64 +154,27 @@ namespace Grid {
|
||||
return ret;
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vComplexD::Nsimd();
|
||||
std::vector<ComplexD> buf(Nsimd);
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// General permute; assumes vector length is same across
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
friend inline void permute(vComplexD &y,vComplexD b,int perm)
|
||||
{
|
||||
Gpermute<vComplexD>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||
{
|
||||
Gmerge<vComplexD,ComplexD >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||
{
|
||||
Gextract<vComplexD,ComplexD>(y,extracted);
|
||||
}
|
||||
|
||||
vstore(y,&buf[0]);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
|
||||
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vComplexD::Nsimd();
|
||||
std::vector<ComplexD> buf(Nsimd);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,&buf[0]);
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void permute(vComplexD &y,vComplexD b,int perm){
|
||||
switch (perm){
|
||||
// 2 complex=>1 permute
|
||||
#if defined(AVX1)||defined(AVX2)
|
||||
case 0: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
|
||||
// AB => BA i.e. ab cd =>cd ab
|
||||
#endif
|
||||
#ifdef SSE2
|
||||
break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// 4 complex=>2 permute
|
||||
// ABCD => BADC i.e. abcd efgh => cdab ghef
|
||||
// ABCD => CDAB i.e. abcd efgh => efgh abcd
|
||||
case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
|
||||
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); // permute for double is not implemented
|
||||
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error // Not implemented yet
|
||||
#endif
|
||||
default: assert(0); break;
|
||||
}
|
||||
};
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void vload(zvec& a){
|
||||
this->v = a;
|
||||
}
|
||||
@ -296,7 +259,7 @@ friend inline void vstore(vComplexD &ret, ComplexD *a){
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
// REDUCE
|
||||
// REDUCE FIXME must be a cleaner implementation
|
||||
friend inline ComplexD Reduce(const vComplexD & in)
|
||||
{
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
|
@ -4,7 +4,9 @@
|
||||
|
||||
namespace Grid {
|
||||
class vComplexF {
|
||||
protected:
|
||||
// protected:
|
||||
|
||||
public:
|
||||
cvec v;
|
||||
|
||||
public:
|
||||
@ -129,75 +131,11 @@ namespace Grid {
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
|
||||
// Bounce off heap is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
vComplexF vbuf;
|
||||
ComplexF *buf = (ComplexF *)&vbuf;
|
||||
|
||||
vstore(y,&buf[0]);
|
||||
for(int i=0;i<vComplexF::Nsimd();i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vComplexF::Nsimd();
|
||||
vComplexF vbuf;
|
||||
ComplexF *buf = (ComplexF *)&vbuf;
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,&buf[0]);
|
||||
};
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void permute(vComplexF &y,vComplexF b,int perm){
|
||||
switch (perm){
|
||||
#if defined(AVX1)||defined(AVX2)
|
||||
//HERE
|
||||
// 4 complex=>2 permutes
|
||||
// case 0 ABCD->BADC
|
||||
// case 1 ABCD->CDAB
|
||||
case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 1: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
|
||||
#endif
|
||||
#ifdef SSE2
|
||||
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
//#error should permute for 512
|
||||
// 8 complex=>3 permutes
|
||||
// case 0 ABCD EFGH -> BADC FEHG
|
||||
// case 1 ABCD EFGH -> CDAB GHEF
|
||||
// case 2 ABCD EFGH -> EFGH ABCD
|
||||
case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break; // OK
|
||||
case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break; // OK
|
||||
case 2: y.v = _mm512_permute4f128_ps(b.v, (_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; // OK
|
||||
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error
|
||||
#endif
|
||||
default: assert(0); break;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
friend inline void vset(vComplexF &ret, Complex *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
@ -358,6 +296,20 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend inline void permute(vComplexF &y,vComplexF b,int perm)
|
||||
{
|
||||
Gpermute<vComplexF>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||
{
|
||||
Gmerge<vComplexF,ComplexF >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||
{
|
||||
Gextract<vComplexF,ComplexF>(y,extracted);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
||||
@ -371,7 +323,5 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
|
||||
return l*r;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -235,70 +235,11 @@ friend inline void vstore(vInteger &ret, Integer *a){
|
||||
}
|
||||
friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gmerge<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
|
||||
Gmerge<vIntegerF,Integer>(y,extracted);
|
||||
}
|
||||
friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gextract<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class vIntegerD : public vInteger
|
||||
{
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);}
|
||||
|
||||
friend inline void permute(vIntegerD &y,vIntegerD b,int perm)
|
||||
{
|
||||
Gpermute<vIntegerD>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vIntegerD &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gmerge<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vIntegerD &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gextract<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class vIntegerC : public vInteger
|
||||
{
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);}
|
||||
|
||||
friend inline void permute(vIntegerC &y,vIntegerC b,int perm)
|
||||
{
|
||||
Gpermute<vIntegerC>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vIntegerC &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gmerge<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vIntegerC &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gextract<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
|
||||
}
|
||||
};
|
||||
|
||||
class vIntegerZ : public vInteger
|
||||
{
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);}
|
||||
|
||||
friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm)
|
||||
{
|
||||
Gpermute<vIntegerZ>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vIntegerZ &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gmerge<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vIntegerZ &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gextract<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
|
||||
Gextract<vIntegerF,Integer>(y,extracted);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
namespace Grid {
|
||||
class vRealD {
|
||||
protected:
|
||||
public:
|
||||
dvec v; // dvec is double precision vector
|
||||
|
||||
public:
|
||||
@ -99,72 +99,27 @@ namespace Grid {
|
||||
return ret;
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealD::Nsimd();
|
||||
RealD buf[Nsimd];
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// General permute; assumes vector length is same across
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
friend inline void permute(vRealD &y,vRealD b,int perm)
|
||||
{
|
||||
Gpermute<vRealD>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
|
||||
{
|
||||
Gmerge<vRealD,RealD >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted)
|
||||
{
|
||||
Gextract<vRealD,RealD>(y,extracted);
|
||||
}
|
||||
|
||||
vstore(y,buf);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
|
||||
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealD::Nsimd();
|
||||
RealD buf[Nsimd];
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,buf);
|
||||
};
|
||||
|
||||
|
||||
// Permute plans
|
||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
||||
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||
// Permute 4 possible on half precision @512bit vectors.
|
||||
friend inline void permute(vRealD &y,vRealD b,int perm){
|
||||
switch (perm){
|
||||
// 4 doubles=>2 permutes
|
||||
#if defined(AVX1)||defined(AVX2)
|
||||
case 0: y.v = _mm256_shuffle_pd(b.v,b.v,0x5); break;
|
||||
case 1: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
|
||||
#endif
|
||||
#ifdef SSE2
|
||||
case 0: y.v = _mm_shuffle_pd(b.v,b.v,0x1); break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// 8 double => 3 permutes
|
||||
// Permute 0 every abcd efgh -> badc fehg
|
||||
// Permute 1 every abcd efgh -> cdab ghef
|
||||
// Permute 2 every abcd efgh -> efgh abcd
|
||||
// NOTE: mm_512_permutex_pd not implemented
|
||||
// NOTE: ignore warning
|
||||
case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_CDAB); break;
|
||||
case 1: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
|
||||
case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error
|
||||
#endif
|
||||
default: assert(0);break;
|
||||
}
|
||||
};
|
||||
// gonna be bye bye
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void vload(dvec& a){
|
||||
this->v = a;
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
namespace Grid {
|
||||
class vRealF {
|
||||
protected:
|
||||
public:
|
||||
fvec v;
|
||||
|
||||
public:
|
||||
@ -120,74 +120,25 @@ namespace Grid {
|
||||
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealF::Nsimd();
|
||||
RealF buf[Nsimd];
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// General permute; assumes vector length is same across
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
friend inline void permute(vRealF &y,vRealF b,int perm)
|
||||
{
|
||||
Gpermute<vRealF>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
|
||||
{
|
||||
Gmerge<vRealF,RealF >(y,extracted);
|
||||
}
|
||||
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted)
|
||||
{
|
||||
Gextract<vRealF,RealF>(y,extracted);
|
||||
}
|
||||
|
||||
vstore(y,buf);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
|
||||
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealF::Nsimd();
|
||||
RealF buf[Nsimd];
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,buf);
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
||||
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||
// Permute 4 possible on half precision @512bit vectors.
|
||||
//////////////////////////////////////////////////////////
|
||||
friend inline void permute(vRealF &y,vRealF b,int perm){
|
||||
switch (perm){
|
||||
// 8 floats=>3 permutes
|
||||
#if defined(AVX1)||defined(AVX2)
|
||||
case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 2: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
|
||||
#endif
|
||||
#ifdef SSE2
|
||||
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// 16 floats=>4 permutes
|
||||
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
|
||||
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef klij opmn
|
||||
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
|
||||
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
|
||||
//#error not implemented should do something
|
||||
case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
|
||||
case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
|
||||
case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 3: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error not implemented
|
||||
#endif
|
||||
default: assert(0); break;
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Broadcast a value across Nsimd copies.
|
||||
@ -207,6 +158,8 @@ namespace Grid {
|
||||
ret.v = {a,a,a,a};
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
friend inline void vset(vRealF &ret, float *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
@ -224,6 +177,9 @@ namespace Grid {
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
friend inline void vstore(vRealF &ret, float *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
_mm256_store_ps(a,ret.v);
|
||||
|
@ -24,7 +24,6 @@ include_HEADERS = Grid_config.h\
|
||||
Grid_aligned_allocator.h\
|
||||
Grid_cshift.h\
|
||||
Grid_cshift_common.h\
|
||||
Grid_cshift_fake.h\
|
||||
Grid_cshift_mpi.h\
|
||||
Grid_cshift_none.h\
|
||||
Grid_math_types.h
|
||||
@ -37,13 +36,10 @@ bin_PROGRAMS = Grid_main
|
||||
|
||||
extra_sources=
|
||||
if BUILD_COMMS_MPI
|
||||
extra_sources+=Grid_mpi.cc
|
||||
endif
|
||||
if BUILD_COMMS_FAKE
|
||||
extra_sources+=Grid_fake.cc
|
||||
extra_sources+=Grid_communicator_mpi.cc
|
||||
endif
|
||||
if BUILD_COMMS_NONE
|
||||
extra_sources+=Grid_fake.cc
|
||||
extra_sources+=Grid_communicator_fake.cc
|
||||
endif
|
||||
|
||||
Grid_main_SOURCES = \
|
||||
|
29
Makefile.in
29
Makefile.in
@ -89,9 +89,8 @@ NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
bin_PROGRAMS = Grid_main$(EXEEXT)
|
||||
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
|
||||
@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
|
||||
@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
|
||||
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
|
||||
@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
|
||||
subdir = .
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
|
||||
@ -146,12 +145,13 @@ libGrid_a_LIBADD =
|
||||
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
|
||||
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
|
||||
PROGRAMS = $(bin_PROGRAMS)
|
||||
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
|
||||
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
|
||||
@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
|
||||
@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
|
||||
am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
|
||||
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
|
||||
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
|
||||
Grid_communicator_fake.cc
|
||||
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
|
||||
@BUILD_COMMS_NONE_TRUE@am__objects_2 = \
|
||||
@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT)
|
||||
am__objects_3 = $(am__objects_1) $(am__objects_2)
|
||||
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
|
||||
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
|
||||
Grid_main_DEPENDENCIES = libGrid.a
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
@ -214,8 +214,8 @@ CTAGS = ctags
|
||||
CSCOPE = cscope
|
||||
AM_RECURSIVE_TARGETS = cscope
|
||||
am__DIST_COMMON = $(srcdir)/Grid_config.h.in $(srcdir)/Makefile.in \
|
||||
AUTHORS COPYING ChangeLog INSTALL NEWS README compile depcomp \
|
||||
install-sh missing
|
||||
AUTHORS COPYING ChangeLog INSTALL NEWS README TODO compile \
|
||||
depcomp install-sh missing
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
distdir = $(PACKAGE)-$(VERSION)
|
||||
top_distdir = $(distdir)
|
||||
@ -353,12 +353,11 @@ include_HEADERS = Grid_config.h\
|
||||
Grid_aligned_allocator.h\
|
||||
Grid_cshift.h\
|
||||
Grid_cshift_common.h\
|
||||
Grid_cshift_fake.h\
|
||||
Grid_cshift_mpi.h\
|
||||
Grid_cshift_none.h\
|
||||
Grid_math_types.h
|
||||
|
||||
extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
|
||||
extra_sources = $(am__append_1) $(am__append_2)
|
||||
Grid_main_SOURCES = \
|
||||
Grid_main.cc\
|
||||
$(extra_sources)
|
||||
@ -506,10 +505,10 @@ mostlyclean-compile:
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_fake.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@
|
||||
|
||||
.cc.o:
|
||||
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
|
103
TODO
103
TODO
@ -1,4 +1,7 @@
|
||||
* FIXME audit
|
||||
* Remove vload/store etc..
|
||||
* Replace vset with a call to merge.
|
||||
* Replace vset with a call to merge.
|
||||
|
||||
* Conditional execution Subset, where etc...
|
||||
* Coordinate information, integers etc...
|
||||
@ -27,3 +30,103 @@
|
||||
- BinaryWriter, TextWriter etc...
|
||||
- protocol buffers?
|
||||
-
|
||||
// Cartesian grid inheritance
|
||||
// Grid::GridBase
|
||||
// |
|
||||
// __________|___________
|
||||
// | |
|
||||
// Grid::GridCartesian Grid::GridCartesianRedBlack
|
||||
//
|
||||
// TODO: document the following as an API guaranteed public interface
|
||||
|
||||
/*
|
||||
* Rough map of functionality against QDP++ Layout
|
||||
*
|
||||
* Param | Grid | QDP++
|
||||
* -----------------------------------------
|
||||
* | |
|
||||
* void | oSites, iSites, lSites | sitesOnNode
|
||||
* void | gSites | vol
|
||||
* | |
|
||||
* gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP
|
||||
* lcoor | |
|
||||
*
|
||||
* void | CheckerBoarded | - // No checkerboarded in QDP
|
||||
* void | FullDimensions | lattSize
|
||||
* void | GlobalDimensions | lattSize // No checkerboarded in QDP
|
||||
* void | LocalDimensions | subgridLattSize
|
||||
* void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP
|
||||
* | |
|
||||
* int x 3 | oiSiteRankToGlobal | siteCoords
|
||||
* | ProcessorCoorLocalCoorToGlobalCoor |
|
||||
* | |
|
||||
* vector<int> | GlobalCoorToRankIndex | nodeNumber(coord)
|
||||
* vector<int> | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord)
|
||||
* | |
|
||||
* void | Processors | logicalSize // returns cart array shape
|
||||
* void | ThisRank | nodeNumber(); // returns this node rank
|
||||
* void | ThisProcessorCoor | // returns this node coor
|
||||
* void | isBoss(void) | primaryNode();
|
||||
* | |
|
||||
* | RankFromProcessorCoor | getLogicalCoorFrom(node)
|
||||
* | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord)
|
||||
*/
|
||||
// Work out whether to permute
|
||||
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||
//
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
|
||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||
|
||||
// Suppose 4way simd in one dim.
|
||||
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||
|
||||
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||
|
||||
// Generalisation to 8 way simd, 16 way simd required.
|
||||
//
|
||||
// Need log2 Nway masks. consisting of
|
||||
// 1 bit 256 bit granule
|
||||
// 2 bit 128 bit granule
|
||||
// 4 bits 64 bit granule
|
||||
// 8 bits 32 bit granules
|
||||
//
|
||||
// 15 bits....
|
||||
// TODO
|
||||
//
|
||||
// Base class to share common code between vRealF, VComplexF etc...
|
||||
//
|
||||
// lattice broadcast assignment
|
||||
//
|
||||
// where() support
|
||||
// implement with masks, and/or? Type of the mask & boolean support?
|
||||
//
|
||||
// Unary functions
|
||||
// cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
|
||||
// exp, log, sqrt, fabs
|
||||
//
|
||||
// transposeColor, transposeSpin,
|
||||
// adjColor, adjSpin,
|
||||
// traceColor, traceSpin.
|
||||
// peekColor, peekSpin + pokeColor PokeSpin
|
||||
//
|
||||
// copyMask.
|
||||
//
|
||||
// localMaxAbs
|
||||
//
|
||||
// norm2,
|
||||
// sumMulti equivalent.
|
||||
// Fourier transform equivalent.
|
||||
//
|
||||
|
23
configure
vendored
23
configure
vendored
@ -628,8 +628,6 @@ LTLIBOBJS
|
||||
LIBOBJS
|
||||
BUILD_COMMS_NONE_FALSE
|
||||
BUILD_COMMS_NONE_TRUE
|
||||
BUILD_COMMS_FAKE_FALSE
|
||||
BUILD_COMMS_FAKE_TRUE
|
||||
BUILD_COMMS_MPI_FALSE
|
||||
BUILD_COMMS_MPI_TRUE
|
||||
EGREP
|
||||
@ -1369,8 +1367,7 @@ Optional Features:
|
||||
--disable-openmp do not use OpenMP
|
||||
--enable-simd=SSE|AVX|AVX2|AVX512
|
||||
Select instructions
|
||||
--enable-comms=none|fake|mpi
|
||||
Select communications
|
||||
--enable-comms=none|mpi Select communications
|
||||
|
||||
Some influential environment variables:
|
||||
CXX C++ compiler command
|
||||
@ -5051,12 +5048,6 @@ fi
|
||||
|
||||
|
||||
case ${ac_COMMS} in
|
||||
fake)
|
||||
echo Configuring for FAKE communications
|
||||
|
||||
$as_echo "#define GRID_COMMS_FAKE 1" >>confdefs.h
|
||||
|
||||
;;
|
||||
none)
|
||||
echo Configuring for NO communications
|
||||
|
||||
@ -5082,14 +5073,6 @@ else
|
||||
BUILD_COMMS_MPI_FALSE=
|
||||
fi
|
||||
|
||||
if test "X${ac_COMMS}X" == "XfakeX" ; then
|
||||
BUILD_COMMS_FAKE_TRUE=
|
||||
BUILD_COMMS_FAKE_FALSE='#'
|
||||
else
|
||||
BUILD_COMMS_FAKE_TRUE='#'
|
||||
BUILD_COMMS_FAKE_FALSE=
|
||||
fi
|
||||
|
||||
if test "X${ac_COMMS}X" == "XnoneX" ; then
|
||||
BUILD_COMMS_NONE_TRUE=
|
||||
BUILD_COMMS_NONE_FALSE='#'
|
||||
@ -5243,10 +5226,6 @@ if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
|
||||
as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
fi
|
||||
if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
|
||||
as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
fi
|
||||
if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
|
||||
as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
|
@ -51,13 +51,9 @@ case ${ac_SIMD} in
|
||||
esac
|
||||
|
||||
|
||||
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|fake|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
||||
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
||||
|
||||
case ${ac_COMMS} in
|
||||
fake)
|
||||
echo Configuring for FAKE communications
|
||||
AC_DEFINE([GRID_COMMS_FAKE],[1],[GRID_COMMS_FAKE] )
|
||||
;;
|
||||
none)
|
||||
echo Configuring for NO communications
|
||||
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
||||
@ -72,7 +68,6 @@ case ${ac_COMMS} in
|
||||
esac
|
||||
|
||||
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
|
||||
AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
|
||||
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user