mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Major rework of extract/merge/permute processing debugged and working.
This commit is contained in:
parent
9e597ac50a
commit
982274e5a0
3
Grid.h
3
Grid.h
@ -42,11 +42,10 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#include <Grid_aligned_allocator.h>
|
||||||
#include <Grid_simd.h>
|
#include <Grid_simd.h>
|
||||||
#include <Grid_math_types.h>
|
#include <Grid_math_types.h>
|
||||||
#include <Grid_Cartesian.h>
|
#include <Grid_Cartesian.h>
|
||||||
#include <Grid_aligned_allocator.h>
|
|
||||||
#include <Grid_aligned_allocator.h>
|
|
||||||
#include <Grid_Lattice.h>
|
#include <Grid_Lattice.h>
|
||||||
#include <Grid_QCD.h>
|
#include <Grid_QCD.h>
|
||||||
|
|
||||||
|
@ -8,48 +8,6 @@ namespace Grid{
|
|||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Grid Support.
|
// Grid Support.
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
//
|
|
||||||
// Cartesian grid inheritance
|
|
||||||
// Grid::GridBase
|
|
||||||
// |
|
|
||||||
// __________|___________
|
|
||||||
// | |
|
|
||||||
// Grid::GridCartesian Grid::GridCartesianRedBlack
|
|
||||||
//
|
|
||||||
// TODO: document the following as an API guaranteed public interface
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Rough map of functionality against QDP++ Layout
|
|
||||||
*
|
|
||||||
* Param | Grid | QDP++
|
|
||||||
* -----------------------------------------
|
|
||||||
* | |
|
|
||||||
* void | oSites, iSites, lSites | sitesOnNode
|
|
||||||
* void | gSites | vol
|
|
||||||
* | |
|
|
||||||
* gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP
|
|
||||||
* lcoor | |
|
|
||||||
*
|
|
||||||
* void | CheckerBoarded | - // No checkerboarded in QDP
|
|
||||||
* void | FullDimensions | lattSize
|
|
||||||
* void | GlobalDimensions | lattSize // No checkerboarded in QDP
|
|
||||||
* void | LocalDimensions | subgridLattSize
|
|
||||||
* void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP
|
|
||||||
* | |
|
|
||||||
* int x 3 | oiSiteRankToGlobal | siteCoords
|
|
||||||
* | ProcessorCoorLocalCoorToGlobalCoor |
|
|
||||||
* | |
|
|
||||||
* vector<int> | GlobalCoorToRankIndex | nodeNumber(coord)
|
|
||||||
* vector<int> | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord)
|
|
||||||
* | |
|
|
||||||
* void | Processors | logicalSize // returns cart array shape
|
|
||||||
* void | ThisRank | nodeNumber(); // returns this node rank
|
|
||||||
* void | ThisProcessorCoor | // returns this node coor
|
|
||||||
* void | isBoss(void) | primaryNode();
|
|
||||||
* | |
|
|
||||||
* | RankFromProcessorCoor | getLogicalCoorFrom(node)
|
|
||||||
* | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord)
|
|
||||||
*/
|
|
||||||
|
|
||||||
class GridBase : public CartesianCommunicator {
|
class GridBase : public CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
@ -60,7 +18,8 @@ public:
|
|||||||
GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
||||||
|
|
||||||
|
|
||||||
//protected:
|
//FIXME
|
||||||
|
// protected:
|
||||||
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
||||||
// and one generator per site.
|
// and one generator per site.
|
||||||
// std::default_random_engine generator;
|
// std::default_random_engine generator;
|
||||||
@ -165,7 +124,16 @@ public:
|
|||||||
lane = lane / _simd_layout[d];
|
lane = lane / _simd_layout[d];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
inline int PermuteDim(int dimension){
|
||||||
|
return _simd_layout[dimension]>1;
|
||||||
|
}
|
||||||
|
inline int PermuteType(int dimension){
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=_ndimension-1;d>dimension;d--){
|
||||||
|
if (_simd_layout[d]>1 ) permute_type++;
|
||||||
|
}
|
||||||
|
return permute_type;
|
||||||
|
}
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Array sizing queries
|
// Array sizing queries
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -399,8 +367,6 @@ public:
|
|||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// subplane information
|
// subplane information
|
||||||
// It may be worth the investment of generating a more general subplane "iterator",
|
|
||||||
// and providing support for threads grabbing a unit of allocation.
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
_slice_block.resize(_ndimension);
|
_slice_block.resize(_ndimension);
|
||||||
_slice_stride.resize(_ndimension);
|
_slice_stride.resize(_ndimension);
|
||||||
|
@ -4,17 +4,9 @@
|
|||||||
#include "Grid.h"
|
#include "Grid.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
// Permute the pointers 32bitx16 = 512
|
extern int GridCshiftPermuteMap[4][16];
|
||||||
static int permute_map[4][16] = {
|
|
||||||
{ 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
|
|
||||||
{ 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
|
|
||||||
{ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
|
|
||||||
{ 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
class Lattice
|
class Lattice
|
||||||
@ -37,11 +29,10 @@ public:
|
|||||||
|
|
||||||
#include <Grid_cshift.h>
|
#include <Grid_cshift.h>
|
||||||
|
|
||||||
// overloading Grid::conformable but no conformable in Grid ...?
|
|
||||||
template<class obj1,class obj2>
|
template<class obj1,class obj2>
|
||||||
friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
|
friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
|
||||||
|
|
||||||
// Performance difference between operator * and mult is troubling.
|
// FIXME Performance difference between operator * and mult is troubling.
|
||||||
// Auto move constructor seems to lose surprisingly much.
|
// Auto move constructor seems to lose surprisingly much.
|
||||||
|
|
||||||
// Site wise binary operations
|
// Site wise binary operations
|
||||||
@ -182,23 +173,20 @@ public:
|
|||||||
}}
|
}}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// FIXME Implement a consistent seed management strategy
|
||||||
friend void gaussian(Lattice<vobj> &l){
|
friend void gaussian(Lattice<vobj> &l){
|
||||||
// Zero mean, unit variance.
|
// Zero mean, unit variance.
|
||||||
std::normal_distribution<double> distribution(0.0,1.0);
|
std::normal_distribution<double> distribution(0.0,1.0);
|
||||||
Real *v_ptr = (Real *)&l._odata[0];
|
Real *v_ptr = (Real *)&l._odata[0];
|
||||||
size_t v_len = l._grid->oSites()*sizeof(vobj);
|
size_t v_len = l._grid->oSites()*sizeof(vobj);
|
||||||
size_t d_len = v_len/sizeof(Real);
|
size_t d_len = v_len/sizeof(Real);
|
||||||
|
|
||||||
// Not a parallel RNG. Could make up some seed per 4d site, seed
|
|
||||||
// per hypercube type scheme.
|
|
||||||
for(int i=0;i<d_len;i++){
|
for(int i=0;i<d_len;i++){
|
||||||
v_ptr[i]= drand48();
|
v_ptr[i]= drand48();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// Unary functions and Unops
|
// Unary functions and Unops
|
||||||
// Unary negation
|
|
||||||
friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
|
friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
|
||||||
Lattice<vobj> ret(r._grid);
|
Lattice<vobj> ret(r._grid);
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
|
12
Grid_QCD.h
12
Grid_QCD.h
@ -24,10 +24,7 @@ namespace QCD {
|
|||||||
typedef iSinglet<Real > TReal; // This is painful. Tensor singlet complex type.
|
typedef iSinglet<Real > TReal; // This is painful. Tensor singlet complex type.
|
||||||
|
|
||||||
|
|
||||||
typedef iSinglet<vIntegerF > vTIntegerF;
|
typedef iSinglet<vInteger > vTInteger;
|
||||||
typedef iSinglet<vIntegerD > vTIntegerD;
|
|
||||||
typedef iSinglet<vIntegerC > vTIntegerC;
|
|
||||||
typedef iSinglet<vIntegerZ > vTIntegerZ;
|
|
||||||
|
|
||||||
typedef iSpinMatrix<Complex > SpinMatrix;
|
typedef iSpinMatrix<Complex > SpinMatrix;
|
||||||
typedef iColourMatrix<Complex > ColourMatrix;
|
typedef iColourMatrix<Complex > ColourMatrix;
|
||||||
@ -46,12 +43,9 @@ namespace QCD {
|
|||||||
typedef iColourVector<vComplex > vColourVector;
|
typedef iColourVector<vComplex > vColourVector;
|
||||||
typedef iSpinColourVector<vComplex > vSpinColourVector;
|
typedef iSpinColourVector<vComplex > vSpinColourVector;
|
||||||
|
|
||||||
typedef Lattice<vTComplex> LatticeComplex;
|
typedef Lattice<vTComplex> LatticeComplex;
|
||||||
|
|
||||||
typedef Lattice<vTIntegerF> LatticeIntegerF; // Predicates for "where"
|
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
|
||||||
typedef Lattice<vTIntegerD> LatticeIntegerD;
|
|
||||||
typedef Lattice<vTIntegerC> LatticeIntegerC;
|
|
||||||
typedef Lattice<vTIntegerZ> LatticeIntegerZ;
|
|
||||||
|
|
||||||
typedef Lattice<vColourMatrix> LatticeColourMatrix;
|
typedef Lattice<vColourMatrix> LatticeColourMatrix;
|
||||||
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
|
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
#ifndef GRID_ALIGNED_ALLOCATOR_H
|
#ifndef GRID_ALIGNED_ALLOCATOR_H
|
||||||
#define GRID_ALIGNED_ALLOCATOR_H
|
#define GRID_ALIGNED_ALLOCATOR_H
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
|
@ -10,9 +10,6 @@
|
|||||||
/* AVX512 */
|
/* AVX512 */
|
||||||
/* #undef AVX512 */
|
/* #undef AVX512 */
|
||||||
|
|
||||||
/* GRID_COMMS_FAKE */
|
|
||||||
/* #undef GRID_COMMS_FAKE */
|
|
||||||
|
|
||||||
/* GRID_COMMS_MPI */
|
/* GRID_COMMS_MPI */
|
||||||
#define GRID_COMMS_MPI 1
|
#define GRID_COMMS_MPI 1
|
||||||
|
|
||||||
|
@ -9,9 +9,6 @@
|
|||||||
/* AVX512 */
|
/* AVX512 */
|
||||||
#undef AVX512
|
#undef AVX512
|
||||||
|
|
||||||
/* GRID_COMMS_FAKE */
|
|
||||||
#undef GRID_COMMS_FAKE
|
|
||||||
|
|
||||||
/* GRID_COMMS_MPI */
|
/* GRID_COMMS_MPI */
|
||||||
#undef GRID_COMMS_MPI
|
#undef GRID_COMMS_MPI
|
||||||
|
|
||||||
|
@ -1,17 +1,5 @@
|
|||||||
#ifndef _GRID_CSHIFT_COMMON_H_
|
#ifndef _GRID_CSHIFT_COMMON_H_
|
||||||
#define _GRID_CSHIFT_COMMON_H_
|
#define _GRID_CSHIFT_COMMON_H_
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Must not lose sight that goal is to be able to construct really efficient
|
|
||||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
|
||||||
// additional stencil support.
|
|
||||||
//
|
|
||||||
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
|
||||||
//
|
|
||||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
|
||||||
//
|
|
||||||
// Grid could create a neighbour index table for a given stencil.
|
|
||||||
// Could also implement CovariantCshift.
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
@ -57,7 +45,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Gather for when there *is* need to SIMD split
|
// Gather for when there *is* need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -101,8 +88,6 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there is no need to SIMD split
|
// Scatter for when there is no need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -146,7 +131,6 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there *is* need to SIMD split
|
// Scatter for when there *is* need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -190,11 +174,9 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// local to node block strided copies
|
// local to node block strided copies
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// if lhs is odd, rhs even??
|
|
||||||
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
@ -284,40 +266,6 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
|
|||||||
// Local to node Cshift
|
// Local to node Cshift
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
|
|
||||||
// Work out whether to permute
|
|
||||||
// ABCDEFGH -> AE BF CG DH permute wrap num
|
|
||||||
//
|
|
||||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
|
||||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
|
||||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
|
||||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
|
||||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
|
||||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
|
|
||||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
|
||||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
|
||||||
|
|
||||||
// Suppose 4way simd in one dim.
|
|
||||||
// ABCDEFGH -> AECG BFDH permute wrap num
|
|
||||||
|
|
||||||
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
|
||||||
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
|
||||||
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
|
||||||
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
|
||||||
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
|
||||||
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
|
||||||
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
|
||||||
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
|
||||||
|
|
||||||
// Generalisation to 8 way simd, 16 way simd required.
|
|
||||||
//
|
|
||||||
// Need log2 Nway masks. consisting of
|
|
||||||
// 1 bit 256 bit granule
|
|
||||||
// 2 bit 128 bit granule
|
|
||||||
// 4 bits 64 bit granule
|
|
||||||
// 8 bits 32 bit granules
|
|
||||||
//
|
|
||||||
// 15 bits....
|
|
||||||
|
|
||||||
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
{
|
{
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
@ -333,35 +281,31 @@ friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
{
|
{
|
||||||
int fd = rhs._grid->_fdimensions[dimension];
|
GridBase *grid = rhs._grid;
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int fd = grid->_fdimensions[dimension];
|
||||||
int ld = rhs._grid->_ldimensions[dimension];
|
int rd = grid->_rdimensions[dimension];
|
||||||
int gd = rhs._grid->_gdimensions[dimension];
|
int ld = grid->_ldimensions[dimension];
|
||||||
|
int gd = grid->_gdimensions[dimension];
|
||||||
|
|
||||||
// Map to always positive shift modulo global full dimension.
|
// Map to always positive shift modulo global full dimension.
|
||||||
shift = (shift+fd)%fd;
|
shift = (shift+fd)%fd;
|
||||||
|
|
||||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
|
||||||
// the permute type
|
// the permute type
|
||||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
int permute_dim =grid->PermuteDim(dimension);
|
||||||
int permute_type=0;
|
int permute_type=grid->PermuteType(dimension);
|
||||||
for(int d=0;d<dimension;d++){
|
|
||||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
int o = 0;
|
int o = 0;
|
||||||
int bo = x * rhs._grid->_ostride[dimension];
|
int bo = x * grid->_ostride[dimension];
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? 1 : 0;
|
int cb= (cbmask==0x2)? 1 : 0;
|
||||||
|
|
||||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
int permute_slice=0;
|
int permute_slice=0;
|
||||||
|
@ -146,10 +146,7 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int permute_type=0;
|
int permute_type=grid->PermuteType(dimension);
|
||||||
for(int d=0;d<dimension;d++){
|
|
||||||
if (grid->_simd_layout[d]>1 ) permute_type++;
|
|
||||||
}
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
// Simd direction uses an extract/merge pair
|
// Simd direction uses an extract/merge pair
|
||||||
@ -236,9 +233,12 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = 1-wrap;
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
int toggle_bit = (Nsimd>>(permute_type+1));
|
||||||
|
int PermuteMap;
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
if ( permute_slice ) {
|
if ( permute_slice ) {
|
||||||
pointers[i] = rpointers[permute_map[permute_type][i]];
|
PermuteMap=i^toggle_bit;
|
||||||
|
pointers[i] = rpointers[PermuteMap];
|
||||||
} else {
|
} else {
|
||||||
pointers[i] = rpointers[i];
|
pointers[i] = rpointers[i];
|
||||||
}
|
}
|
||||||
@ -260,8 +260,4 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -20,8 +20,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
std::vector<int> mpi_layout(4);
|
std::vector<int> mpi_layout(4);
|
||||||
mpi_layout[0]=2;
|
mpi_layout[0]=2;
|
||||||
mpi_layout[1]=1;
|
mpi_layout[1]=2;
|
||||||
mpi_layout[2]=1;
|
mpi_layout[2]=2;
|
||||||
mpi_layout[3]=2;
|
mpi_layout[3]=2;
|
||||||
|
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
|
74
Grid_simd.h
74
Grid_simd.h
@ -10,32 +10,6 @@
|
|||||||
//
|
//
|
||||||
// Vector types are arch dependent
|
// Vector types are arch dependent
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// TODO
|
|
||||||
//
|
|
||||||
// Base class to share common code between vRealF, VComplexF etc...
|
|
||||||
//
|
|
||||||
// lattice broadcast assignment
|
|
||||||
//
|
|
||||||
// where() support
|
|
||||||
// implement with masks, and/or? Type of the mask & boolean support?
|
|
||||||
//
|
|
||||||
// Unary functions
|
|
||||||
// cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
|
|
||||||
// exp, log, sqrt, fabs
|
|
||||||
//
|
|
||||||
// transposeColor, transposeSpin,
|
|
||||||
// adjColor, adjSpin,
|
|
||||||
// traceColor, traceSpin.
|
|
||||||
// peekColor, peekSpin + pokeColor PokeSpin
|
|
||||||
//
|
|
||||||
// copyMask.
|
|
||||||
//
|
|
||||||
// localMaxAbs
|
|
||||||
//
|
|
||||||
// norm2,
|
|
||||||
// sumMulti equivalent.
|
|
||||||
// Fourier transform equivalent.
|
|
||||||
//
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// SIMD Alignment controls
|
// SIMD Alignment controls
|
||||||
@ -71,9 +45,6 @@ namespace Grid {
|
|||||||
typedef std::complex<RealD> ComplexD;
|
typedef std::complex<RealD> ComplexD;
|
||||||
typedef std::complex<Real> Complex;
|
typedef std::complex<Real> Complex;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
inline RealF adj(const RealF & r){ return r; }
|
inline RealF adj(const RealF & r){ return r; }
|
||||||
inline RealF conj(const RealF & r){ return r; }
|
inline RealF conj(const RealF & r){ return r; }
|
||||||
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||||
@ -122,7 +93,6 @@ namespace Grid {
|
|||||||
template<> inline void ZeroIt(RealD &arg){ arg=0; };
|
template<> inline void ZeroIt(RealD &arg){ arg=0; };
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if defined (SSE2)
|
#if defined (SSE2)
|
||||||
typedef __m128 fvec;
|
typedef __m128 fvec;
|
||||||
typedef __m128d dvec;
|
typedef __m128d dvec;
|
||||||
@ -162,31 +132,46 @@ namespace Grid {
|
|||||||
inline void v_prefetch0(int size, const char *ptr){};
|
inline void v_prefetch0(int size, const char *ptr){};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// Generic extract/merge/permute
|
// Generic extract/merge/permute
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
template<class vsimd,class scalar,int Nsimd>
|
template<class vsimd,class scalar>
|
||||||
inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
|
inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
|
||||||
// Bounce off stack is painful
|
#if 1
|
||||||
// temporary hack while I figure out the right interface
|
// FIXME: bouncing through a temporary buffer is painful
|
||||||
scalar buf[Nsimd];
|
// temporary hack while I figure out better way.
|
||||||
vstore(y,buf);
|
// There are intrinsics to do this work without the storage.
|
||||||
for(int i=0;i<Nsimd;i++){
|
int Nsimd = extracted.size();
|
||||||
*extracted[i] = buf[i];
|
{
|
||||||
extracted[i]++;
|
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
|
||||||
|
vstore(y,&buf[0]);
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
*extracted[i] = buf[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
int NSo = extracted.size();
|
||||||
|
int NSv = vsimd::Nsimd();
|
||||||
|
int sparse= NSv/NSo;
|
||||||
|
for(int i=0;i<NSv;i+=sparse){
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
template<class vsimd,class scalar,int Nsimd>
|
template<class vsimd,class scalar>
|
||||||
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
||||||
scalar buf[Nsimd];
|
#if 1
|
||||||
|
int Nsimd = extracted.size();
|
||||||
|
std::vector<scalar> buf(Nsimd);
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
buf[i]=*extracted[i];
|
buf[i]=*extracted[i];
|
||||||
extracted[i]++;
|
extracted[i]++;
|
||||||
}
|
}
|
||||||
vset(y,buf);
|
vset(y,&buf[0]);
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
@ -197,8 +182,6 @@ inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
|||||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||||
// Permute 4 possible on half precision @512bit vectors.
|
// Permute 4 possible on half precision @512bit vectors.
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
// Should be able to make the permute/extract/merge independent of the
|
|
||||||
// vector subtype and reduce the volume of code.
|
|
||||||
template<class vsimd>
|
template<class vsimd>
|
||||||
inline void Gpermute(vsimd &y,vsimd b,int perm){
|
inline void Gpermute(vsimd &y,vsimd b,int perm){
|
||||||
switch (perm){
|
switch (perm){
|
||||||
@ -229,6 +212,7 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){
|
|||||||
default: assert(0); break;
|
default: assert(0); break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
};
|
||||||
|
|
||||||
#include <Grid_vRealF.h>
|
#include <Grid_vRealF.h>
|
||||||
#include <Grid_vRealD.h>
|
#include <Grid_vRealD.h>
|
||||||
|
12
Grid_stencil.h
Normal file
12
Grid_stencil.h
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Must not lose sight that goal is to be able to construct really efficient
|
||||||
|
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||||
|
// additional stencil support.
|
||||||
|
//
|
||||||
|
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||||
|
//
|
||||||
|
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||||
|
//
|
||||||
|
// Grid could create a neighbour index table for a given stencil.
|
||||||
|
// Could also implement CovariantCshift.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
class vComplexD {
|
class vComplexD {
|
||||||
protected:
|
public:
|
||||||
zvec v;
|
zvec v;
|
||||||
public:
|
public:
|
||||||
typedef zvec vector_type;
|
typedef zvec vector_type;
|
||||||
@ -154,64 +154,27 @@ namespace Grid {
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Extract
|
// General permute; assumes vector length is same across
|
||||||
/////////////////////////////////////////////////////////////////
|
// all subtypes; may not be a good assumption, but could
|
||||||
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
|
// add the vector width as a template param for BG/Q for example
|
||||||
// Bounce off stack is painful
|
////////////////////////////////////////////////////////////////////
|
||||||
// temporary hack while I figure out the right interface
|
friend inline void permute(vComplexD &y,vComplexD b,int perm)
|
||||||
const int Nsimd = vComplexD::Nsimd();
|
{
|
||||||
std::vector<ComplexD> buf(Nsimd);
|
Gpermute<vComplexD>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vComplexD,ComplexD >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vComplexD,ComplexD>(y,extracted);
|
||||||
|
}
|
||||||
|
|
||||||
vstore(y,&buf[0]);
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||||
for(int i=0;i<Nsimd;i++){
|
////////////////////////////////////////////////////////////////////////
|
||||||
*extracted[i] = buf[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
|
|
||||||
// Bounce off stack is painful
|
|
||||||
// temporary hack while I figure out the right interface
|
|
||||||
const int Nsimd = vComplexD::Nsimd();
|
|
||||||
std::vector<ComplexD> buf(Nsimd);
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
buf[i]=*extracted[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
vset(y,&buf[0]);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// Permute
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
friend inline void permute(vComplexD &y,vComplexD b,int perm){
|
|
||||||
switch (perm){
|
|
||||||
// 2 complex=>1 permute
|
|
||||||
#if defined(AVX1)||defined(AVX2)
|
|
||||||
case 0: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
|
|
||||||
// AB => BA i.e. ab cd =>cd ab
|
|
||||||
#endif
|
|
||||||
#ifdef SSE2
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#ifdef AVX512
|
|
||||||
// 4 complex=>2 permute
|
|
||||||
// ABCD => BADC i.e. abcd efgh => cdab ghef
|
|
||||||
// ABCD => CDAB i.e. abcd efgh => efgh abcd
|
|
||||||
case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
|
|
||||||
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); // permute for double is not implemented
|
|
||||||
|
|
||||||
#endif
|
|
||||||
#ifdef QPX
|
|
||||||
#error // Not implemented yet
|
|
||||||
#endif
|
|
||||||
default: assert(0); break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
void vload(zvec& a){
|
void vload(zvec& a){
|
||||||
this->v = a;
|
this->v = a;
|
||||||
}
|
}
|
||||||
@ -296,7 +259,7 @@ friend inline void vstore(vComplexD &ret, ComplexD *a){
|
|||||||
#endif
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
// REDUCE
|
// REDUCE FIXME must be a cleaner implementation
|
||||||
friend inline ComplexD Reduce(const vComplexD & in)
|
friend inline ComplexD Reduce(const vComplexD & in)
|
||||||
{
|
{
|
||||||
#if defined (AVX1) || defined(AVX2)
|
#if defined (AVX1) || defined(AVX2)
|
||||||
|
@ -4,7 +4,9 @@
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
class vComplexF {
|
class vComplexF {
|
||||||
protected:
|
// protected:
|
||||||
|
|
||||||
|
public:
|
||||||
cvec v;
|
cvec v;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -129,75 +131,11 @@ namespace Grid {
|
|||||||
#endif
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Extract
|
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||||
/////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
|
|
||||||
// Bounce off heap is painful
|
|
||||||
// temporary hack while I figure out the right interface
|
|
||||||
vComplexF vbuf;
|
|
||||||
ComplexF *buf = (ComplexF *)&vbuf;
|
|
||||||
|
|
||||||
vstore(y,&buf[0]);
|
|
||||||
for(int i=0;i<vComplexF::Nsimd();i++){
|
|
||||||
*extracted[i] = buf[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
|
|
||||||
// Bounce off stack is painful
|
|
||||||
// temporary hack while I figure out the right interface
|
|
||||||
const int Nsimd = vComplexF::Nsimd();
|
|
||||||
vComplexF vbuf;
|
|
||||||
ComplexF *buf = (ComplexF *)&vbuf;
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
buf[i]=*extracted[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
vset(y,&buf[0]);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
// Permute
|
|
||||||
/////////////////////////////////////////////////////////////////
|
|
||||||
friend inline void permute(vComplexF &y,vComplexF b,int perm){
|
|
||||||
switch (perm){
|
|
||||||
#if defined(AVX1)||defined(AVX2)
|
|
||||||
//HERE
|
|
||||||
// 4 complex=>2 permutes
|
|
||||||
// case 0 ABCD->BADC
|
|
||||||
// case 1 ABCD->CDAB
|
|
||||||
case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
case 1: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
|
|
||||||
#endif
|
|
||||||
#ifdef SSE2
|
|
||||||
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
|
|
||||||
#endif
|
|
||||||
#ifdef AVX512
|
|
||||||
//#error should permute for 512
|
|
||||||
// 8 complex=>3 permutes
|
|
||||||
// case 0 ABCD EFGH -> BADC FEHG
|
|
||||||
// case 1 ABCD EFGH -> CDAB GHEF
|
|
||||||
// case 2 ABCD EFGH -> EFGH ABCD
|
|
||||||
case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break; // OK
|
|
||||||
case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break; // OK
|
|
||||||
case 2: y.v = _mm512_permute4f128_ps(b.v, (_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; // OK
|
|
||||||
|
|
||||||
#endif
|
|
||||||
#ifdef QPX
|
|
||||||
#error
|
|
||||||
#endif
|
|
||||||
default: assert(0); break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
friend inline void vset(vComplexF &ret, Complex *a){
|
friend inline void vset(vComplexF &ret, Complex *a){
|
||||||
#if defined (AVX1)|| defined (AVX2)
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||||
@ -358,6 +296,20 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
|
|||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
friend inline void permute(vComplexF &y,vComplexF b,int perm)
|
||||||
|
{
|
||||||
|
Gpermute<vComplexF>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vComplexF,ComplexF >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vComplexF,ComplexF>(y,extracted);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
||||||
@ -371,7 +323,5 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
|
|||||||
return l*r;
|
return l*r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -235,70 +235,11 @@ friend inline void vstore(vInteger &ret, Integer *a){
|
|||||||
}
|
}
|
||||||
friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
|
friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
|
||||||
{
|
{
|
||||||
Gmerge<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
|
Gmerge<vIntegerF,Integer>(y,extracted);
|
||||||
}
|
}
|
||||||
friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
|
friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
|
||||||
{
|
{
|
||||||
Gextract<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
|
Gextract<vIntegerF,Integer>(y,extracted);
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
class vIntegerD : public vInteger
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);}
|
|
||||||
|
|
||||||
friend inline void permute(vIntegerD &y,vIntegerD b,int perm)
|
|
||||||
{
|
|
||||||
Gpermute<vIntegerD>(y,b,perm);
|
|
||||||
}
|
|
||||||
friend inline void merge(vIntegerD &y,std::vector<Integer *> &extracted)
|
|
||||||
{
|
|
||||||
Gmerge<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
|
|
||||||
}
|
|
||||||
friend inline void extract(vIntegerD &y,std::vector<Integer *> &extracted)
|
|
||||||
{
|
|
||||||
Gextract<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
class vIntegerC : public vInteger
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);}
|
|
||||||
|
|
||||||
friend inline void permute(vIntegerC &y,vIntegerC b,int perm)
|
|
||||||
{
|
|
||||||
Gpermute<vIntegerC>(y,b,perm);
|
|
||||||
}
|
|
||||||
friend inline void merge(vIntegerC &y,std::vector<Integer *> &extracted)
|
|
||||||
{
|
|
||||||
Gmerge<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
|
|
||||||
}
|
|
||||||
friend inline void extract(vIntegerC &y,std::vector<Integer *> &extracted)
|
|
||||||
{
|
|
||||||
Gextract<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class vIntegerZ : public vInteger
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);}
|
|
||||||
|
|
||||||
friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm)
|
|
||||||
{
|
|
||||||
Gpermute<vIntegerZ>(y,b,perm);
|
|
||||||
}
|
|
||||||
friend inline void merge(vIntegerZ &y,std::vector<Integer *> &extracted)
|
|
||||||
{
|
|
||||||
Gmerge<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
|
|
||||||
}
|
|
||||||
friend inline void extract(vIntegerZ &y,std::vector<Integer *> &extracted)
|
|
||||||
{
|
|
||||||
Gextract<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
class vRealD {
|
class vRealD {
|
||||||
protected:
|
public:
|
||||||
dvec v; // dvec is double precision vector
|
dvec v; // dvec is double precision vector
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -99,72 +99,27 @@ namespace Grid {
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Extract
|
// General permute; assumes vector length is same across
|
||||||
/////////////////////////////////////////////////////////////////
|
// all subtypes; may not be a good assumption, but could
|
||||||
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
|
// add the vector width as a template param for BG/Q for example
|
||||||
// Bounce off stack is painful
|
////////////////////////////////////////////////////////////////////
|
||||||
// temporary hack while I figure out the right interface
|
friend inline void permute(vRealD &y,vRealD b,int perm)
|
||||||
const int Nsimd = vRealD::Nsimd();
|
{
|
||||||
RealD buf[Nsimd];
|
Gpermute<vRealD>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vRealD,RealD >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vRealD,RealD>(y,extracted);
|
||||||
|
}
|
||||||
|
|
||||||
vstore(y,buf);
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||||
for(int i=0;i<Nsimd;i++){
|
////////////////////////////////////////////////////////////////////////
|
||||||
*extracted[i] = buf[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
|
|
||||||
// Bounce off stack is painful
|
|
||||||
// temporary hack while I figure out the right interface
|
|
||||||
const int Nsimd = vRealD::Nsimd();
|
|
||||||
RealD buf[Nsimd];
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
buf[i]=*extracted[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
vset(y,buf);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// Permute plans
|
|
||||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
|
||||||
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
|
||||||
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
|
||||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
|
||||||
// Permute 4 possible on half precision @512bit vectors.
|
|
||||||
friend inline void permute(vRealD &y,vRealD b,int perm){
|
|
||||||
switch (perm){
|
|
||||||
// 4 doubles=>2 permutes
|
|
||||||
#if defined(AVX1)||defined(AVX2)
|
|
||||||
case 0: y.v = _mm256_shuffle_pd(b.v,b.v,0x5); break;
|
|
||||||
case 1: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
|
|
||||||
#endif
|
|
||||||
#ifdef SSE2
|
|
||||||
case 0: y.v = _mm_shuffle_pd(b.v,b.v,0x1); break;
|
|
||||||
#endif
|
|
||||||
#ifdef AVX512
|
|
||||||
// 8 double => 3 permutes
|
|
||||||
// Permute 0 every abcd efgh -> badc fehg
|
|
||||||
// Permute 1 every abcd efgh -> cdab ghef
|
|
||||||
// Permute 2 every abcd efgh -> efgh abcd
|
|
||||||
// NOTE: mm_512_permutex_pd not implemented
|
|
||||||
// NOTE: ignore warning
|
|
||||||
case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_CDAB); break;
|
|
||||||
case 1: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
|
|
||||||
case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
|
|
||||||
#endif
|
|
||||||
#ifdef QPX
|
|
||||||
#error
|
|
||||||
#endif
|
|
||||||
default: assert(0);break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// gona be bye bye
|
|
||||||
void vload(dvec& a){
|
void vload(dvec& a){
|
||||||
this->v = a;
|
this->v = a;
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
class vRealF {
|
class vRealF {
|
||||||
protected:
|
public:
|
||||||
fvec v;
|
fvec v;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -120,74 +120,25 @@ namespace Grid {
|
|||||||
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
|
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Extract
|
// General permute; assumes vector length is same across
|
||||||
/////////////////////////////////////////////////////////////////
|
// all subtypes; may not be a good assumption, but could
|
||||||
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
|
// add the vector width as a template param for BG/Q for example
|
||||||
// Bounce off stack is painful
|
////////////////////////////////////////////////////////////////////
|
||||||
// temporary hack while I figure out the right interface
|
friend inline void permute(vRealF &y,vRealF b,int perm)
|
||||||
const int Nsimd = vRealF::Nsimd();
|
{
|
||||||
RealF buf[Nsimd];
|
Gpermute<vRealF>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vRealF,RealF >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vRealF,RealF>(y,extracted);
|
||||||
|
}
|
||||||
|
|
||||||
vstore(y,buf);
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
*extracted[i] = buf[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
|
|
||||||
// Bounce off stack is painful
|
|
||||||
// temporary hack while I figure out the right interface
|
|
||||||
const int Nsimd = vRealF::Nsimd();
|
|
||||||
RealF buf[Nsimd];
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
buf[i]=*extracted[i];
|
|
||||||
extracted[i]++;
|
|
||||||
}
|
|
||||||
vset(y,buf);
|
|
||||||
};
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////
|
|
||||||
// Permute
|
|
||||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
|
||||||
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
|
||||||
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
|
||||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
|
||||||
// Permute 4 possible on half precision @512bit vectors.
|
|
||||||
//////////////////////////////////////////////////////////
|
|
||||||
friend inline void permute(vRealF &y,vRealF b,int perm){
|
|
||||||
switch (perm){
|
|
||||||
// 8 floats=>3 permutes
|
|
||||||
#if defined(AVX1)||defined(AVX2)
|
|
||||||
case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
case 2: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
|
|
||||||
#endif
|
|
||||||
#ifdef SSE2
|
|
||||||
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
|
|
||||||
#endif
|
|
||||||
#ifdef AVX512
|
|
||||||
// 16 floats=> permutes
|
|
||||||
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
|
|
||||||
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
|
|
||||||
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
|
|
||||||
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
|
|
||||||
//#error not implemented should do something
|
|
||||||
case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
|
|
||||||
case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
|
|
||||||
case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
|
||||||
case 3: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
|
||||||
#endif
|
|
||||||
#ifdef QPX
|
|
||||||
#error not implemented
|
|
||||||
#endif
|
|
||||||
default: assert(0); break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
// Broadcast a value across Nsimd copies.
|
// Broadcast a value across Nsimd copies.
|
||||||
@ -207,6 +158,8 @@ namespace Grid {
|
|||||||
ret.v = {a,a,a,a};
|
ret.v = {a,a,a,a};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
friend inline void vset(vRealF &ret, float *a){
|
friend inline void vset(vRealF &ret, float *a){
|
||||||
#if defined (AVX1)|| defined (AVX2)
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||||
@ -224,6 +177,9 @@ namespace Grid {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
friend inline void vstore(vRealF &ret, float *a){
|
friend inline void vstore(vRealF &ret, float *a){
|
||||||
#if defined (AVX1)|| defined (AVX2)
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
_mm256_store_ps(a,ret.v);
|
_mm256_store_ps(a,ret.v);
|
||||||
|
@ -24,7 +24,6 @@ include_HEADERS = Grid_config.h\
|
|||||||
Grid_aligned_allocator.h\
|
Grid_aligned_allocator.h\
|
||||||
Grid_cshift.h\
|
Grid_cshift.h\
|
||||||
Grid_cshift_common.h\
|
Grid_cshift_common.h\
|
||||||
Grid_cshift_fake.h\
|
|
||||||
Grid_cshift_mpi.h\
|
Grid_cshift_mpi.h\
|
||||||
Grid_cshift_none.h\
|
Grid_cshift_none.h\
|
||||||
Grid_math_types.h
|
Grid_math_types.h
|
||||||
@ -37,13 +36,10 @@ bin_PROGRAMS = Grid_main
|
|||||||
|
|
||||||
extra_sources=
|
extra_sources=
|
||||||
if BUILD_COMMS_MPI
|
if BUILD_COMMS_MPI
|
||||||
extra_sources+=Grid_mpi.cc
|
extra_sources+=Grid_communicator_mpi.cc
|
||||||
endif
|
|
||||||
if BUILD_COMMS_FAKE
|
|
||||||
extra_sources+=Grid_fake.cc
|
|
||||||
endif
|
endif
|
||||||
if BUILD_COMMS_NONE
|
if BUILD_COMMS_NONE
|
||||||
extra_sources+=Grid_fake.cc
|
extra_sources+=Grid_communicator_fake.cc
|
||||||
endif
|
endif
|
||||||
|
|
||||||
Grid_main_SOURCES = \
|
Grid_main_SOURCES = \
|
||||||
|
29
Makefile.in
29
Makefile.in
@ -89,9 +89,8 @@ NORMAL_UNINSTALL = :
|
|||||||
PRE_UNINSTALL = :
|
PRE_UNINSTALL = :
|
||||||
POST_UNINSTALL = :
|
POST_UNINSTALL = :
|
||||||
bin_PROGRAMS = Grid_main$(EXEEXT)
|
bin_PROGRAMS = Grid_main$(EXEEXT)
|
||||||
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
|
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
|
||||||
@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
|
@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
|
||||||
@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
|
|
||||||
subdir = .
|
subdir = .
|
||||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||||
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
|
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
|
||||||
@ -146,12 +145,13 @@ libGrid_a_LIBADD =
|
|||||||
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
|
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
|
||||||
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
|
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
|
||||||
PROGRAMS = $(bin_PROGRAMS)
|
PROGRAMS = $(bin_PROGRAMS)
|
||||||
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
|
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
|
||||||
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
|
Grid_communicator_fake.cc
|
||||||
@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
|
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
|
||||||
@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
|
@BUILD_COMMS_NONE_TRUE@am__objects_2 = \
|
||||||
am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
|
@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT)
|
||||||
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
|
am__objects_3 = $(am__objects_1) $(am__objects_2)
|
||||||
|
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
|
||||||
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
|
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
|
||||||
Grid_main_DEPENDENCIES = libGrid.a
|
Grid_main_DEPENDENCIES = libGrid.a
|
||||||
AM_V_P = $(am__v_P_@AM_V@)
|
AM_V_P = $(am__v_P_@AM_V@)
|
||||||
@ -214,8 +214,8 @@ CTAGS = ctags
|
|||||||
CSCOPE = cscope
|
CSCOPE = cscope
|
||||||
AM_RECURSIVE_TARGETS = cscope
|
AM_RECURSIVE_TARGETS = cscope
|
||||||
am__DIST_COMMON = $(srcdir)/Grid_config.h.in $(srcdir)/Makefile.in \
|
am__DIST_COMMON = $(srcdir)/Grid_config.h.in $(srcdir)/Makefile.in \
|
||||||
AUTHORS COPYING ChangeLog INSTALL NEWS README compile depcomp \
|
AUTHORS COPYING ChangeLog INSTALL NEWS README TODO compile \
|
||||||
install-sh missing
|
depcomp install-sh missing
|
||||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||||
distdir = $(PACKAGE)-$(VERSION)
|
distdir = $(PACKAGE)-$(VERSION)
|
||||||
top_distdir = $(distdir)
|
top_distdir = $(distdir)
|
||||||
@ -353,12 +353,11 @@ include_HEADERS = Grid_config.h\
|
|||||||
Grid_aligned_allocator.h\
|
Grid_aligned_allocator.h\
|
||||||
Grid_cshift.h\
|
Grid_cshift.h\
|
||||||
Grid_cshift_common.h\
|
Grid_cshift_common.h\
|
||||||
Grid_cshift_fake.h\
|
|
||||||
Grid_cshift_mpi.h\
|
Grid_cshift_mpi.h\
|
||||||
Grid_cshift_none.h\
|
Grid_cshift_none.h\
|
||||||
Grid_math_types.h
|
Grid_math_types.h
|
||||||
|
|
||||||
extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
|
extra_sources = $(am__append_1) $(am__append_2)
|
||||||
Grid_main_SOURCES = \
|
Grid_main_SOURCES = \
|
||||||
Grid_main.cc\
|
Grid_main.cc\
|
||||||
$(extra_sources)
|
$(extra_sources)
|
||||||
@ -506,10 +505,10 @@ mostlyclean-compile:
|
|||||||
distclean-compile:
|
distclean-compile:
|
||||||
-rm -f *.tab.c
|
-rm -f *.tab.c
|
||||||
|
|
||||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_fake.Po@am__quote@
|
||||||
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
|
||||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
|
||||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
|
||||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@
|
|
||||||
|
|
||||||
.cc.o:
|
.cc.o:
|
||||||
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||||
|
103
TODO
103
TODO
@ -1,4 +1,7 @@
|
|||||||
* FIXME audit
|
* FIXME audit
|
||||||
|
* Remove vload/store etc..
|
||||||
|
* Replace vset with a call to merge.
|
||||||
|
* Replace vset with a call to merge.
|
||||||
|
|
||||||
* Conditional execution Subset, where etc...
|
* Conditional execution Subset, where etc...
|
||||||
* Coordinate information, integers etc...
|
* Coordinate information, integers etc...
|
||||||
@ -27,3 +30,103 @@
|
|||||||
- BinaryWriter, TextWriter etc...
|
- BinaryWriter, TextWriter etc...
|
||||||
- protocol buffers?
|
- protocol buffers?
|
||||||
-
|
-
|
||||||
|
// Cartesian grid inheritance
|
||||||
|
// Grid::GridBase
|
||||||
|
// |
|
||||||
|
// __________|___________
|
||||||
|
// | |
|
||||||
|
// Grid::GridCartesian Grid::GridCartesianRedBlack
|
||||||
|
//
|
||||||
|
// TODO: document the following as an API guaranteed public interface
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Rough map of functionality against QDP++ Layout
|
||||||
|
*
|
||||||
|
* Param | Grid | QDP++
|
||||||
|
* -----------------------------------------
|
||||||
|
* | |
|
||||||
|
* void | oSites, iSites, lSites | sitesOnNode
|
||||||
|
* void | gSites | vol
|
||||||
|
* | |
|
||||||
|
* gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP
|
||||||
|
* lcoor | |
|
||||||
|
*
|
||||||
|
* void | CheckerBoarded | - // No checkerboarded in QDP
|
||||||
|
* void | FullDimensions | lattSize
|
||||||
|
* void | GlobalDimensions | lattSize // No checkerboarded in QDP
|
||||||
|
* void | LocalDimensions | subgridLattSize
|
||||||
|
* void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP
|
||||||
|
* | |
|
||||||
|
* int x 3 | oiSiteRankToGlobal | siteCoords
|
||||||
|
* | ProcessorCoorLocalCoorToGlobalCoor |
|
||||||
|
* | |
|
||||||
|
* vector<int> | GlobalCoorToRankIndex | nodeNumber(coord)
|
||||||
|
* vector<int> | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord)
|
||||||
|
* | |
|
||||||
|
* void | Processors | logicalSize // returns cart array shape
|
||||||
|
* void | ThisRank | nodeNumber(); // returns this node rank
|
||||||
|
* void | ThisProcessorCoor | // returns this node coor
|
||||||
|
* void | isBoss(void) | primaryNode();
|
||||||
|
* | |
|
||||||
|
* | RankFromProcessorCoor | getLogicalCoorFrom(node)
|
||||||
|
* | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord)
|
||||||
|
*/
|
||||||
|
// Work out whether to permute
|
||||||
|
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||||
|
//
|
||||||
|
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||||
|
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||||
|
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||||
|
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||||
|
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||||
|
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1
|
||||||
|
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||||
|
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||||
|
|
||||||
|
// Suppose 4way simd in one dim.
|
||||||
|
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||||
|
|
||||||
|
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||||
|
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||||
|
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||||
|
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||||
|
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||||
|
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||||
|
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||||
|
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||||
|
|
||||||
|
// Generalisation to 8 way simd, 16 way simd required.
|
||||||
|
//
|
||||||
|
// Need log2 Nway masks. consisting of
|
||||||
|
// 1 bit 256 bit granule
|
||||||
|
// 2 bit 128 bit granule
|
||||||
|
// 4 bits 64 bit granule
|
||||||
|
// 8 bits 32 bit granules
|
||||||
|
//
|
||||||
|
// 15 bits....
|
||||||
|
// TODO
|
||||||
|
//
|
||||||
|
// Base class to share common code between vRealF, VComplexF etc...
|
||||||
|
//
|
||||||
|
// lattice Broad cast assignment
|
||||||
|
//
|
||||||
|
// where() support
|
||||||
|
// implement with masks, and/or? Type of the mask & boolean support?
|
||||||
|
//
|
||||||
|
// Unary functions
|
||||||
|
// cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
|
||||||
|
// exp, log, sqrt, fabs
|
||||||
|
//
|
||||||
|
// transposeColor, transposeSpin,
|
||||||
|
// adjColor, adjSpin,
|
||||||
|
// traceColor, traceSpin.
|
||||||
|
// peekColor, peekSpin + pokeColor PokeSpin
|
||||||
|
//
|
||||||
|
// copyMask.
|
||||||
|
//
|
||||||
|
// localMaxAbs
|
||||||
|
//
|
||||||
|
// norm2,
|
||||||
|
// sumMulti equivalent.
|
||||||
|
// Fourier transform equivalent.
|
||||||
|
//
|
||||||
|
23
configure
vendored
23
configure
vendored
@ -628,8 +628,6 @@ LTLIBOBJS
|
|||||||
LIBOBJS
|
LIBOBJS
|
||||||
BUILD_COMMS_NONE_FALSE
|
BUILD_COMMS_NONE_FALSE
|
||||||
BUILD_COMMS_NONE_TRUE
|
BUILD_COMMS_NONE_TRUE
|
||||||
BUILD_COMMS_FAKE_FALSE
|
|
||||||
BUILD_COMMS_FAKE_TRUE
|
|
||||||
BUILD_COMMS_MPI_FALSE
|
BUILD_COMMS_MPI_FALSE
|
||||||
BUILD_COMMS_MPI_TRUE
|
BUILD_COMMS_MPI_TRUE
|
||||||
EGREP
|
EGREP
|
||||||
@ -1369,8 +1367,7 @@ Optional Features:
|
|||||||
--disable-openmp do not use OpenMP
|
--disable-openmp do not use OpenMP
|
||||||
--enable-simd=SSE|AVX|AVX2|AVX512
|
--enable-simd=SSE|AVX|AVX2|AVX512
|
||||||
Select instructions
|
Select instructions
|
||||||
--enable-comms=none|fake|mpi
|
--enable-comms=none|mpi Select communications
|
||||||
Select communications
|
|
||||||
|
|
||||||
Some influential environment variables:
|
Some influential environment variables:
|
||||||
CXX C++ compiler command
|
CXX C++ compiler command
|
||||||
@ -5051,12 +5048,6 @@ fi
|
|||||||
|
|
||||||
|
|
||||||
case ${ac_COMMS} in
|
case ${ac_COMMS} in
|
||||||
fake)
|
|
||||||
echo Configuring for FAKE communications
|
|
||||||
|
|
||||||
$as_echo "#define GRID_COMMS_FAKE 1" >>confdefs.h
|
|
||||||
|
|
||||||
;;
|
|
||||||
none)
|
none)
|
||||||
echo Configuring for NO communications
|
echo Configuring for NO communications
|
||||||
|
|
||||||
@ -5082,14 +5073,6 @@ else
|
|||||||
BUILD_COMMS_MPI_FALSE=
|
BUILD_COMMS_MPI_FALSE=
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if test "X${ac_COMMS}X" == "XfakeX" ; then
|
|
||||||
BUILD_COMMS_FAKE_TRUE=
|
|
||||||
BUILD_COMMS_FAKE_FALSE='#'
|
|
||||||
else
|
|
||||||
BUILD_COMMS_FAKE_TRUE='#'
|
|
||||||
BUILD_COMMS_FAKE_FALSE=
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test "X${ac_COMMS}X" == "XnoneX" ; then
|
if test "X${ac_COMMS}X" == "XnoneX" ; then
|
||||||
BUILD_COMMS_NONE_TRUE=
|
BUILD_COMMS_NONE_TRUE=
|
||||||
BUILD_COMMS_NONE_FALSE='#'
|
BUILD_COMMS_NONE_FALSE='#'
|
||||||
@ -5243,10 +5226,6 @@ if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
|
|||||||
as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
|
as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
|
||||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||||
fi
|
fi
|
||||||
if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
|
|
||||||
as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
|
|
||||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
|
||||||
fi
|
|
||||||
if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
|
if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
|
||||||
as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
|
as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
|
||||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||||
|
@ -51,13 +51,9 @@ case ${ac_SIMD} in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
|
|
||||||
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|fake|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
||||||
|
|
||||||
case ${ac_COMMS} in
|
case ${ac_COMMS} in
|
||||||
fake)
|
|
||||||
echo Configuring for FAKE communications
|
|
||||||
AC_DEFINE([GRID_COMMS_FAKE],[1],[GRID_COMMS_FAKE] )
|
|
||||||
;;
|
|
||||||
none)
|
none)
|
||||||
echo Configuring for NO communications
|
echo Configuring for NO communications
|
||||||
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
||||||
@ -72,7 +68,6 @@ case ${ac_COMMS} in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
|
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
|
||||||
AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
|
|
||||||
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
|
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user