
Reorg of build structure

Peter Boyle
2015-04-18 14:55:00 +01:00
parent 57586c8e05
commit c656164015
33 changed files with 59 additions and 0 deletions

63
lib/Grid.h Normal file

@@ -0,0 +1,63 @@
//
// Grid.cpp
// simd
//
// Created by Peter Boyle on 09/05/2014.
// Copyright (c) 2014 University of Edinburgh. All rights reserved.
//
#ifndef GRID_V3_H
#define GRID_V3_H
#include <stdio.h>
#include <complex>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>
#include <functional>
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <signal.h>
#include <Grid_config.h>
////////////////////////////////////////////////////////////
// Tunable header includes
////////////////////////////////////////////////////////////
#ifdef HAVE_OPENMP
#define OMP
#include <omp.h>
#endif
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include <Grid_aligned_allocator.h>
#include <Grid_simd.h>
#include <Grid_math_types.h>
#include <Grid_Cartesian.h>
#include <Grid_Lattice.h>
#include <Grid_comparison.h>
#include <Grid_stencil.h>
#include <Grid_QCD.h>
namespace Grid {
void Grid_init(int *argc,char ***argv);
void Grid_finalize(void);
double usecond(void);
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);
};
#endif
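The header above wires the tunable includes together and exposes the runtime entry points. A minimal driver built against this interface might look like the following sketch (the timing loop and output are illustrative, not part of the commit):

#include <Grid.h>

int main(int argc, char **argv)
{
  Grid::Grid_init(&argc, &argv);   // set up comms and the signal/debug handlers

  double t0 = Grid::usecond();     // microsecond wall clock declared above
  // ... work on Grid lattices goes here ...
  double t1 = Grid::usecond();
  printf("elapsed %f us\n", t1 - t0);

  Grid::Grid_finalize();           // tear down communications
  return 0;
}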

399
lib/Grid_Cartesian.h Normal file

@@ -0,0 +1,399 @@
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H
#include <Grid.h>
#include <Grid_Communicator.h>
namespace Grid{
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator {
public:
// Give Lattice access
template<class object> friend class Lattice;
GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
//FIXME
// protected:
// Lattice wide random support. not yet fully implemented. Need seed strategy
// and one generator per site.
// std::default_random_engine generator;
// static std::mt19937 generator( 9 );
//////////////////////////////////////////////////////////////////////
// Communicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor coordinate
//////////////////////////////////////////////////////////////////////
// Physics Grid information.
std::vector<int> _simd_layout; // Which dimensions get spread out over simd lanes.
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
std::vector<int> _slice_block; // subslice information
std::vector<int> _slice_stride;
std::vector<int> _slice_nblock;
// Might need these at some point
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
public:
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
inline int CheckerBoardFromOindex (int Oindex){
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
int ss=0;
for(int d=0;d<_ndimension;d++){
ss=ss+ocoor[d];
}
return ss&0x1;
}
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
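// Worked example (editorial illustration, not in the original header): one
// dimension of local extent _ldimensions[d]=8 split over _simd_layout[d]=2
// lanes gives _rdimensions[d]=4. A local coordinate x=6 then contributes
//   outer: x % _rdimensions[d] = 6 % 4 = 2   (which vobj in _odata)
//   inner: x / _rdimensions[d] = 6 / 4 = 1   (which SIMD lane within it)
// so the lanes hold sites {0..3} and {4..7} side by side, and one vector
// instruction advances both halves of the slice at once.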
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
inline int oIndexReduced(std::vector<int> &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
coor.resize(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = Oindex % _rdimensions[d];
Oindex = Oindex / _rdimensions[d];
}
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
coor.resize(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = lane % _simd_layout[d];
lane = lane / _simd_layout[d];
}
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) { return _isites; };
inline int Nsimd(void) { return _isites; };// Synonymous with iSites
inline int oSites(void) { return _osites; };
inline int lSites(void) { return _isites*_osites; };
inline int gSites(void) { return _isites*_osites*_Nprocessors; };
inline int Nd (void) { return _ndimension;};
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
pcoor[mu] = gcoor[mu]/_ldimensions[mu];
lcoor[mu] = gcoor[mu]%_ldimensions[mu];
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{
std::vector<int> pcoor;
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
};
class GridCartesian: public GridBase {
public:
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift){
return 0;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
GridCartesian(std::vector<int> &dimensions,
std::vector<int> &simd_layout,
std::vector<int> &processor_grid
) : GridBase(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_osites = 1;
_isites = 1;
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
//FIXME check for exact division
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
}
};
};
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
virtual int CheckerBoarded(int dim){
if( dim==0) return 1;
else return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return (site[0]+site[1]+site[2]+site[3])&0x1;
}
// Depending on the checkerboard of the site we toggle the source cb.
// A full-lattice shift along x maps to floor(shift/2) or floor((shift+1)/2)
// reduced sites, depending on which parity the row starts on.
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != 0) return shift;
int fulldim =_fdimensions[0];
shift = (shift+fulldim)%fulldim;
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
int ocb=CheckerBoardFromOindex(osite);
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
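// Worked example (editorial, not in the original source): _fdimensions[0]=8
// stores x in half-checkerboards of extent 4. A full-lattice shift of 1 moves
// a site to the opposite checkerboard; in the stored half lattice that is
// (1)/2 = 0 reduced sites when (source_cb+ocb) is odd and (1+1)/2 = 1 when it
// is even, which is exactly the branch above.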
virtual int CheckerBoardDestination(int source_cb,int shift){
if ((shift+_fdimensions[0])&0x1) {
return 1-source_cb;
} else {
return source_cb;
}
};
GridRedBlackCartesian(std::vector<int> &dimensions,
std::vector<int> &simd_layout,
std::vector<int> &processor_grid) : GridBase(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_osites = 1;
_isites = 1;
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
_ldimensions[d] = _gdimensions[d]/_processors[d];
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
}
};
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx=_ostride[0]*((coor[0]/2)%_rdimensions[0]);
for(int d=1;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
};
};
}
#endif
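To make the constructor arithmetic above concrete, here is a sketch that builds both grid types; the 8^4 lattice, the four-lane SIMD layout and the single-process grid are illustrative choices, not values from the commit:

#include <Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8});  // _fdimensions
  std::vector<int> simd({1,1,2,2});  // _isites = 1*1*2*2 = 4 SIMD lanes
  std::vector<int> mpi ({1,1,1,1});  // one rank

  GridCartesian         Fine(latt, simd, mpi);
  GridRedBlackCartesian Half(latt, simd, mpi);  // halves dimension 0

  // 8^4 = 4096 sites over 4 lanes: oSites = 1024, gSites = 4096
  printf("full: oSites %d iSites %d gSites %d\n",
         Fine.oSites(), Fine.iSites(), Fine.gSites());
  // one checkerboard stored: oSites = 512
  printf("half: oSites %d\n", Half.oSites());

  Grid_finalize();
  return 0;
}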

103
lib/Grid_Communicator.h Normal file

@@ -0,0 +1,103 @@
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
namespace Grid {
class CartesianCommunicator {
public:
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // processor grid: number of ranks along each dimension
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#ifdef GRID_COMMS_MPI
MPI_Comm communicator;
#endif
// Constructor
CartesianCommunicator(std::vector<int> &pdimensions_in);
// Wraps MPI_Cart routines
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int IsBoss(void) { return _processor==0; };
int ThisRank(void) { return _processor; };
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & ProcessorGrid(void) { return _processors; };
int ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSum(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer; composite larger objects via the template below
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
};
}
#endif
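As a usage sketch of the reduction interface above (assuming the MPI build and a run with two ranks; the values are illustrative):

#include <Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> mpi({2,1,1,1});
  CartesianCommunicator comms(mpi);

  RealD local = comms.ThisRank() + 1.0;  // rank 0 holds 1.0, rank 1 holds 2.0
  comms.GlobalSum(local);                // every rank now holds 3.0

  ComplexD z(1.0,2.0);
  comms.GlobalSum(z);                    // forwards to GlobalSumVector on 2 reals

  if ( comms.IsBoss() ) printf("sum = %f\n", local);

  Grid_finalize();
  return 0;
}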

611
lib/Grid_Lattice.h Normal file

@@ -0,0 +1,611 @@
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H
#include "Grid.h"
namespace Grid {
// TODO: Indexing ()
// mac,real,imag
//
// Functionality required:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conj
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
//
extern int GridCshiftPermuteMap[4][16];
template<class vobj>
class Lattice
{
public:
GridBase *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
public:
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
Lattice(GridBase *grid) : _grid(grid) {
_odata.resize(_grid->oSites()); // resize, not reserve: sites are indexed directly below
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
#include <Grid_cshift.h>
template<class obj1,class obj2>
friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
// FIXME Performance difference between operator * and mult is troubling.
// Auto move constructor seems to lose surprisingly much.
// Site wise binary operations
// We eliminate a temporary object assignment if we use the mult,add,sub routines.
// For the operator versions we rely on move constructor to eliminate the
// vector copy back.
template<class obj1,class obj2,class obj3>
friend void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
template<class obj1,class obj2,class obj3>
friend void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
template<class obj1,class obj2,class obj3>
friend void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
template<class obj1,class obj2,class obj3>
friend void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
friend void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
friend void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss];
}
return ret;
};
template<class sobj>
inline Lattice<vobj> & operator = (const sobj & r){
#pragma omp parallel for
for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r;
}
return *this;
}
// Poke a scalar object into the SIMD array
template<class sobj>
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
// Optional to broadcast from node 0.
grid->Broadcast(0,s);
// Only the rank owning the site updates its local data
if ( rank == grid->ThisRank() ) {
std::vector<sobj> buf(Nsimd);
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
// extract-modify-merge cycle is the easiest way and this is not perf critical
extract(l._odata[odx],pointers);
buf[idx] = s;
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
merge(l._odata[odx],pointers);
}
return;
};
// Peek a scalar object from the SIMD array
template<class sobj>
friend void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
std::vector<sobj> buf(Nsimd);
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
extract(l._odata[odx],pointers);
s = buf[idx];
grid->Broadcast(rank,s);
return;
};
// FIXME Randomise; deprecate this
friend void random(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t v_len = l._grid->oSites()*sizeof(vobj);
size_t d_len = v_len/sizeof(Real);
for(int i=0;i<d_len;i++){
v_ptr[i]=drand48();
}
};
// FIXME for debug; deprecate this; made obsolete by
// LatticeCoordinate();
friend void lex_sites(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t o_len = l._grid->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
// FIXME Implement a consistent seed management strategy
friend void gaussian(Lattice<vobj> &l){
// Zero mean, unit variance. Seeding strategy is still the FIXME above.
std::default_random_engine generator;
std::normal_distribution<double> distribution(0.0,1.0);
Real *v_ptr = (Real *)&l._odata[0];
size_t v_len = l._grid->oSites()*sizeof(vobj);
size_t d_len = v_len/sizeof(Real);
for(int i=0;i<d_len;i++){
v_ptr[i]= distribution(generator);
}
};
// Unary functions and Unops
friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
Lattice<vobj> ret(r._grid);
#pragma omp parallel for
for(int ss=0;ss<r._grid->oSites();ss++){
ret._odata[ss]= -r._odata[ss];
}
return ret;
}
// *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
template<class T>
inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
}
template<class T>
inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
}
template<class T>
inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
inline friend Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = adj(lhs._odata[ss]);
}
return ret;
};
inline friend Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transpose(lhs._odata[ss]);
}
return ret;
};
inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conj(lhs._odata[ss]);
}
return ret;
};
// remove and insert a half checkerboard
friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.checkerboard = cb;
int ssh=0;
// NB: ssh carries a loop dependence, so this compaction must remain sequential
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
full._grid->oCoorFromOindex(coor,ss);
cbos=half._grid->CheckerBoard(coor);
if (cbos==cb) {
half._odata[ssh] = full._odata[ss];
ssh++;
}
}
}
friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.checkerboard;
int ssh=0;
// NB: ssh carries a loop dependence, so this compaction must remain sequential
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
full._grid->oCoorFromOindex(coor,ss);
cbos=half._grid->CheckerBoard(coor);
if (cbos==cb) {
full._odata[ss]=half._odata[ssh];
ssh++;
}
}
}
}; // class Lattice
template<class obj1,class obj2>
void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs._grid == rhs._grid);
assert(lhs.checkerboard == rhs.checkerboard);
}
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
// Lattice BinOp Lattice,
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
//NB mult performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
//NB add performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
//NB sub performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs-rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs)
-> Lattice<decltype(trace(lhs._odata[0]))>
{
Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = trace(lhs._odata[ss]);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto traceIndex(const Lattice<vobj> &lhs)
-> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
}
return ret;
};
template<int Index,class vobj>
inline auto transposeIndex(const Lattice<vobj> &lhs)
-> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
}
return ret;
};
// Fixme; this is problematic since the number of args is variable and
// may mismatch...
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs)
-> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss]);
}
return ret;
};
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs,int i)
-> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
}
return ret;
};
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs,int i,int j)
-> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline RealD norm2(const Lattice<vobj> &arg){
typedef typename vobj::scalar_type scalar;
typedef typename vobj::vector_type vector;
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
vnrm=zero;
for(int ss=0;ss<arg._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
}
vector vvnrm =TensorRemove(vnrm) ;
nrm = Reduce(vvnrm);
arg._grid->GlobalSum(nrm);
return real(nrm);
}
template<class vobj>
inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
{
typedef typename vobj::scalar_type scalar;
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
for(int ss=0;ss<left._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
nrm = Reduce(vnrm);
right._grid->GlobalSum(nrm);
return nrm;
}
/////////////////////////////////////////////////////
// Non site reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
template<class vobj>
inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
{
Lattice<decltype(real(z._odata[0]))> ret(z._grid);
#pragma omp parallel for
for(int ss=0;ss<z._grid->oSites();ss++){
ret._odata[ss] = real(z._odata[ss]);
}
return ret;
}
template<class vobj>
inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
{
Lattice<decltype(imag(z._odata[0]))> ret(z._grid);
#pragma omp parallel for
for(int ss=0;ss<z._grid->oSites();ss++){
ret._odata[ss] = imag(z._odata[ss]);
}
return ret;
}
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
}
#endif
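Putting the expression interface above together, a sketch (the lattice sizes and the use of the QCD typedefs from the next file are illustrative):

#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8}), simd({1,1,2,2}), mpi({1,1,1,1});
  GridCartesian grid(latt, simd, mpi);

  LatticeComplex x(&grid), y(&grid), z(&grid);
  random(x); random(y);

  z = x*y + y;           // operator forms; move construction avoids the copy back
  axpy(z, 2.0, x, y);    // named form: z = 2*x + y with no temporary
  RealD nrm = norm2(z);  // site-wise innerProduct, then a GlobalSum across ranks

  printf("norm2 %f\n", nrm);
  Grid_finalize();
  return 0;
}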

106
lib/Grid_QCD.h Normal file

@@ -0,0 +1,106 @@
#ifndef GRID_QCD_H
#define GRID_QCD_H
namespace Grid{
namespace QCD {
static const int Nc=3;
static const int Ns=4;
static const int Nd=4;
static const int CbRed =0;
static const int CbBlack=1;
//////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour
//
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where the Wilson operator
// naturally distributes across the 5th dimension.
//////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iColourMatrix<Complex > ColourMatrix;
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iSpinVector<Complex > SpinVector;
typedef iColourVector<Complex > ColourVector;
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iSpinVector<vComplex > vSpinVector;
typedef iColourVector<vComplex > vColourVector;
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSinglet<Complex > TComplex; // This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<vReal > vTReal;
typedef iSinglet<vInteger > vTInteger;
typedef iSinglet<Integer > TInteger;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vInteger> LatticeInteger; // Predicates for "where"
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vColourVector> LatticeColourVector;
///////////////////////////////////////////
// Physical names for things
///////////////////////////////////////////
typedef Lattice<vSpinColourVector> LatticeFermion;
typedef Lattice<vSpinColourMatrix> LatticePropagator;
typedef Lattice<vLorentzColourMatrix> LatticeGaugeField;
inline void LatticeCoordinate(LatticeInteger &l,int mu){
GridBase *grid = l._grid;
int Nsimd = grid->iSites();
std::vector<int> gcoor;
std::vector<Integer> mergebuf(Nsimd);
std::vector<Integer *> mergeptr(Nsimd);
for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
// grid->RankIndexToGlobalCoor(0,o,i,gcoor);
mergebuf[i]=gcoor[mu];
mergeptr[i]=&mergebuf[i];
}
merge(l._odata[o],mergeptr);
}
};
#include <Grid_predicated.h>
} //namespace QCD
} // Grid
#endif
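A sketch of the typedefs and the coordinate helper above in use (the grid parameters are illustrative):

#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8}), simd({1,1,2,2}), mpi({1,1,1,1});
  GridCartesian grid(latt, simd, mpi);

  LatticeFermion    psi(&grid);  // physical name for Lattice<vSpinColourVector>
  LatticeGaugeField U(&grid);    // Lattice<vLorentzColourMatrix>

  LatticeInteger t(&grid);
  LatticeCoordinate(t, Nd-1);    // global time coordinate at every site

  Grid_finalize();
  return 0;
}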

59
lib/Grid_aligned_allocator.h Normal file

@@ -0,0 +1,59 @@
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#include <immintrin.h>
namespace Grid {
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
alignedAllocator() throw() { }
alignedAllocator(const alignedAllocator&) throw() { }
template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
~alignedAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
const_pointer address(const_reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
// Should override allocate and deallocate
pointer allocate(size_type __n, const void* = 0)
{
//_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
// _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#ifdef AVX512
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#else
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#endif
return ptr;
}
void deallocate(pointer __p, size_type) {
#ifdef AVX512
free(__p);      // matches memalign above
#else
_mm_free(__p);  // matches _mm_malloc above
#endif
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool
operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
}; // namespace Grid
#endif
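A self-contained check of the allocator's contract (a sketch; note that construct() is a deliberate no-op, so elements start uninitialised):

#include <cstdio>
#include <cstdint>
#include <vector>
#include <Grid_aligned_allocator.h>

int main(void)
{
  std::vector<double, Grid::alignedAllocator<double> > v(1024);

  // Both branches of allocate() request 128-byte aligned storage
  uintptr_t base = (uintptr_t)&v[0];
  printf("base %p aligned128 %d\n", (void *)base, (int)(base % 128 == 0));
  return 0;
}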


@@ -0,0 +1,56 @@
#include "Grid.h"
namespace Grid {
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
_processors = processors;
_Nprocessors = 1; // single fake rank
// Require 1^N processor grid for fake
for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);
_processor = 0;// I am the one. The only one..
for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
exit(-1);
}
void CartesianCommunicator::Barrier(void)
{
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0; // single rank: our only neighbour is ourself
dest =0;
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
return 0;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}
}


@@ -0,0 +1,93 @@
#include "Grid.h"
#include <mpi.h>
namespace Grid {
// Should error check all MPI calls.
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator,&Size);
assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(float &f){
MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSum(double &d)
{
MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
MPI_Cart_rank (communicator, &coor[0], &rank);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request reqs[2];
MPI_Status OkeyDokey[2];
int rank = _processor;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
MPI_Waitall(2,reqs,OkeyDokey);
}
void CartesianCommunicator::Barrier(void)
{
MPI_Barrier(communicator);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
}
}
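The ShiftedRanks/SendToRecvFrom pair above is the halo primitive the Cshift machinery builds on. A minimal ring shift along one processor dimension might look like this sketch (assumes the MPI build launched on a 2x1x1x1 process grid):

#include <Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> mpi({2,1,1,1});
  CartesianCommunicator comms(mpi);

  int source, dest;
  comms.ShiftedRanks(0, 1, source, dest);  // neighbours one step along dim 0

  double xmit = comms.ThisRank();
  double recv = -1.0;
  comms.SendToRecvFrom(&xmit, dest, &recv, source, sizeof(double));

  printf("rank %d received %f\n", comms.ThisRank(), recv);

  Grid_finalize();
  return 0;
}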

264
lib/Grid_comparison.h Normal file

@@ -0,0 +1,264 @@
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {
// Generic list of functors
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs == rhs;
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs != rhs;
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs < rhs;
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs <= rhs;
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs > rhs;
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs >= rhs;
}
};
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs == rhs;
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs != rhs;
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs < rhs;
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs <= rhs;
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs > rhs;
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs >= rhs;
}
};
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Integer gets extra relational functions. Could also implement these for RealF, RealD etc..
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor>
inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs)
{
std::vector<Integer> vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vrhs(vInteger::Nsimd());
vInteger ret;
extract(lhs,vlhs);
extract(rhs,vrhs);
for(int s=0;s<vInteger::Nsimd();s++){
vlhs[s] = sop(vlhs[s],vrhs[s]);
}
merge(ret,vlhs);
return ret;
}
inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(slt<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sle<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator > (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sgt<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sge<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator == (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(seq<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator != (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sne<Integer,Integer>(),lhs,rhs);
}
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs,rhs._odata[ss]);
}
return ret;
}
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
}
#endif
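In use, the lattice comparisons above turn coordinate fields into integer masks, which is what the "where" predicates are for (a sketch; the grid parameters are illustrative):

#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8}), simd({1,1,2,2}), mpi({1,1,1,1});
  GridCartesian grid(latt, simd, mpi);

  LatticeInteger t(&grid), x(&grid);
  LatticeCoordinate(t, 3);  // time coordinate of each site
  LatticeCoordinate(x, 0);  // x coordinate of each site

  // Site-wise predicates: each element is 1 where the relation holds, else 0
  LatticeInteger diag  = (t == x);
  LatticeInteger upper = (t >  x);

  Grid_finalize();
  return 0;
}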

104
lib/Grid_config.h Normal file

@@ -0,0 +1,104 @@
/* Grid_config.h. Generated from Grid_config.h.in by configure. */
/* Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
#define AVX1 1
/* AVX2 */
/* #undef AVX2 */
/* AVX512 */
/* #undef AVX512 */
/* GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1
/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */
/* Define to 1 if you have the `gettimeofday' function. */
#define HAVE_GETTIMEOFDAY 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* Define to 1 if you have the <malloc.h> header file. */
/* #undef HAVE_MALLOC_H */
/* Define to 1 if you have the <malloc/malloc.h> header file. */
#define HAVE_MALLOC_MALLOC_H 1
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1
/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1
/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1
/* Name of package */
#define PACKAGE "grid"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"
/* Define to the full name of this package. */
#define PACKAGE_NAME "Grid"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Grid 1.0"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "grid"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.0"
/* SSE4 */
/* #undef SSE4 */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
/* Version number of package */
#define VERSION "1.0"
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
/* #undef _UINT32_T */
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
/* #undef _UINT64_T */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */
/* Define to the type of an unsigned integer type of width exactly 32 bits if
such a type exists and the standard includes do not define it. */
/* #undef uint32_t */
/* Define to the type of an unsigned integer type of width exactly 64 bits if
such a type exists and the standard includes do not define it. */
/* #undef uint64_t */

103
lib/Grid_config.h.in Normal file

@@ -0,0 +1,103 @@
/* Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
#undef AVX1
/* AVX2 */
#undef AVX2
/* AVX512 */
#undef AVX512
/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the <malloc.h> header file. */
#undef HAVE_MALLOC_H
/* Define to 1 if you have the <malloc/malloc.h> header file. */
#undef HAVE_MALLOC_MALLOC_H
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Name of package */
#undef PACKAGE
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* SSE4 */
#undef SSE4
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS
/* Version number of package */
#undef VERSION
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
#undef _UINT32_T
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
#undef _UINT64_T
/* Define to `unsigned int' if <sys/types.h> does not define. */
#undef size_t
/* Define to the type of an unsigned integer type of width exactly 32 bits if
such a type exists and the standard includes do not define it. */
#undef uint32_t
/* Define to the type of an unsigned integer type of width exactly 64 bits if
such a type exists and the standard includes do not define it. */
#undef uint64_t

16
lib/Grid_cshift.h Normal file

@@ -0,0 +1,16 @@
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_
#include <Grid_cshift_common.h>
#ifdef GRID_COMMS_NONE
#include <Grid_cshift_none.h>
#endif
#ifdef GRID_COMMS_FAKE
#include <Grid_cshift_fake.h>
#endif
#ifdef GRID_COMMS_MPI
#include <Grid_cshift_mpi.h>
#endif
#endif

326
lib/Grid_cshift_common.h Normal file

@@ -0,0 +1,326 @@
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride gather of SIMD objects.
// Offsets are computed from (n,b) so the collapsed loop nest is race free.
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o  = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
int bo = n*rhs._grid->_slice_block[dimension];  // offset in buffer
buffer[bo+b]=rhs._odata[so+o+b];
}
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Conditional packing advances bo data-dependently, so this branch stays sequential.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=rhs._odata[so+o+b];
bo++;
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
// Sequential: extract() advances the destination pointers as it goes.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
extract(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb & cbmask ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride scatter of SIMD objects.
// Offsets are computed from (n,b) so the collapsed loop nest is race free.
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o  = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
int bo = n*rhs._grid->_slice_block[dimension];  // offset in buffer
rhs._odata[so+o+b]=buffer[bo+b];
}
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Conditional unpacking advances bo data-dependently, so this branch stays sequential.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
// Sequential: merge() advances the source pointers as it goes.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
merge(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride copy of SIMD objects; o computed from n keeps the nest race free
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
}
}
}
}
friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride permute-copy of SIMD objects; o computed from n keeps the nest race free
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
}
}
}
}
//////////////////////////////////////////////////////
// Local to node Cshift
//////////////////////////////////////////////////////
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int gd = grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
return ret;
}
#endif
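
The wrap/num arithmetic in Cshift_local decides which destination slices pick up a SIMD-lane permute. A standalone model of just that arithmetic, with illustrative values:

  #include <cstdio>
  int main(void){
    int rd=4, sshift=5; // reduced dimension and checkerboarded shift (example values)
    for(int x=0;x<rd;x++){
      int wrap = sshift/rd; // number of times the shift wraps the reduced dimension
      int num  = sshift%rd;
      int permute_slice = (x < rd-num) ? wrap : 1-wrap;
      printf("x=%d permute_slice=%d\n",x,permute_slice);
    }
    return 0;
  }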

263
lib/Grid_cshift_mpi.h Normal file
View File

@ -0,0 +1,263 @@
#ifndef _GRID_MPI_CSHIFT_H_
#define _GRID_MPI_CSHIFT_H_
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) {
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
}
friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_comms(ret,rhs,dimension,shift,0x3);
} else {
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? 1 : 0;
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}
}
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs._grid;
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
#endif
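
The offnode test in Cshift_comms is pure integer arithmetic on slice indices. A standalone model with illustrative values (rd slices per rank, sshift already checkerboard-adjusted):

  #include <cstdio>
  int main(void){
    int rd=4, sshift=6;
    for(int x=0;x<rd;x++){
      int offnode   = ( x+sshift >= rd ); // source plane lives on another rank
      int sx        = ( x+sshift ) % rd;  // slice index on the owning rank
      int comm_proc = ( x+sshift ) / rd;  // how many ranks away the owner sits
      printf("x=%d offnode=%d sx=%d comm_proc=%d\n",x,offnode,sx,comm_proc);
    }
    return 0;
  }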

12
lib/Grid_cshift_none.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
Cshift_local(ret,rhs,dimension,shift);
return ret;
}
#endif

91
lib/Grid_init.cc Executable file
View File

@ -0,0 +1,91 @@
/****************************************************************************/
/* PAB: Signal magic. Processor state dump is x86-64 specific */
/****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <signal.h>
#include "Grid.h"
#undef __X86_64
namespace Grid {
void Grid_init(int *argc,char ***argv)
{
#ifdef GRID_COMMS_MPI
MPI_Init(argc,argv);
#endif
Grid_debug_handler_init();
}
void Grid_finalize(void)
{
#ifdef GRID_COMMS_MPI
MPI_Finalize();
#endif
}
double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{
printf("Caught signal %d\n",si->si_signo);
printf(" mem address %llx\n",(uint64_t)si->si_addr);
printf(" code %d\n",si->si_code);
#ifdef __X86_64
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(uint64_t)sc->rip);
#define REG(A) printf(" %s %lx\n",#A, sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
fflush(stdout);
if ( si->si_signo == SIGSEGV ) {
printf("Grid_sa_signal_handler: Oops... this was a sigsegv you naughty naughty programmer. Goodbye\n");
fflush(stdout);
exit(-1);
}
return;
};
void Grid_debug_handler_init(void)
{
struct sigaction sa,osa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO;
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
}
}
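
A minimal driver for the services defined here, as a sketch (the timing use of usecond() is illustrative):

  #include <Grid.h>
  #include <cstdio>
  int main(int argc,char **argv)
  {
    Grid::Grid_init(&argc,&argv); // MPI_Init when GRID_COMMS_MPI, plus signal handlers
    double t0 = Grid::usecond();
    // ... lattice work goes here ...
    double t1 = Grid::usecond();
    printf("elapsed %f us\n",t1-t0);
    Grid::Grid_finalize();
    return 0;
  }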

100
lib/Grid_math_type_mapper.h Normal file
View File

@ -0,0 +1,100 @@
#ifndef GRID_MATH_TYPE_MAPPER_H
#define GRID_MATH_TYPE_MAPPER_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
// Use of a helper class like this allows us to template specialise and "dress"
// other classes such as RealD == double, ComplexD == std::complex<double> with these
// traits.
//
// It is possible that we could do this more elegantly if I introduced a
// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in
// place of the type mapper?
//
// Not sure how to do this, but probably could be done with a research effort
// to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
//
//////////////////////////////////////////////////////////////////////////////////
template <class T> class GridTypeMapper {
public:
typedef typename T::scalar_type scalar_type;
typedef typename T::vector_type vector_type;
typedef typename T::tensor_reduced tensor_reduced;
enum { TensorLevel = T::TensorLevel };
};
//////////////////////////////////////////////////////////////////////////////////
// Recursion stops with these template specialisations
//////////////////////////////////////////////////////////////////////////////////
template<> class GridTypeMapper<RealF> {
public:
typedef RealF scalar_type;
typedef RealF vector_type;
typedef RealF tensor_reduced ;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<RealD> {
public:
typedef RealD scalar_type;
typedef RealD vector_type;
typedef RealD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexF> {
public:
typedef ComplexF scalar_type;
typedef ComplexF vector_type;
typedef ComplexF tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexD> {
public:
typedef ComplexD scalar_type;
typedef ComplexD vector_type;
typedef ComplexD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealF> {
public:
typedef RealF scalar_type;
typedef vRealF vector_type;
typedef vRealF tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealD> {
public:
typedef RealD scalar_type;
typedef vRealD vector_type;
typedef vRealD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexF> {
public:
typedef ComplexF scalar_type;
typedef vComplexF vector_type;
typedef vComplexF tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexD> {
public:
typedef ComplexD scalar_type;
typedef vComplexD vector_type;
typedef vComplexD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vInteger> {
public:
typedef Integer scalar_type;
typedef vInteger vector_type;
typedef vInteger tensor_reduced;
enum { TensorLevel = 0 };
};
}
#endif
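
The contract of the specialisations can be stated as compile-time checks; a sketch one could drop into a test (using the C++11 type_traits contemplated in the comment above):

  #include <type_traits>
  static_assert(std::is_same<Grid::GridTypeMapper<Grid::vComplexD>::scalar_type,
                             Grid::ComplexD>::value, "vComplexD reduces to ComplexD");
  static_assert(std::is_same<Grid::GridTypeMapper<Grid::vRealF>::vector_type,
                             Grid::vRealF>::value, "vRealF is its own vector_type");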

1542
lib/Grid_math_types.h Normal file

File diff suppressed because it is too large Load Diff

62
lib/Grid_predicated.h Normal file
View File

@ -0,0 +1,62 @@
#ifndef GRID_PREDICATED_H
#define GRID_PREDICATED_H
// Must implement the predicate gating the conditional assignment.
// Must be able to reduce the predicate down to a single vInteger per site.
// Must be able to require the type be iScalar x iScalar x ....
// give a GetVtype method in iScalar
// and blow away the tensor structures.
//
template<class vobj>
inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
conformable(iftrue,ret);
GridBase *grid=iftrue._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = grid->Nsimd();
const int words = sizeof(vobj)/sizeof(vector_type);
std::vector<Integer> mask(Nsimd);
std::vector<std::vector<scalar_type> > truevals (Nsimd,std::vector<scalar_type>(words) );
std::vector<std::vector<scalar_type> > falsevals(Nsimd,std::vector<scalar_type>(words) );
std::vector<scalar_type *> pointers(Nsimd);
#pragma omp parallel for
for(int ss=0;ss<iftrue._grid->oSites(); ss++){
for(int s=0;s<Nsimd;s++) pointers[s] = & truevals[s][0];
extract(iftrue._odata[ss] ,pointers);
for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
extract(iffalse._odata[ss] ,pointers);
extract(predicate._odata[ss],mask);
for(int s=0;s<Nsimd;s++){
if (mask[s]) pointers[s]=&truevals[s][0];
else pointers[s]=&falsevals[s][0];
}
merge(ret._odata[ss],pointers);
}
}
template<class vobj>
inline Lattice<vobj> where(const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid);
where(ret,predicate,iftrue,iffalse);
return ret;
}
#endif
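
Usage mirrors a per-site ternary, ret = mask ? a : b. A sketch (the grid, the mask construction and the field type are illustrative):

  LatticeInteger mask(grid);            // nonzero where the "true" branch should win
  Lattice<vComplex> a(grid), b(grid), ret(grid);
  ret = where(mask,a,b);                // allocating form
  where(ret,mask,a,b);                  // in-place form, avoids the temporary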

293
lib/Grid_simd.h Normal file
View File

@ -0,0 +1,293 @@
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////
#ifdef SSE4
#include <pmmintrin.h>
#endif
#if defined(AVX1) || defined (AVX2)
#include <immintrin.h>
#endif
#ifdef AVX512
#include <immintrin.h>
#endif
namespace Grid {
typedef float RealF;
typedef double RealD;
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; }
inline RealD conj(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
// conj already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
//conj already supported for complex
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
class Zero{};
static Zero zero;
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
template<> inline void zeroit(ComplexF &arg){ arg=0; };
template<> inline void zeroit(ComplexD &arg){ arg=0; };
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
#if defined (SSE4)
typedef __m128 fvec;
typedef __m128d dvec;
typedef __m128 cvec;
typedef __m128d zvec;
typedef __m128i ivec;
#endif
#if defined (AVX1) || defined (AVX2)
typedef __m256 fvec;
typedef __m256d dvec;
typedef __m256 cvec;
typedef __m256d zvec;
typedef __m256i ivec;
#endif
#if defined (AVX512)
typedef __m512 fvec;
typedef __m512d dvec;
typedef __m512 cvec;
typedef __m512d zvec;
typedef __m512i ivec;
#endif
#if defined (QPX)
typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
typedef float cvec __attribute__ ((vector_size (16)));
typedef vector4double dvec;
typedef vector4double zvec;
#endif
#if defined (AVX1) || defined (AVX2) || defined (AVX512)
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above// What about SSE?
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
#else
inline void v_prefetch0(int size, const char *ptr){};
#endif
/////////////////////////////////////////////////////////////////
// Generic extract/merge/permute
/////////////////////////////////////////////////////////////////
template<class vsimd,class scalar>
inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
// FIXME: bounce off stack is painful
// temporary hack while I figure out better way.
// There are intrinsics to do this work without the storage.
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
vstore(y,&buf[0]);
for(int i=0;i<Nextr;i++){
*extracted[i] = buf[i*s];
extracted[i]++;
}
};
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar> buf(Nsimd);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
buf[i*s+ii]=*extracted[i];
}
extracted[i]++;
}
vset(y,&buf[0]);
};
template<class vsimd,class scalar>
inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
// FIXME: bounce off stack is painful
// temporary hack while I figure out better way.
// There are intrinsics to do this work without the storage.
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
vstore(y,&buf[0]);
for(int i=0;i<Nextr;i++){
extracted[i] = buf[i*s];
}
};
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar> buf(Nsimd);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
buf[i*s+ii]=extracted[i];
}
}
vset(y,&buf[0]);
};
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
//////////////////////////////////////////////////////////
template<class vsimd>
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
switch (perm){
#if defined(AVX1)||defined(AVX2)
// 8x32 bits=>3 permutes
case 2: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
#endif
#ifdef SSE4
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
#endif
#ifdef AVX512
// 16 floats=> permutes
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
case 3: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
case 2: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
#endif
#ifdef QPX
#error not implemented
#endif
default: assert(0); break;
}
};
};
#include <Grid_vInteger.h>
#include <Grid_vRealF.h>
#include <Grid_vRealD.h>
#include <Grid_vComplexF.h>
#include <Grid_vComplexD.h>
namespace Grid {
// NB: Template the following on "type Complex" and then implement *,+,- for
// ComplexF, ComplexD, RealF, RealD above to
// get full generality of binops with scalars.
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
typedef vRealD vReal;
typedef vComplexD vComplex;
typedef std::complex<Real> Complex;
#else
typedef RealF Real;
typedef vRealF vReal;
typedef vComplexF vComplex;
typedef std::complex<Real> Complex;
#endif
}
#endif
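
extract and merge are inverses whenever the scalar list covers every lane, which is how the cshift and predication code round-trips vectors through scalar buffers. A sketch using the vComplexD interface (a static Nsimd() lane count is assumed):

  using namespace Grid;
  vComplexD v; vsplat(v,ComplexD(1.0,2.0));        // every lane = 1+2i
  std::vector<ComplexD> lanes(vComplexD::Nsimd());
  extract(v,lanes);                                // vector -> one scalar per lane
  lanes[0] = ComplexD(0.0,0.0);                    // edit a single lane scalar-side
  merge(v,lanes);                                  // scalars -> vector again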

351
lib/Grid_stencil.h Normal file
View File

@ -0,0 +1,351 @@
#ifndef GRID_STENCIL_H
#define GRID_STENCIL_H
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so need
// additional stencil support.
//
// Stencil based code will pre-exchange haloes and use a table lookup for neighbours.
// This will be done with generality to allow easier efficient implementations.
// Overlap of comms and compute could be semi-automated by tabulating which sites are
// off-node connected and which are purely local.
//
// Lattice <foo> could also allocate haloes which get used for stencil code.
//
// Grid could create a neighbour index table for a given stencil.
//
// Could also implement CovariantCshift, to fuse the loops and enhance performance.
//
//
// General stencil computation:
//
// Generic services
// 0) Prebuild neighbour tables
// 1) Compute sizes of all haloes/comms buffers; allocate them.
//
// 2) Gather all faces, and communicate.
// 3) Loop over result sites, giving nbr index/offnode info for each
//
// Could take a staged approach:
// SpinProjectFaces
// start comms
// complete comms
// Reconstruct Umu
//
//
//////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
struct CommsRequest {
int words;
int unified_buffer_offset;
int tag;
int to_rank;
int from_rank;
} ;
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
int _checkerboard;
int _npoints; // Move to template param?
GridBase * _grid;
// npoints of these
std::vector<int> _directions;
std::vector<int> _distances;
std::vector<int> _comm_buf_size;
std::vector<int> _permute_type;
// npoints x Osites() of these
std::vector<std::vector<int> > _offsets;
std::vector<std::vector<int> > _is_local;
std::vector<std::vector<int> > _permute;
int _unified_buffer_size;
int _request_count;
std::vector<CommsRequest> CommsRequests;
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances);
// Add to tables for various cases; is this mistaken? Only local if 1 proc in dim.
// Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
template<class vobj> void HaloExchange(Lattice<vobj> &source,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0;
// Gather all comms buffers
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
for(int point = 0 ; point < _npoints; point++) {
printf("Point %d \n",point);fflush(stdout);
int dimension = _directions[point];
int displacement = _distances[point];
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (displacement+fd)%fd;
int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
assert (checkerboard== _checkerboard);
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
// Gather phase
int sshift [2];
if ( comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
printf("splice 0x3 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
} else {
printf("NO splice 0x3 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
}
} else {
if(splice_dim){
printf("splice 0x1,2 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
} else {
printf("NO splice 0x1,2 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
}
}
}
}
}
template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=_grid;
assert(rhs._grid==_grid);
// conformable(_grid,rhs._grid);
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (offnode) {
printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
// FIXME Implement asynchronous send & also avoid buffer copy
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
for(int i=0;i<buffer_size;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
}
u_comm_offset+=buffer_size;
printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
}
}
}
template<class vobj>
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
{
const int Nsimd = _grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=_grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
/* FIXME ALTERNATE BUFFER DETERMINATION */
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(_grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*_grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
_grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
_grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
// Here we don't want to scatter, just place into a buffer.
for(int i=0;i<buffer_size;i++){
merge(u_comm_buf[u_comm_offset+i],pointers);
}
u_comm_offset+=buffer_size; // advance past this plane in the unified buffer
}
}
}
};
}
#endif
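
A construction sketch for a nearest-neighbour stencil (the grid, source field, element type vobj and checkerboard are assumed to exist; only the constructor and HaloExchange signatures come from this header):

  // 8-point stencil in 4-d: displacement +1 and -1 in each dimension
  std::vector<int> directions({0,1,2,3, 0,1,2,3});
  std::vector<int> distances ({1,1,1,1, -1,-1,-1,-1});
  CartesianStencil stencil(grid,8,checkerboard,directions,distances);
  std::vector<vobj,alignedAllocator<vobj> > comm_buf;
  stencil.HaloExchange(source,comm_buf); // gathers faces; comm_buf is resized as needed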

258
lib/Grid_stencil_common.cc Normal file
View File

@ -0,0 +1,258 @@
#include "Grid.h"
namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _offsets(npoints),
_is_local(npoints),
_comm_buf_size(npoints),
_permute_type(npoints),
_permute(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
CommsRequests.resize(0);
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_offsets[i].resize( osites);
_is_local[i].resize(osites);
_permute[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
}
}
}
}
void CartesianStencil::Local (int point, int dimension,int shift,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
}
}
void CartesianStencil::Comms (int point,int dimension,int shift,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// sent to one or more remote nodes.
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
CommsRequest cr;
cr.tag = _request_count++;
cr.words = words;
cr.unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
CommsRequests.push_back(cr);
ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_offsets [point][lo+o+b]=ro+o+b;
_is_local[point][lo+o+b]=1;
_permute [point][lo+o+b]=permute;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_offsets [point][lo+o+b]=ro+o+b;
_is_local[point][lo+o+b]=1;
_permute [point][lo+o+b]=permute;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_offsets [point][so+o+b]=offset+(bo++);
_is_local[point][so+o+b]=0;
_permute [point][so+o+b]=0;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_offsets [point][so+o+b]=offset+(bo++);
_is_local[point][so+o+b]=0;
_permute [point][so+o+b]=0;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
}
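
The tables are consumed site by site: _offsets says where the neighbour lives, _is_local whether that index points into the lattice or the unified comms buffer, and _permute whether lanes must be reordered. A sketch of a hypothetical consuming kernel (not part of this commit):

  for(int ss=0;ss<grid->oSites();ss++){
    for(int point=0;point<stencil._npoints;point++){
      vobj nbr;
      if ( stencil._is_local[point][ss] ) {
        nbr = source._odata[ stencil._offsets[point][ss] ]; // on-node neighbour
        // stencil._permute[point][ss] would trigger a lane permute here
      } else {
        nbr = comm_buf[ stencil._offsets[point][ss] ];      // from the halo buffer
      }
      // ... accumulate nbr into the result at ss ...
    }
  }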

12
lib/Grid_summation.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef GRID_SUMMATION_H
#define GRID_SUMMATION_H
template<class vobj>
inline void sumBlocks(Lattice<vobj> &coarseData,const Lattice<vobj &fineData)
{
GridBase * fine = findData._grid;
GridBase * coarse= findData._grid;
return;
}
#endif
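
sumBlocks is a stub at this point; judging by the coarse/fine naming, the intent is a block-wise reduction of the fine field into the coarse one (an assumption, not confirmed by this commit). A one-dimensional scalar model of that reduction:

  #include <vector>
  // Hypothetical model: fine sites fall into coarse blocks of 'block' sites each;
  // each coarse value is the sum over its block.
  std::vector<double> sumBlocks1d(const std::vector<double> &fine,int block){
    std::vector<double> coarse(fine.size()/block,0.0);
    for(std::size_t s=0;s<fine.size();s++) coarse[s/block]+=fine[s];
    return coarse;
  }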

346
lib/Grid_vComplexD.h Normal file
View File

@ -0,0 +1,346 @@
#ifndef VCOMPLEXD_H
#define VCOMPLEXD_H
#include "Grid.h"
#include "Grid_vComplexF.h"
namespace Grid {
class vComplexD {
public:
zvec v;
public:
typedef zvec vector_type;
typedef ComplexD scalar_type;
vComplexD & operator = ( Zero & z){
vzero(*this);
return (*this);
}
vComplexD(){};
vComplexD(ComplexD a){
vsplat(*this,a);
};
vComplexD(double a){
vsplat(*this,ComplexD(a));
};
///////////////////////////////////////////////
// mac, mult, sub, add, adj
// Should do an AVX2 version with mac.
///////////////////////////////////////////////
friend inline void mac (vComplexD * __restrict__ y,const vComplexD * __restrict__ a,const vComplexD *__restrict__ x) {*y = (*a)*(*x)+(*y);};
friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone (vComplexD &ret){ vsplat(ret,1.0,0.0);}
friend inline void vzero (vComplexD &ret){ vsplat(ret,0.0,0.0);}
friend inline void vcomplex_i(vComplexD &ret){ vsplat(ret,0.0,1.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vComplexD operator + (vComplexD a, vComplexD b)
{
vComplexD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_add(a.v,b.v);
#endif
return ret;
};
friend inline vComplexD operator - (vComplexD a, vComplexD b)
{
vComplexD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_sub(a.v,b.v);
#endif
return ret;
};
friend inline vComplexD operator * (vComplexD a, vComplexD b)
{
vComplexD ret;
// Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
// From the Intel optimisation reference guide:
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
VSHUFPD (VEX.256 encoded version)
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
*/
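// Worked lane example of the AVX path below (a sketch; lane 0 holds the
// real part, as vsplat stores it):
//   ymm0 = (ar,ar) * (br,bi)   = (ar*br, ar*bi)
//   ymm1 = (bi,br), ymm2 = (ai,ai), ymm1*ymm2 = (ai*bi, ai*br)
//   addsub(ymm0,ymm1)          = (ar*br - ai*bi, ar*bi + ai*br) == a*b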
#if defined (AVX1)|| defined (AVX2)
zvec ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5); // ymm1 <- br,bi b'01,01
ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_pd(ymm0,ymm1);
#endif
#ifdef SSE4
zvec ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_pd(b.v,b.v,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a.v,a.v,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm_addsub_pd(ymm0,ymm1);
#endif
#ifdef AVX512
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
zvec vzero,ymm0,ymm1,real,imag;
vzero = _mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a.v, _MM_SWIZ_REG_CDAB); //
real = _mm512_mask_or_epi64(a.v, 0xAA,vzero, ymm0); // 8 double lanes -> 8-bit mask
imag = _mm512_mask_sub_pd(a.v, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b.v);
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
/* Imag OK */
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(vComplexD &y,vComplexD b,int perm)
{
Gpermute<vComplexD>(y,b,perm);
}
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
{
Gmerge<vComplexD,ComplexD >(y,extracted);
}
friend inline void extract(const vComplexD &y,std::vector<ComplexD *> &extracted)
{
Gextract<vComplexD,ComplexD>(y,extracted);
}
friend inline void merge(vComplexD &y,std::vector<ComplexD > &extracted)
{
Gmerge<vComplexD,ComplexD >(y,extracted);
}
friend inline void extract(const vComplexD &y,std::vector<ComplexD > &extracted)
{
Gextract<vComplexD,ComplexD>(y,extracted);
}
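// extract/merge are intended as inverses (sketch, hypothetical values):
//   std::vector<ComplexD> lanes(vComplexD::Nsimd());
//   extract(z,lanes);  // lanes[i] <- i-th complex lane of z
//   merge(z,lanes);    // reassemble; the round trip leaves z unchanged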
///////////////////////
// Splat
///////////////////////
friend inline void vsplat(vComplexD &ret,ComplexD c){
double a= real(c);
double b= imag(c);
vsplat(ret,a,b);
}
friend inline void vsplat(vComplexD &ret,double rl,double ig){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(ig,rl,ig,rl);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(ig,rl);
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
#endif
#ifdef QPX
ret.v = {rl,ig,rl,ig}; // lane 0 holds the real part, matching the AVX/SSE paths
#endif
}
friend inline void vset(vComplexD &ret,ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a[0].imag(),a[0].real());
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
// Note v has a0 a1 a2 a3
#endif
#ifdef QPX
ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag()};
#endif
}
friend inline void vstore(const vComplexD &ret, ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd((double *)a,ret.v);
#endif
#ifdef SSE4
_mm_store_pd((double *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_pd((double *)a,ret.v);
//Note v has a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
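// vset/vstore round trip (sketch, AVX case where Nsimd()==2; the target
// array must be suitably aligned, 32 bytes for AVX):
//   ComplexD in[2] = { ComplexD(1,2), ComplexD(3,4) };
//   vComplexD v; vset(v,in);
//   ComplexD out[2] __attribute__ ((aligned(32))); vstore(v,out); // out[i]==in[i]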
friend inline void vprefetch(const vComplexD &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
////////////////////////
// Conjugate
////////////////////////
friend inline vComplexD conj(const vComplexD &in){
vComplexD ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
// addsub_pd against zero: 0+in.v[3], 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
__m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
#endif
#ifdef SSE4
ret.v = _mm_xor_pd(in.v,_mm_set_pd(-0.0,0.0)); // flip the sign bit of the imaginary lane only
#endif
#ifdef AVX512
// Xeon does not have fmaddsub or addsub
// with mask 0xa (1010), v[0] -v[1] v[2] -v[3] ....
ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v); // 8 double lanes -> 8-bit mask
#endif
#ifdef QPX
assert(0);
#endif
return ret;
}
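// conj negates the imaginary lane only: conj(ar + i*ai) = ar - i*ai.
// The AVX path has no subadd instruction, hence the shuffle sandwich:
//   shuffle(in) = (ai,ar); addsub(0,.) = (-ai,ar); shuffle(.) = (ar,-ai)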
// REDUCE FIXME must be a cleaner implementation
friend inline ComplexD Reduce(const vComplexD & in)
{
#if defined (AVX1) || defined(AVX2)
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
__attribute__ ((aligned(32))) double c_[4];
_mm256_store_pd(c_,in.v);
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
#endif
#ifdef AVX512
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
#endif
#ifdef QPX
#endif
}
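// Reduce folds the complex lanes into one scalar; for AVX (two lanes)
//   Reduce( (1+2i, 3+4i) ) == (1+3) + (2+4)i == 4+6i.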
// Unary negation
friend inline vComplexD operator -(const vComplexD &r) {
vComplexD ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
inline vComplexD &operator *=(const vComplexD &r) {
*this = (*this)*r;
return *this;
}
inline vComplexD &operator +=(const vComplexD &r) {
*this = *this+r;
return *this;
}
inline vComplexD &operator -=(const vComplexD &r) {
*this = *this-r;
return *this;
}
public:
static int Nsimd(void) { return sizeof(zvec)/sizeof(double)/2;}
};
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
typedef vComplexD vDComplex;
inline void zeroit(vComplexD &z){ vzero(z);}
inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r)
{
return l*r;
}
inline vComplexD trace(const vComplexD &arg){
return arg;
}
/////////////////////////////////////////////////////////////////////////
//// Generic routine to promote object<complex> -> object<vcomplex>
//// Supports the array reordering transformation that gives me SIMD utilisation
///////////////////////////////////////////////////////////////////////////
/*
template<template<class> class object>
inline object<vComplex> splat(object<Complex >s){
object<vComplex> ret;
vComplex * v_ptr = (vComplex *)& ret;
Complex * s_ptr = (Complex *) &s;
for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
vsplat(*(v_ptr++),*(s_ptr++));
}
return ret;
}
*/
}
#endif

384
lib/Grid_vComplexF.h Normal file
View File

@ -0,0 +1,384 @@
#ifndef VCOMPLEXF
#define VCOMPLEXF
#include "Grid.h"
namespace Grid {
/*
inline void Print(const char *A,cvec c) {
float *fp=(float *)&c;
printf(A);
printf(" %le %le %le %le %le %le %le %le\n",
fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
}
*/
class vComplexF {
// protected:
public:
cvec v;
public:
static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
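// e.g. AVX: sizeof(cvec)==32, so 32/sizeof(float)/2 == 4 complex floats
// per register; SSE4 gives 2, AVX512 gives 8.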
public:
typedef cvec vector_type;
typedef ComplexF scalar_type;
vComplexF & operator = ( Zero & z){
vzero(*this);
return (*this);
}
vComplexF(){};
vComplexF(ComplexF a){
vsplat(*this,a);
};
vComplexF(double a){
vsplat(*this,ComplexF(a));
};
///////////////////////////////////////////////
// mac, mult, sub, add, adj
// Should do an AVX2 version with mac.
///////////////////////////////////////////////
friend inline void mac (vComplexF * __restrict__ y,const vComplexF * __restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone(vComplexF &ret) { vsplat(ret,1.0,0.0); }
friend inline void vzero(vComplexF &ret) { vsplat(ret,0.0,0.0); }
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vComplexF operator + (vComplexF a, vComplexF b)
{
vComplexF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_ps(a.v,b.v);
#endif
#ifdef QPX
#error
#endif
return ret;
};
friend inline vComplexF operator - (vComplexF a, vComplexF b)
{
vComplexF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_ps(a.v,b.v);
#endif
#ifdef QPX
#error
#endif
return ret;
};
friend inline vComplexF operator * (vComplexF a, vComplexF b)
{
vComplexF ret;
// Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
// From the Intel optimisation reference guide:
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
*/
#if defined (AVX1)|| defined (AVX2)
cvec ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_ps(ymm0,ymm1);
#endif
#ifdef SSE4
cvec ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm_addsub_ps(ymm0,ymm1);
#endif
#ifdef AVX512
//
cvec vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a.v, _MM_SWIZ_REG_CDAB); //
real = _mm512_mask_or_epi32(a.v, 0xAAAA,vzero, ymm0);
imag = _mm512_mask_sub_ps(a.v, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b.v);
ymm0 = _mm512_swizzle_ps(b.v, _MM_SWIZ_REG_CDAB); // OK
ret.v = _mm512_fmadd_ps(ymm0,imag,ymm1);
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vset(vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef AVX512
ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
#endif
}
///////////////////////
// Splat
///////////////////////
friend inline void vsplat(vComplexF &ret,ComplexF c){
float a= real(c);
float b= imag(c);
vsplat(ret,a,b);
}
friend inline void vstore(const vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps((float *)a,ret.v);
#endif
#ifdef SSE4
_mm_store_ps((float *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_ps((float *)a,ret.v);
//Note v has a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vComplexF &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
friend inline void vsplat(vComplexF &ret,float a,float b){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(b,a,b,a); // set lists the highest lane first; the real part lands in lane 0
#endif
#ifdef AVX512
ret.v = _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
#endif
#ifdef QPX
ret.v = {a,b,a,b};
#endif
}
friend inline ComplexF Reduce(const vComplexF & in)
{
#ifdef SSE4
#error
#endif
#if defined (AVX1) || defined(AVX2)
// FIXME this is inefficient; should use a hadd-based horizontal reduction
__attribute__ ((aligned(32))) float c_[8];
_mm256_store_ps(c_,in.v);
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
#endif
#ifdef AVX512
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
#endif
#ifdef QPX
#endif
}
friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va*b;
}
friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
return a*b;
}
/*
template<class real>
friend inline vComplexF operator * (vComplexF b,const real &a){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return va*b;
}
template<class real>
friend inline vComplexF operator * (const real &a,vComplexF b){
return a*b;
}
friend inline vComplexF operator + (const Complex &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va+b;
}
friend inline vComplexF operator + (vComplexF b,const Complex &a){
return a+b;
}
template<class real>
friend inline vComplexF operator + (vComplexF b,const real &a){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return va+b;
}
template<class real>
friend inline vComplexF operator + (const real &a,vComplexF b){
return a+b;
}
friend inline vComplexF operator - (const Complex &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va-b;
}
friend inline vComplexF operator - (vComplexF b,const Complex &a){
vComplexF va;
vsplat(va,a);
return b-va;
}
template<class real>
friend inline vComplexF operator - (vComplexF b,const real &a){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return b-va;
}
template<class real>
friend inline vComplexF operator - (const real &a,vComplexF b){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return va-b;
}
*/
///////////////////////
// Conjugate
///////////////////////
friend inline vComplexF conj(const vComplexF &in){
vComplexF ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
cvec tmp;
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif
#ifdef SSE4
ret.v = _mm_xor_ps(in.v,_mm_set_ps(-0.f,0.f,-0.f,0.f)); // flip the sign bits of the imaginary lanes only
#endif
#ifdef AVX512
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag
#endif
#ifdef QPX
assert(0);
#endif
return ret;
}
// Unary negation
friend inline vComplexF operator -(const vComplexF &r) {
vComplexF ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
inline vComplexF &operator *=(const vComplexF &r) {
*this = (*this)*r;
return *this;
}
inline vComplexF &operator +=(const vComplexF &r) {
*this = *this+r;
return *this;
}
inline vComplexF &operator -=(const vComplexF &r) {
*this = *this-r;
return *this;
}
friend inline void permute(vComplexF &y,vComplexF b,int perm)
{
Gpermute<vComplexF>(y,b,perm);
}
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
{
Gmerge<vComplexF,ComplexF >(y,extracted);
}
friend inline void extract(const vComplexF &y,std::vector<ComplexF *> &extracted)
{
Gextract<vComplexF,ComplexF>(y,extracted);
}
friend inline void merge(vComplexF &y,std::vector<ComplexF > &extracted)
{
Gmerge<vComplexF,ComplexF >(y,extracted);
}
friend inline void extract(const vComplexF &y,std::vector<ComplexF > &extracted)
{
Gextract<vComplexF,ComplexF>(y,extracted);
}
};
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
{
return conj(l)*r;
}
inline void zeroit(vComplexF &z){ vzero(z);}
inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
{
return l*r;
}
inline vComplexF trace(const vComplexF &arg){
return arg;
}
}
#endif

259
lib/Grid_vInteger.h Normal file
View File

@ -0,0 +1,259 @@
#ifndef VINTEGER_H
#define VINTEGER_H
#include "Grid.h"
namespace Grid {
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
typedef uint32_t Integer;
class vInteger {
protected:
public:
ivec v;
typedef ivec vector_type;
typedef Integer scalar_type;
vInteger(){};
vInteger & operator = (const Zero & z){
vzero(*this);
return (*this);
}
vInteger(Integer a){
vsplat(*this,a);
};
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vInteger operator + ( vInteger a, vInteger b)
{
vInteger ret;
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a.v,0);
b0 = _mm256_extractf128_si256(b.v,0);
a1 = _mm256_extractf128_si256(a.v,1);
b1 = _mm256_extractf128_si256(b.v,1);
a0 = _mm_add_epi32(a0,b0);
a1 = _mm_add_epi32(a1,b1);
ret.v = _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
ret.v = _mm256_add_epi32(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_epi32(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_epi32(a.v,b.v);
#endif
#ifdef QPX
// Implement as array of ints is only option
#error
#endif
return ret;
};
friend inline vInteger operator - ( vInteger a, vInteger b)
{
vInteger ret;
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a.v,0);
b0 = _mm256_extractf128_si256(b.v,0);
a1 = _mm256_extractf128_si256(a.v,1);
b1 = _mm256_extractf128_si256(b.v,1);
a0 = _mm_sub_epi32(a0,b0);
a1 = _mm_sub_epi32(a1,b1);
ret.v = _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
ret.v = _mm256_sub_epi32(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_epi32(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_epi32(a.v,b.v);
#endif
#ifdef QPX
// Implement as array of ints is only option
#error
#endif
return ret;
};
friend inline vInteger operator * ( vInteger a, vInteger b)
{
vInteger ret;
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a.v,0);
b0 = _mm256_extractf128_si256(b.v,0);
a1 = _mm256_extractf128_si256(a.v,1);
b1 = _mm256_extractf128_si256(b.v,1);
// mullo keeps the low 32 bits of each element-wise product;
// mul_epi32 would widen the even lanes to 64-bit results instead
a0 = _mm_mullo_epi32(a0,b0);
a1 = _mm_mullo_epi32(a1,b1);
ret.v = _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
ret.v = _mm256_mullo_epi32(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_mullo_epi32(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_mullo_epi32(a.v,b.v);
#endif
#ifdef QPX
// Implement as array of ints is only option
#error
#endif
return ret;
};
///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions
///////////////////////////////////////////////
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);}
friend inline void mac (vInteger &y,const vInteger a,const vInteger x){
y = a*x+y;
}
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone (vInteger &ret){vsplat(ret,1);}
friend inline void vzero(vInteger &ret){vsplat(ret,0);}
friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);}
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
/////////////////////////////////////////////////////
// Broadcast a value across Nsimd copies.
/////////////////////////////////////////////////////
friend inline void vsplat(vInteger &ret,scalar_type a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set1_epi32(a);
#endif
#ifdef SSE4
ret.v = _mm_set1_epi32(a);
#endif
#ifdef AVX512
ret.v = _mm512_set1_epi32(a);
#endif
#ifdef QPX
#error
#endif
}
friend inline void vset(vInteger &ret,scalar_type *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_epi32(a[3],a[2],a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef QPX
#error
#endif
}
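// The _mm*_set_epi32 intrinsics list elements from the highest lane down,
// so each path above leaves lane i holding a[i] (sketch):
//   Integer in[4] = {0,1,2,3};
//   vInteger v; vset(v,in);   // lane i == i on SSE4, likewise on wider targets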
friend inline void vstore(const vInteger &ret, Integer *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_si256((__m256i*)a,ret.v);
#endif
#ifdef SSE4
_mm_store_si128((__m128i *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_si512(a,ret.v);
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vInteger &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
// Unary negation
friend inline vInteger operator -(const vInteger &r) {
vInteger ret;
vzero(ret);
ret = ret - r;
return ret;
}
friend inline Integer Reduce(const vInteger & in)
{
// unimplemented
assert(0);
}
// *=,+=,-= operators
inline vInteger &operator *=(const vInteger &r) {
*this = (*this)*r;
return *this;
}
inline vInteger &operator +=(const vInteger &r) {
*this = *this+r;
return *this;
}
inline vInteger &operator -=(const vInteger &r) {
*this = *this-r;
return *this;
}
friend inline void permute(vInteger &y,const vInteger b,int perm)
{
Gpermute<vInteger>(y,b,perm);
}
friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
{
Gmerge<vInteger,Integer>(y,extracted);
}
friend inline void extract(const vInteger &y,std::vector<Integer *> &extracted)
{
Gextract<vInteger,Integer>(y,extracted);
}
friend inline void merge(vInteger &y,std::vector<Integer> &extracted)
{
Gmerge<vInteger,Integer>(y,extracted);
}
friend inline void extract(const vInteger &y,std::vector<Integer> &extracted)
{
Gextract<vInteger,Integer>(y,extracted);
}
public:
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
};
inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
inline void zeroit(vInteger &z){ vzero(z);}
inline vInteger outerProduct(const vInteger &l, const vInteger& r)
{
return l*r;
}
}
#endif

259
lib/Grid_vRealD.h Normal file
View File

@ -0,0 +1,259 @@
#ifndef VREALD_H
#define VREALD_H
#include "Grid.h"
namespace Grid {
class vRealD {
public:
dvec v; // dvec is double precision vector
public:
typedef dvec vector_type;
typedef RealD scalar_type;
vRealD(){};
vRealD(RealD a){
vsplat(*this,a);
};
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealD adj(const vRealD &in) { return in; }
friend inline vRealD conj(const vRealD &in){ return in; }
friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4)
y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
// accelerates multiply accumulate, but not general multiply add
y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef AVX512
// fused multiply-add on the full double-precision vector
y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef QPX
y.v = vec_madd(a.v,x.v,y.v);
#endif
}
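// mac() is a lane-wise fused multiply-add, y <- a*x + y; on AVX2/AVX512
// it maps onto a single FMA3 instruction. Sketch (hypothetical values):
//   vRealD y(1.0), a(2.0), x(3.0);
//   mac(y,a,x);   // every lane of y now holds 7.0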
//////////////////////////////////
// Initialise to 1,0
//////////////////////////////////
friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vRealD operator + (vRealD a, vRealD b)
{
vRealD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_add(a.v,b.v);
#endif
return ret;
};
friend inline vRealD operator - (vRealD a, vRealD b)
{
vRealD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_sub(a.v,b.v);
#endif
return ret;
};
friend inline vRealD operator * (vRealD a, vRealD b)
{
vRealD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_mul_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_mul_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_mul_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(vRealD &y,vRealD b,int perm)
{
Gpermute<vRealD>(y,b,perm);
}
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
{
Gmerge<vRealD,RealD >(y,extracted);
}
friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
{
Gextract<vRealD,RealD>(y,extracted);
}
friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
{
Gmerge<vRealD,RealD >(y,extracted);
}
friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
{
Gextract<vRealD,RealD>(y,extracted);
}
friend inline void vsplat(vRealD &ret,double a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a,a,a,a);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a,a);
#endif
#ifdef AVX512
ret.v = _mm512_set1_pd(a);
#endif
#ifdef QPX
ret.v = {a,a,a,a};
#endif
}
friend inline void vset(vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
ret.v = {a[0],a[1],a[2],a[3]};
#endif
}
friend inline void vstore(const vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd(a,ret.v);
#endif
#ifdef SSE4
_mm_store_pd(a,ret.v);
#endif
#ifdef AVX512
_mm512_store_pd(a,ret.v);
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vRealD &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
// Unary negation
friend inline vRealD operator -(const vRealD &r) {
vRealD ret;
vzero(ret);
ret = ret - r;
return ret;
}
friend inline RealD Reduce(const vRealD & in)
{
#if defined (AVX1) || defined(AVX2)
typedef union {
uint64_t l;
double d;
} my_conv_t;
my_conv_t converter;
// alternative hadd-based reduce_add (kept for reference):
/*
__attribute__ ((aligned(32))) double c_[16];
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp = 3+2,1+0,1+0,3+2
hadd = _mm256_hadd_pd(tmp,tmp); // tmp = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
_mm256_store_pd(c_,hadd);
return c_[0];
*/
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
hadd = _mm256_hadd_pd(hadd,hadd); // hadd = 1+0+3+2...
converter.l = _mm256_extract_epi64(_mm256_castpd_si256(hadd),0);
return converter.d;
#endif
#ifdef AVX512
return _mm512_reduce_add_pd(in.v);
/*
__attribute__ ((aligned(32))) double c_[8];
_mm512_store_pd(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
*/
#endif
#ifdef QPX
#endif
}
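// Worked AVX example of the permute/hadd reduction above, with lanes
// (v3,v2,v1,v0):
//   tmp  = (v1,v0,v3,v2)                 ; swapped 128-bit halves
//   hadd = (v0+v1, v2+v3, v2+v3, v0+v1)  ; pairwise sums
//   hadd = full sum in every lane; lane 0 comes back through the union.
// e.g. Reduce( (1,2,3,4) ) == 10.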
// *=,+=,-= operators
inline vRealD &operator *=(const vRealD &r) {
*this = (*this)*r;
return *this;
}
inline vRealD &operator +=(const vRealD &r) {
*this = *this+r;
return *this;
}
inline vRealD &operator -=(const vRealD &r) {
*this = *this-r;
return *this;
}
public:
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
};
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);}
inline vRealD outerProduct(const vRealD &l, const vRealD& r)
{
return l*r;
}
inline vRealD trace(const vRealD &arg){
return arg;
}
inline vRealD real(const vRealD &arg){
return arg;
}
}
#endif

279
lib/Grid_vRealF.h Normal file
View File

@ -0,0 +1,279 @@
#ifndef VREALF_H
#define VREALF_H
#include "Grid.h"
namespace Grid {
class vRealF {
public:
fvec v;
public:
typedef fvec vector_type;
typedef RealF scalar_type;
vRealF(){};
vRealF(RealF a){
vsplat(*this,a);
};
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vRealF operator + ( vRealF a, vRealF b)
{
vRealF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_ps(a.v,b.v);
#endif
#ifdef QPX
vector4double aa,bb,cc;
aa = vec_lda(0,(float *)&a);
bb = vec_lda(0,(float *)&b);
cc = vec_add(aa,bb);
vec_sta(cc,0,(float *)&ret.v);
#endif
return ret;
};
friend inline vRealF operator - ( vRealF a, vRealF b)
{
vRealF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_ps(a.v,b.v);
#endif
#ifdef QPX
vector4double aa,bb,cc;
aa = vec_lda(0,(float *)&a);
bb = vec_lda(0,(float *)&b);
cc = vec_sub(aa,bb);
vec_sta(cc,0,(float *)&ret.v);
#endif
return ret;
};
friend inline vRealF operator * ( vRealF a, vRealF b)
{
vRealF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_mul_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_mul_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_mul_ps(a.v,b.v);
#endif
#ifdef QPX
vector4double aa,bb,cc; // QPX single precision: explicit loads are required, since they promote single-precision memory operands to double-precision registers.
aa = vec_lda(0,(float *)&a);
bb = vec_lda(0,(float *)&b);
cc = vec_mul(aa,bb);
vec_sta(cc,0,(float *)&ret.v);
#endif
return ret;
};
///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions
///////////////////////////////////////////////
friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealF adj(const vRealF &in) { return in; }
friend inline vRealF conj(const vRealF &in){ return in; }
friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
#if defined (AVX1) || defined (SSE4)
y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
// accelerates multiply accumulate, but not general multiply add
y.v = _mm256_fmadd_ps(a.v,x.v,y.v);
#endif
#ifdef AVX512
y.v = _mm512_fmadd_ps(a.v,x.v,y.v);
#endif
#ifdef QPX
vector4double aa,xx,yy; // QPX single precision: explicit loads promote single-precision memory operands to double-precision registers.
aa = vec_lda(0,(float *)&a.v);
xx = vec_lda(0,(float *)&x.v);
yy = vec_lda(0,(float *)&y.v);
yy = vec_madd(aa,xx,yy);
vec_sta(yy,0,(float *)&y.v);
#endif
}
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(vRealF &y,vRealF b,int perm)
{
Gpermute<vRealF>(y,b,perm);
}
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
{
Gmerge<vRealF,RealF >(y,extracted);
}
friend inline void extract(const vRealF &y,std::vector<RealF *> &extracted)
{
Gextract<vRealF,RealF>(y,extracted);
}
friend inline void merge(vRealF &y,std::vector<RealF> &extracted)
{
Gmerge<vRealF,RealF >(y,extracted);
}
friend inline void extract(const vRealF &y,std::vector<RealF> &extracted)
{
Gextract<vRealF,RealF>(y,extracted);
}
/////////////////////////////////////////////////////
// Broadcast a value across Nsimd copies.
/////////////////////////////////////////////////////
friend inline void vsplat(vRealF &ret,float a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a,a,a,a);
#endif
#ifdef AVX512
//ret.v = _mm512_set_ps(a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a);
ret.v = _mm512_set1_ps(a);
#endif
#ifdef QPX
ret.v = {a,a,a,a};
#endif
}
friend inline void vset(vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
ret.v = {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
#endif
}
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vstore(const vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps(a,ret.v);
#endif
#ifdef SSE4
_mm_store_ps(a,ret.v);
#endif
#ifdef AVX512
_mm512_store_ps(a,ret.v);
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vRealF &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
// Unary negation
friend inline vRealF operator -(const vRealF &r) {
vRealF ret;
vzero(ret);
ret = ret - r;
return ret;
}
friend inline RealF Reduce(const vRealF & in)
{
#if defined (AVX1) || defined(AVX2)
__attribute__ ((aligned(32))) float c_[16];
__m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
__m256 hadd = _mm256_hadd_ps(in.v,tmp);
tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
hadd = _mm256_hadd_ps(tmp,tmp);
_mm256_store_ps(c_,hadd);
return (float)c_[0];
#endif
#ifdef AVX512
return _mm512_reduce_add_ps(in.v);
/*
__attribute__ ((aligned(64))) float c_[16];
_mm512_store_ps(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
+c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
*/
#endif
#ifdef QPX
#endif
}
// *=,+=,-= operators
inline vRealF &operator *=(const vRealF &r) {
*this = (*this)*r;
return *this;
}
inline vRealF &operator +=(const vRealF &r) {
*this = *this+r;
return *this;
}
inline vRealF &operator -=(const vRealF &r) {
*this = *this-r;
return *this;
}
public:
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}
inline vRealF outerProduct(const vRealF &l, const vRealF& r)
{
return l*r;
}
inline vRealF trace(const vRealF &arg){
return arg;
}
inline vRealF real(const vRealF &arg){
return arg;
}
}
#endif

44
lib/Makefile.am Normal file
View File

@ -0,0 +1,44 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_communicator_mpi.cc
extra_sources+=Grid_stencil_common.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_communicator_fake.cc
extra_sources+=Grid_stencil_common.cc
endif
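# The comms layer is fixed at configure time, so exactly one of the two
# conditionals above fires. Hypothetical invocations (flag name assumed;
# check ./configure --help):
#   ./configure --enable-comms=mpi    # compiles Grid_communicator_mpi.cc
#   ./configure --enable-comms=none   # compiles Grid_communicator_fake.cc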
#
# Libraries
#
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
#
# Include files
#
include_HEADERS = Grid_config.h\
Grid.h\
Grid_simd.h\
Grid_vComplexD.h\
Grid_vComplexF.h\
Grid_vRealD.h\
Grid_vRealF.h\
Grid_Cartesian.h\
Grid_Lattice.h\
Grid_Communicator.h\
Grid_QCD.h\
Grid_aligned_allocator.h\
Grid_cshift.h\
Grid_cshift_common.h\
Grid_cshift_mpi.h\
Grid_cshift_none.h\
Grid_stencil.h\
Grid_math_types.h