Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-13 04:37:05 +01:00)

Commit: Reorg of build structure
lib/Grid.h (new file, 63 lines added)
@@ -0,0 +1,63 @@
//
//  Grid.cpp
//  simd
//
//  Created by Peter Boyle on 09/05/2014.
//  Copyright (c) 2014 University of Edinburgh. All rights reserved.
//

#ifndef GRID_V3_H
#define GRID_V3_H

#include <stdio.h>
#include <complex>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>
#include <functional>
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <signal.h>

#include <Grid_config.h>

////////////////////////////////////////////////////////////
// Tunable header includes
////////////////////////////////////////////////////////////

#ifdef HAVE_OPENMP
#define OMP
#include <omp.h>
#endif

#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif

#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif

#include <Grid_aligned_allocator.h>
#include <Grid_simd.h>
#include <Grid_math_types.h>
#include <Grid_Cartesian.h>
#include <Grid_Lattice.h>
#include <Grid_comparison.h>
#include <Grid_stencil.h>
#include <Grid_QCD.h>

namespace Grid {

  void Grid_init(int *argc,char ***argv);
  void Grid_finalize(void);
  double usecond(void);
  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
  void Grid_debug_handler_init(void);

};

#endif
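For orientation, a minimal driver built against the entry points declared above might look like the sketch below. The body of main, the timing printout and the variable names are illustrative assumptions, not part of this commit.

#include <Grid.h>

int main(int argc,char **argv)
{
  Grid::Grid_init(&argc,&argv);   // strip Grid options, set up comms and signal handlers
  double t0 = Grid::usecond();    // microsecond wall clock declared in Grid.h
  // ... application work would go here ...
  double t1 = Grid::usecond();
  std::cout << "elapsed " << (t1-t0) << " us" << std::endl;
  Grid::Grid_finalize();
  return 0;
}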
lib/Grid_Cartesian.h (new file, 399 lines added)
@@ -0,0 +1,399 @@
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H

#include <Grid.h>
#include <Grid_Communicator.h>
namespace Grid{

  /////////////////////////////////////////////////////////////////////////////////////////
  // Grid Support.
  /////////////////////////////////////////////////////////////////////////////////////////

  class GridBase : public CartesianCommunicator {
  public:

    // Give Lattice access
    template<class object> friend class Lattice;

    GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};

    //FIXME
    // protected:
    // Lattice wide random support. not yet fully implemented. Need seed strategy
    // and one generator per site.
    // std::default_random_engine generator;
    // static std::mt19937 generator( 9 );

    //////////////////////////////////////////////////////////////////////
    // Communicator provides information on the processor grid
    //////////////////////////////////////////////////////////////////////
    // unsigned long _ndimension;
    // std::vector<int> _processors;     // processor grid
    // int              _processor;      // linear processor rank
    // std::vector<int> _processor_coor; // linear processor rank
    //////////////////////////////////////////////////////////////////////

    // Physics Grid information.
    std::vector<int> _simd_layout;  // Which dimensions get relayed out over simd lanes.
    std::vector<int> _fdimensions;  // Global dimensions of array prior to cb removal
    std::vector<int> _gdimensions;  // Global dimensions of array after cb removal
    std::vector<int> _ldimensions;  // local dimensions of array with processor images removed
    std::vector<int> _rdimensions;  // Reduced local dimensions with simd lane images and processor images removed
    std::vector<int> _ostride;      // Outer stride for each dimension
    std::vector<int> _istride;      // Inner stride i.e. within simd lane
    int _osites;                    // _isites*_osites = product(dimensions).
    int _isites;
    std::vector<int> _slice_block;  // subslice information
    std::vector<int> _slice_stride;
    std::vector<int> _slice_nblock;

    // Might need these at some point
    // std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
    // std::vector<int> _lend;   // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1

  public:

    ////////////////////////////////////////////////////////////////
    // Checkerboarding interface is virtual and overridden by
    // GridCartesian / GridRedBlackCartesian
    ////////////////////////////////////////////////////////////////
    virtual int CheckerBoarded(int dim)=0;
    virtual int CheckerBoard(std::vector<int> site)=0;
    virtual int CheckerBoardDestination(int source_cb,int shift)=0;
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
    inline int CheckerBoardFromOindex (int Oindex){
      std::vector<int> ocoor;
      oCoorFromOindex(ocoor,Oindex);
      int ss=0;
      for(int d=0;d<_ndimension;d++){
        ss=ss+ocoor[d];
      }
      return ss&0x1;
    }

    //////////////////////////////////////////////////////////////////////////////////////////////
    // Local layout calculations
    //////////////////////////////////////////////////////////////////////////////////////////////
    // These routines are key. Subdivide the linearised cartesian index into
    //   "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
    //   "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
    //
    // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
    // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
    // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
    // lanes are operated upon simultaneously.

    virtual int oIndex(std::vector<int> &coor)
    {
      int idx=0;
      // Works with either global or local coordinates
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
      return idx;
    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0;
      // ocoor is already reduced so can eliminate the modulo operation
      // for fast indexing and inline the routine
      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
      return idx;
    }
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
      coor.resize(_ndimension);
      for(int d=0;d<_ndimension;d++){
        coor[d] = Oindex % _rdimensions[d];
        Oindex  = Oindex / _rdimensions[d];
      }
    }

    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
    inline int iIndex(std::vector<int> &lcoor)
    {
      int idx=0;
      for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
      return idx;
    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      coor.resize(_ndimension);
      for(int d=0;d<_ndimension;d++){
        coor[d] = lane % _simd_layout[d];
        lane    = lane / _simd_layout[d];
      }
    }
    inline int PermuteDim(int dimension){
      return _simd_layout[dimension]>1;
    }
    inline int PermuteType(int dimension){
      int permute_type=0;
      for(int d=_ndimension-1;d>dimension;d--){
        if (_simd_layout[d]>1 ) permute_type++;
      }
      return permute_type;
    }

    ////////////////////////////////////////////////////////////////
    // Array sizing queries
    ////////////////////////////////////////////////////////////////

    inline int iSites(void) { return _isites; };
    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
    inline int oSites(void) { return _osites; };
    inline int lSites(void) { return _isites*_osites; };
    inline int gSites(void) { return _isites*_osites*_Nprocessors; };
    inline int Nd    (void) { return _ndimension;};
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};

    ////////////////////////////////////////////////////////////////
    // Global addressing
    ////////////////////////////////////////////////////////////////
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
    {
      gcoor.resize(_ndimension);
      std::vector<int> coor(_ndimension);

      ProcessorCoorFromRank(rank,coor);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]&coor[mu];

      iCoorFromIindex(coor,i_idx);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]&coor[mu];

      oCoorFromOindex (coor,o_idx);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
    }
    void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
    {
      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
      if(CheckerBoarded(0)){
        fcoor[0] = fcoor[0]*2+cb;
      }
    }
    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
    {
      gcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
    }
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
    {
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
        pcoor[mu] = gcoor[mu]/_ldimensions[mu];
        lcoor[mu] = gcoor[mu]%_ldimensions[mu];
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
    {
      std::vector<int> pcoor;
      std::vector<int> lcoor;
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
      rank = RankFromProcessorCoor(pcoor);
      i_idx= iIndex(lcoor);
      o_idx= oIndex(lcoor);
    }

  };

  class GridCartesian: public GridBase {

  public:

    virtual int CheckerBoarded(int dim){
      return 0;
    }
    virtual int CheckerBoard(std::vector<int> site){
      return 0;
    }
    virtual int CheckerBoardDestination(int cb,int shift){
      return 0;
    }
    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
      return shift;
    }
    GridCartesian(std::vector<int> &dimensions,
                  std::vector<int> &simd_layout,
                  std::vector<int> &processor_grid
                  ) : GridBase(processor_grid)
    {
      ///////////////////////
      // Grid information
      ///////////////////////
      _ndimension = dimensions.size();

      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);

      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);

      _osites = 1;
      _isites = 1;
      for(int d=0;d<_ndimension;d++){
        _fdimensions[d] = dimensions[d];   // Global dimensions
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
        _simd_layout[d] = simd_layout[d];

        //FIXME check for exact division

        // Use a reduced simd grid
        _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
        _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];

        // Addressing support
        if ( d==0 ) {
          _ostride[d] = 1;
          _istride[d] = 1;
        } else {
          _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
          _istride[d] = _istride[d-1]*_simd_layout[d-1];
        }
      }

      ///////////////////////
      // subplane information
      ///////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);

      int block =1;
      int nblock=1;
      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];

      for(int d=0;d<_ndimension;d++){
        nblock/=_rdimensions[d];
        _slice_block[d] =block;
        _slice_stride[d]=_ostride[d]*_rdimensions[d];
        _slice_nblock[d]=nblock;
        block = block*_rdimensions[d];
      }

    };
  };

  // Specialise this for red black grids storing half the data like a chess board.
  class GridRedBlackCartesian : public GridBase
  {
  public:
    virtual int CheckerBoarded(int dim){
      if( dim==0) return 1;
      else return 0;
    }
    virtual int CheckerBoard(std::vector<int> site){
      return (site[0]+site[1]+site[2]+site[3])&0x1;
    }

    // Depending on the cb of site, we toggle source cb.
    // for block #b, element #e = (b, e)
    // we need
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){

      if(dim != 0) return shift;

      int fulldim =_fdimensions[0];
      shift = (shift+fulldim)%fulldim;

      // Probably faster with table lookup;
      // or by looping over x,y,z and multiply rather than computing checkerboard.
      int ocb=CheckerBoardFromOindex(osite);

      if ( (source_cb+ocb)&1 ) {
        return (shift)/2;
      } else {
        return (shift+1)/2;
      }
    }

    virtual int CheckerBoardDestination(int source_cb,int shift){
      if ((shift+_fdimensions[0])&0x1) {
        return 1-source_cb;
      } else {
        return source_cb;
      }
    };
    GridRedBlackCartesian(std::vector<int> &dimensions,
                          std::vector<int> &simd_layout,
                          std::vector<int> &processor_grid) : GridBase(processor_grid)
    {
      ///////////////////////
      // Grid information
      ///////////////////////
      _ndimension = dimensions.size();

      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);

      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);

      _osites = 1;
      _isites = 1;
      for(int d=0;d<_ndimension;d++){
        _fdimensions[d] = dimensions[d];
        _gdimensions[d] = _fdimensions[d];
        if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
        _ldimensions[d] = _gdimensions[d]/_processors[d];

        // Use a reduced simd grid
        _simd_layout[d] = simd_layout[d];
        _rdimensions[d]= _ldimensions[d]/_simd_layout[d];

        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];

        // Addressing support
        if ( d==0 ) {
          _ostride[d] = 1;
          _istride[d] = 1;
        } else {
          _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
          _istride[d] = _istride[d-1]*_simd_layout[d-1];
        }
      }

      ////////////////////////////////////////////////////////////////////////////////////////////
      // subplane information
      ////////////////////////////////////////////////////////////////////////////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);

      int block =1;
      int nblock=1;
      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];

      for(int d=0;d<_ndimension;d++){
        nblock/=_rdimensions[d];
        _slice_block[d] =block;
        _slice_stride[d]=_ostride[d]*_rdimensions[d];
        _slice_nblock[d]=nblock;
        block = block*_rdimensions[d];
      }

    };
  protected:
    virtual int oIndex(std::vector<int> &coor)
    {
      int idx=_ostride[0]*((coor[0]/2)%_rdimensions[0]);
      for(int d=1;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
      return idx;
    };

  };

}
#endif
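To make the inner/outer decomposition described in the comments concrete, the sketch below builds a small GridCartesian and splits one local coordinate into the (outer, inner) pair used by Lattice storage. The 4^4 single-process layout, the coordinate and the function name are illustrative assumptions, not part of the commit.

#include <Grid.h>

void layout_demo(void)
{
  std::vector<int> latt = {4,4,4,4};  // global lattice dimensions
  std::vector<int> simd = {2,1,1,1};  // _simd_layout: two SIMD lanes along x
  std::vector<int> mpi  = {1,1,1,1};  // _processors
  Grid::GridCartesian grid(latt,simd,mpi);

  std::vector<int> lcoor = {3,2,1,0}; // a local site coordinate
  int o = grid.oIndex(lcoor);         // sum_d _ostride[d]*(lcoor[d] % _rdimensions[d])
  int i = grid.iIndex(lcoor);         // sum_d _istride[d]*(lcoor[d] / _rdimensions[d])
  // the site is held in SIMD lane i of the vector object stored at outer index o
}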
lib/Grid_Communicator.h (new file, 103 lines added)
@@ -0,0 +1,103 @@
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
namespace Grid {
  class CartesianCommunicator {
  public:

    // Communicator should know nothing of the physics grid, only processor grid.

    int              _Nprocessors;    // How many in all
    std::vector<int> _processors;     // Which dimensions get relayed out over processors lanes.
    int              _processor;      // linear processor rank
    std::vector<int> _processor_coor; // linear processor coordinate
    unsigned long _ndimension;

#ifdef GRID_COMMS_MPI
    MPI_Comm communicator;
#endif

    // Constructor
    CartesianCommunicator(std::vector<int> &pdimensions_in);

    // Wraps MPI_Cart routines
    void ShiftedRanks(int dim,int shift,int & source, int & dest);
    int  RankFromProcessorCoor(std::vector<int> &coor);
    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);

    /////////////////////////////////
    // Grid information queries
    /////////////////////////////////
    int IsBoss(void)   { return _processor==0; };
    int ThisRank(void) { return _processor; };
    const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
    const std::vector<int> & ProcessorGrid(void)     { return _processors; };
    int ProcessorCount(void) { return _Nprocessors; };

    ////////////////////////////////////////////////////////////
    // Reduction
    ////////////////////////////////////////////////////////////
    void GlobalSum(RealF &);
    void GlobalSumVector(RealF *,int N);

    void GlobalSum(RealD &);
    void GlobalSumVector(RealD *,int N);

    void GlobalSum(ComplexF &c)
    {
      GlobalSumVector((float *)&c,2);
    }
    void GlobalSumVector(ComplexF *c,int N)
    {
      GlobalSumVector((float *)c,2*N);
    }

    void GlobalSum(ComplexD &c)
    {
      GlobalSumVector((double *)&c,2);
    }
    void GlobalSumVector(ComplexD *c,int N)
    {
      GlobalSumVector((double *)c,2*N);
    }

    template<class obj> void GlobalSum(obj &o){

      typedef typename obj::scalar_type scalar_type;
      int words = sizeof(obj)/sizeof(scalar_type);

      scalar_type * ptr = (scalar_type *)& o;
      GlobalSum(ptr,words);
    }
    ////////////////////////////////////////////////////////////
    // Face exchange
    ////////////////////////////////////////////////////////////
    void SendToRecvFrom(void *xmit,
                        int xmit_to_rank,
                        void *recv,
                        int recv_from_rank,
                        int bytes);

    ////////////////////////////////////////////////////////////
    // Barrier
    ////////////////////////////////////////////////////////////
    void Barrier(void);

    ////////////////////////////////////////////////////////////
    // Broadcast a buffer and composite larger
    ////////////////////////////////////////////////////////////
    void Broadcast(int root,void* data, int bytes);
    template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };

  };
}

#endif
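A small sketch of the reduction interface as seen from application code. The communicator reference and the values are illustrative; RealD and ComplexD are the scalar typedefs pulled in through Grid.h rather than defined in this file.

#include <Grid.h>

void reduction_demo(Grid::CartesianCommunicator &comm)
{
  Grid::RealD    count = 1.0;
  Grid::ComplexD dot(1.0,2.0);  // per-rank partial results (illustrative values)
  comm.GlobalSum(count);        // MPI_Allreduce under GRID_COMMS_MPI, no-op in the fake build
  comm.GlobalSum(dot);          // forwards to GlobalSumVector((double *)&dot,2)
  comm.Barrier();
}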
lib/Grid_Lattice.h (new file, 611 lines added)
@@ -0,0 +1,611 @@
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H

#include "Grid.h"

namespace Grid {

// TODO: Indexing ()
//       mac,real,imag
//
// Functionality required:
//   -=,+=,*=,()
//   add,+,sub,-,mult,mac,*
//   adj,conj
//   real,imag
//   transpose,transposeIndex
//   trace,traceIndex
//   peekIndex
//   innerProduct,outerProduct,
//   localNorm2
//   localInnerProduct
//

extern int GridCshiftPermuteMap[4][16];

template<class vobj>
class Lattice
{
public:
  GridBase *_grid;
  int checkerboard;
  std::vector<vobj,alignedAllocator<vobj> > _odata;
public:

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  Lattice(GridBase *grid) : _grid(grid) {
    _odata.reserve(_grid->oSites());
    assert((((uint64_t)&_odata[0])&0xF) ==0);
    checkerboard=0;
  }

#include <Grid_cshift.h>

  template<class obj1,class obj2>
  friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);

  // FIXME Performance difference between operator * and mult is troubling.
  // Auto move constructor seems to lose surprisingly much.

  // Site wise binary operations
  // We eliminate a temporary object assignment if use the mult,add,sub routines.
  // For the operator versions we rely on move constructor to eliminate the
  // vector copy back.
  template<class obj1,class obj2,class obj3>
  friend void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

  template<class obj1,class obj2,class obj3>
  friend void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

  template<class obj1,class obj2,class obj3>
  friend void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

  template<class obj1,class obj2,class obj3>
  friend void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);

  friend void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
    conformable(lhs,rhs);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
    }
  }
  friend void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
    conformable(lhs,rhs);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
    }
  }
  inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
    conformable(lhs,rhs);
    Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss];
    }
    return ret;
  };

  template<class sobj>
  inline Lattice<vobj> & operator = (const sobj & r){
#pragma omp parallel for
    for(int ss=0;ss<_grid->oSites();ss++){
      this->_odata[ss]=r;
    }
    return *this;
  }

  // Poke a scalar object into the SIMD array
  template<class sobj>
  friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){

    GridBase *grid=l._grid;

    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;

    int Nsimd = grid->Nsimd();

    assert( l.checkerboard== l._grid->CheckerBoard(site));
    assert( sizeof(sobj)*Nsimd == sizeof(vobj));

    int rank,odx,idx;
    grid->GlobalCoorToRankIndex(rank,odx,idx,site);

    // Optional to broadcast from node 0.
    grid->Broadcast(0,s);

    std::vector<sobj> buf(Nsimd);
    std::vector<scalar_type *> pointers(Nsimd);
    for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];

    // extract-modify-merge cycle is easiest way and this is not perf critical
    extract(l._odata[odx],pointers);

    buf[idx] = s;

    for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
    merge(l._odata[odx],pointers);

    return;
  };

  // Peek a scalar object from the SIMD array
  template<class sobj>
  friend void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){

    GridBase *grid=l._grid;

    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;

    int Nsimd = grid->Nsimd();

    assert( l.checkerboard== l._grid->CheckerBoard(site));
    assert( sizeof(sobj)*Nsimd == sizeof(vobj));

    int rank,odx,idx;
    grid->GlobalCoorToRankIndex(rank,odx,idx,site);
    std::vector<sobj> buf(Nsimd);
    std::vector<scalar_type *> pointers(Nsimd);
    for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];

    extract(l._odata[odx],pointers);

    s = buf[idx];
    grid->Broadcast(rank,s);

    return;
  };

  // FIXME Randomise; deprecate this
  friend void random(Lattice<vobj> &l){
    Real *v_ptr = (Real *)&l._odata[0];
    size_t v_len = l._grid->oSites()*sizeof(vobj);
    size_t d_len = v_len/sizeof(Real);

    for(int i=0;i<d_len;i++){
      v_ptr[i]=drand48();
    }
  };

  // FIXME for debug; deprecate this; made obsolete by
  // LatticeCoordinate();
  friend void lex_sites(Lattice<vobj> &l){
    Real *v_ptr = (Real *)&l._odata[0];
    size_t o_len = l._grid->oSites();
    size_t v_len = sizeof(vobj)/sizeof(vRealF);
    size_t vec_len = vRealF::Nsimd();

    for(int i=0;i<o_len;i++){
      for(int j=0;j<v_len;j++){
        for(int vv=0;vv<vec_len;vv+=2){
          v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
          v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
        }
    }}
  }

  // FIXME Implement a consistent seed management strategy
  friend void gaussian(Lattice<vobj> &l){
    // Zero mean, unit variance.
    std::normal_distribution<double> distribution(0.0,1.0);
    Real *v_ptr = (Real *)&l._odata[0];
    size_t v_len = l._grid->oSites()*sizeof(vobj);
    size_t d_len = v_len/sizeof(Real);

    for(int i=0;i<d_len;i++){
      v_ptr[i]= drand48();
    }
  };

  // Unary functions and Unops
  friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
    Lattice<vobj> ret(r._grid);
#pragma omp parallel for
    for(int ss=0;ss<r._grid->oSites();ss++){
      ret._odata[ss]= -r._odata[ss];
    }
    return ret;
  }
  // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
  template<class T>
  inline Lattice<vobj> &operator *=(const T &r) {
    *this = (*this)*r;
    return *this;
  }
  template<class T>
  inline Lattice<vobj> &operator -=(const T &r) {
    *this = (*this)-r;
    return *this;
  }
  template<class T>
  inline Lattice<vobj> &operator +=(const T &r) {
    *this = (*this)+r;
    return *this;
  }

  inline friend Lattice<vobj> adj(const Lattice<vobj> &lhs){
    Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      ret._odata[ss] = adj(lhs._odata[ss]);
    }
    return ret;
  };

  inline friend Lattice<vobj> transpose(const Lattice<vobj> &lhs){
    Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      ret._odata[ss] = transpose(lhs._odata[ss]);
    }
    return ret;
  };

  inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
    Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites();ss++){
      ret._odata[ss] = conj(lhs._odata[ss]);
    }
    return ret;
  };

  // remove and insert a half checkerboard
  friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
#pragma omp parallel for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;

      full._grid->oCoorFromOindex(coor,ss);
      cbos=half._grid->CheckerBoard(coor);

      if (cbos==cb) {

        half._odata[ssh] = full._odata[ss];
        ssh++;
      }
    }
  }
  friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
#pragma omp parallel for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;

      full._grid->oCoorFromOindex(coor,ss);
      cbos=half._grid->CheckerBoard(coor);

      if (cbos==cb) {
        full._odata[ss]=half._odata[ssh];
        ssh++;
      }
    }
  }
}; // class Lattice

template<class obj1,class obj2>
void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
  assert(lhs._grid == rhs._grid);
  assert(lhs.checkerboard == rhs.checkerboard);
}

template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  conformable(lhs,rhs);
  uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
  for(int ss=0;ss<vec_len;ss++){
    mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
  }
}

template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  conformable(lhs,rhs);
  uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
  for(int ss=0;ss<vec_len;ss++){
    mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
  }
}

template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  conformable(lhs,rhs);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
  }
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  conformable(lhs,rhs);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
  }
}

// Lattice BinOp Lattice,
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
  //NB mult performs conformable check. Do not reapply here for performance.
  Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
  mult(ret,lhs,rhs);
  return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
  //NB mult performs conformable check. Do not reapply here for performance.
  Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
  add(ret,lhs,rhs);
  return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
  //NB mult performs conformable check. Do not reapply here for performance.
  Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
  sub(ret,lhs,rhs);
  return ret;
}

// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
  Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=lhs*rhs._odata[ss];
  }
  return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
  Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=lhs+rhs._odata[ss];
  }
  return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
  Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=lhs-rhs._odata[ss];
  }
  return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
  Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites(); ss++){
    ret._odata[ss]=lhs._odata[ss]*rhs;
  }
  return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
  Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=lhs._odata[ss]+rhs;
  }
  return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
  Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=lhs._odata[ss]-rhs;
  }
  return ret;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs)
  -> Lattice<decltype(trace(lhs._odata[0]))>
{
  Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = trace(lhs._odata[ss]);
  }
  return ret;
};

////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto traceIndex(const Lattice<vobj> &lhs)
  -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{
  Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
  }
  return ret;
};
template<int Index,class vobj>
inline auto transposeIndex(const Lattice<vobj> &lhs)
  -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{
  Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
  }
  return ret;
};

// Fixme; this is problematic since the number of args is variable and
// may mismatch...
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs)
  -> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
{
  Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = peekIndex<Index>(lhs._odata[ss]);
  }
  return ret;
};
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs,int i)
  -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{
  Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
  }
  return ret;
};
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs,int i,int j)
  -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
  Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
  }
  return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline RealD norm2(const Lattice<vobj> &arg){

  typedef typename vobj::scalar_type scalar;
  typedef typename vobj::vector_type vector;
  decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
  scalar nrm;
  //FIXME make this loop parallelisable
  vnrm=zero;
  for(int ss=0;ss<arg._grid->oSites(); ss++){
    vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
  }
  vector vvnrm =TensorRemove(vnrm) ;
  nrm = Reduce(vvnrm);
  arg._grid->GlobalSum(nrm);
  return real(nrm);
}

template<class vobj>
inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
{
  typedef typename vobj::scalar_type scalar;
  decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;

  scalar nrm;
  //FIXME make this loop parallelisable
  for(int ss=0;ss<left._grid->oSites(); ss++){
    vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
  }
  nrm = Reduce(vnrm);
  right._grid->GlobalSum(nrm);
  return nrm;
}

/////////////////////////////////////////////////////
// Non site reduced routines
/////////////////////////////////////////////////////

// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
  Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
  }
  return ret;
}

template<class vobj>
inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
{
  Lattice<decltype(real(z._odata[0]))> ret(z._grid);
#pragma omp parallel for
  for(int ss=0;ss<z._grid->oSites();ss++){
    ret._odata[ss] = real(z._odata[ss]);
  }
  return ret;
}

template<class vobj>
inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
{
  Lattice<decltype(imag(z._odata[0]))> ret(z._grid);
#pragma omp parallel for
  for(int ss=0;ss<z._grid->oSites();ss++){
    ret._odata[ss] = imag(z._odata[ss]);
  }
  return ret;
}

// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
  -> Lattice<typename vobj::tensor_reduced>
{
  Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
  }
  return ret;
}

// outerProduct Scalar x Scalar -> Scalar
//              Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
  Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
  for(int ss=0;ss<rhs._grid->oSites(); ss++){
    ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
  }
  return ret;
}

}
#endif
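The FIXME in the file above about operator * versus mult is easiest to see side by side. The helper below is purely an illustration of the two call styles under the assumption that the site type of vobj supports multiplication; it is not part of the commit.

#include <Grid.h>

template<class vobj>
void two_forms(Grid::Lattice<vobj> &c,const Grid::Lattice<vobj> &a,const Grid::Lattice<vobj> &b)
{
  mult(c,a,b);  // three-operand form: writes straight into c, no temporary
  c = a*b;      // operator form: builds a temporary Lattice and relies on the
                // move constructor to avoid copying the site data back
}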
lib/Grid_QCD.h (new file, 106 lines added)
@@ -0,0 +1,106 @@
#ifndef GRID_QCD_H
#define GRID_QCD_H
namespace Grid{
namespace QCD {

  static const int Nc=3;
  static const int Ns=4;
  static const int Nd=4;

  static const int CbRed  =0;
  static const int CbBlack=1;

  //////////////////////////////////////////////////////////////////////////////
  // QCD iMatrix types
  // Index conventions: Lorentz x Spin x Colour
  //
  // ChrisK very keen to add extra space for Gparity doubling.
  //
  // Also add domain wall index, in a way where Wilson operator
  // naturally distributes across the 5th dimension.
  //////////////////////////////////////////////////////////////////////////////
  template<typename vtype> using iSinglet             = iScalar<iScalar<iScalar<vtype> > >;
  template<typename vtype> using iSpinMatrix          = iScalar<iMatrix<iScalar<vtype>, Ns> >;
  template<typename vtype> using iSpinColourMatrix    = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
  template<typename vtype> using iColourMatrix        = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
  template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;

  template<typename vtype> using iSpinVector       = iScalar<iVector<iScalar<vtype>, Ns> >;
  template<typename vtype> using iColourVector     = iScalar<iScalar<iVector<vtype, Nc> > >;
  template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;

  typedef iSpinMatrix<Complex >          SpinMatrix;
  typedef iColourMatrix<Complex >        ColourMatrix;
  typedef iSpinColourMatrix<Complex >    SpinColourMatrix;
  typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;

  typedef iSpinVector<Complex >       SpinVector;
  typedef iColourVector<Complex >     ColourVector;
  typedef iSpinColourVector<Complex > SpinColourVector;

  typedef iSpinMatrix<vComplex >          vSpinMatrix;
  typedef iColourMatrix<vComplex >        vColourMatrix;
  typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
  typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;

  typedef iSpinVector<vComplex >       vSpinVector;
  typedef iColourVector<vComplex >     vColourVector;
  typedef iSpinColourVector<vComplex > vSpinColourVector;

  typedef iSinglet<Complex >  TComplex;  // This is painful. Tensor singlet complex type.
  typedef iSinglet<vComplex > vTComplex; // what if we don't know the tensor structure
  typedef iSinglet<Real >     TReal;     // Shouldn't need these; can I make it work without?
  typedef iSinglet<vReal >    vTReal;
  typedef iSinglet<vInteger > vTInteger;
  typedef iSinglet<Integer >  TInteger;

  typedef Lattice<vTReal>    LatticeReal;
  typedef Lattice<vTComplex> LatticeComplex;
  typedef Lattice<vInteger>  LatticeInteger; // Predicates for "where"

  typedef Lattice<vColourMatrix>     LatticeColourMatrix;
  typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
  typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;

  typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
  typedef Lattice<vSpinVector>       LatticeSpinVector;
  typedef Lattice<vColourVector>     LatticeColourVector;

  ///////////////////////////////////////////
  // Physical names for things
  ///////////////////////////////////////////
  typedef Lattice<vSpinColourVector>    LatticeFermion;
  typedef Lattice<vSpinColourMatrix>    LatticePropagator;
  typedef Lattice<vLorentzColourMatrix> LatticeGaugeField;

  inline void LatticeCoordinate(LatticeInteger &l,int mu){
    GridBase *grid = l._grid;
    int Nsimd = grid->iSites();
    std::vector<int> gcoor;
    std::vector<Integer> mergebuf(Nsimd);
    std::vector<Integer *> mergeptr(Nsimd);
    for(int o=0;o<grid->oSites();o++){
      for(int i=0;i<grid->iSites();i++){
        grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
        // grid->RankIndexToGlobalCoor(0,o,i,gcoor);
        mergebuf[i]=gcoor[mu];
        mergeptr[i]=&mergebuf[i];
      }
      merge(l._odata[o],mergeptr);
    }
  };

#include <Grid_predicated.h>

#if 0

#endif

} //namespace QCD
} // Grid
#endif
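Putting Grid_Lattice.h and the aliases above together, typical use looks roughly like the sketch below. The grid argument, the field contents and the printout are illustrative assumptions, not part of the commit.

#include <Grid.h>

void qcd_demo(Grid::GridCartesian *grid)
{
  Grid::QCD::LatticeFermion    psi(grid);   // Lattice<vSpinColourVector>
  Grid::QCD::LatticeGaugeField U(grid);     // Lattice<vLorentzColourMatrix>
  Grid::QCD::LatticeInteger    xcoor(grid);

  gaussian(psi);                            // placeholder random fill (see FIXMEs in Grid_Lattice.h)
  random(U);
  Grid::QCD::LatticeCoordinate(xcoor,0);    // global x coordinate of every site
  Grid::RealD n = norm2(psi);               // site sum plus GlobalSum across ranks
  std::cout << "norm2(psi) = " << n << std::endl;
}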
lib/Grid_aligned_allocator.h (new file, 59 lines added)
@@ -0,0 +1,59 @@
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H

#include <immintrin.h>

namespace Grid {

  ////////////////////////////////////////////////////////////////////
  // A lattice of something, but assume the something is SIMDized.
  ////////////////////////////////////////////////////////////////////
  template<typename _Tp>
  class alignedAllocator {
  public:
    typedef std::size_t    size_type;
    typedef std::ptrdiff_t difference_type;
    typedef _Tp*       pointer;
    typedef const _Tp* const_pointer;
    typedef _Tp&       reference;
    typedef const _Tp& const_reference;
    typedef _Tp        value_type;

    template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
    alignedAllocator() throw() { }
    alignedAllocator(const alignedAllocator&) throw() { }
    template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
    ~alignedAllocator() throw() { }
    pointer address(reference __x) const { return &__x; }
    const_pointer address(const_reference __x) const { return &__x; }
    size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
    // Should override allocate and deallocate
    pointer allocate(size_type __n, const void* = 0)
    {
      //_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
      // _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#ifdef AVX512
      _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#else
      _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#endif

      return ptr;
    }
    void deallocate(pointer __p, size_type) {
      free(__p);
    }
    void construct(pointer __p, const _Tp& __val) { };
    void construct(pointer __p) { };
    void destroy(pointer __p) { };
  };

  template<typename _Tp> inline bool
  operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }

  template<typename _Tp> inline bool
  operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }

}; // namespace Grid
#endif
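The allocator is designed to slot into standard containers, which is exactly how Lattice<vobj> uses it. A standalone sketch, assuming the header is on the include path; the 128-byte alignment check reflects the memalign/_mm_malloc calls above.

#include <cstdint>
#include <cassert>
#include <vector>
#include <Grid_aligned_allocator.h>

void allocator_demo(void)
{
  // storage for 1024 doubles obtained through the aligned allocator;
  // note construct() above is a no-op, so the contents start uninitialised
  std::vector<double,Grid::alignedAllocator<double> > buf(1024);
  assert( (((uintptr_t)&buf[0]) & 0x7F) == 0 );
}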
lib/Grid_communicator_fake.cc (new file, 56 lines added)
@@ -0,0 +1,56 @@
#include "Grid.h"
namespace Grid {

CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
  _ndimension = _processors.size();
  _processor_coor.resize(_ndimension);
  _processors = processors;

  // Require 1^N processor grid for fake
  for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);

  _processor = 0;// I am the one. The only one..
  for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
}

void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  exit(-1);
}

void CartesianCommunicator::Barrier(void)
{
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
}

void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  source =1;
  dest=1;
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  return 1;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}

}
lib/Grid_communicator_mpi.cc (new file, 93 lines added)
@@ -0,0 +1,93 @@
#include "Grid.h"
#include <mpi.h>

namespace Grid {

// Should error check all MPI calls.

CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);

  MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  int Size;
  MPI_Comm_size(communicator,&Size);

  assert(Size==_Nprocessors);
}

void CartesianCommunicator::GlobalSum(float &f){
  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  MPI_Cart_shift(communicator,dim,shift,&source,&dest);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  MPI_Cart_rank  (communicator, &coor[0], &rank);
  return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  coor.resize(_ndimension);
  MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  MPI_Request reqs[2];
  MPI_Status  OkeyDokey[2];
  int rank = _processor;
  MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
  MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
  MPI_Waitall(2,reqs,OkeyDokey);

}

void CartesianCommunicator::Barrier(void)
{
  MPI_Barrier(communicator);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  MPI_Bcast(data,
            bytes,
            MPI_BYTE,
            root,
            communicator);
}

}
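A sketch of driving the halo primitive above from application code. The buffer size is arbitrary, and the neighbour bookkeeping follows the MPI_Cart_shift convention wrapped by ShiftedRanks (source = rank we receive from, dest = rank we send to).

#include <Grid.h>

void halo_demo(Grid::CartesianCommunicator &comm,int dim)
{
  int source,dest;
  comm.ShiftedRanks(dim,1,source,dest);   // neighbours one step along "dim"

  std::vector<double> xmit(16,1.0);
  std::vector<double> recv(16,0.0);
  comm.SendToRecvFrom(&xmit[0],dest,
                      &recv[0],source,
                      (int)(xmit.size()*sizeof(double)));
  comm.Barrier();
}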
lib/Grid_comparison.h (new file, 264 lines added; listing truncated below)
@ -0,0 +1,264 @@
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {

  // Generic list of functors
  template<class lobj,class robj> class veq {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs == rhs;
    }
  };
  template<class lobj,class robj> class vne {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs != rhs;
    }
  };
  template<class lobj,class robj> class vlt {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs < rhs;
    }
  };
  template<class lobj,class robj> class vle {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs <= rhs;
    }
  };
  template<class lobj,class robj> class vgt {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs > rhs;
    }
  };
  template<class lobj,class robj> class vge {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs >= rhs;
    }
  };

  // Generic list of functors
  template<class lobj,class robj> class seq {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs == rhs;
    }
  };
  template<class lobj,class robj> class sne {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs != rhs;
    }
  };
  template<class lobj,class robj> class slt {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs < rhs;
    }
  };
  template<class lobj,class robj> class sle {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs <= rhs;
    }
  };
  template<class lobj,class robj> class sgt {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs > rhs;
    }
  };
  template<class lobj,class robj> class sge {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    {
      return lhs >= rhs;
    }
  };

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Integer gets extra relational functions. Could also implement these for RealF, RealD etc..
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class sfunctor>
  inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs)
  {
    std::vector<Integer> vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation
    std::vector<Integer> vrhs(vInteger::Nsimd());
    vInteger ret;
    extract(lhs,vlhs);
    extract(rhs,vrhs);
    for(int s=0;s<vInteger::Nsimd();s++){
      vlhs[s] = sop(vlhs[s],vrhs[s]);
    }
    merge(ret,vlhs);
    return ret;
  }
  inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
  {
    return Comparison(slt<Integer,Integer>(),lhs,rhs);
  }
  inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs)
  {
    return Comparison(sle<Integer,Integer>(),lhs,rhs);
  }
  inline vInteger operator > (const vInteger & lhs, const vInteger & rhs)
  {
    return Comparison(sgt<Integer,Integer>(),lhs,rhs);
  }
  inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs)
  {
    return Comparison(sge<Integer,Integer>(),lhs,rhs);
  }
  inline vInteger operator == (const vInteger & lhs, const vInteger & rhs)
  {
    return Comparison(seq<Integer,Integer>(),lhs,rhs);
  }
  inline vInteger operator != (const vInteger & lhs, const vInteger & rhs)
  {
    return Comparison(sne<Integer,Integer>(),lhs,rhs);
  }

  //////////////////////////////////////////////////////////////////////////
  // relational operators
  //
  // Support <,>,<=,>=,==,!=
  //
  // Query supporting bitwise &, |, ^, !
  // Query supporting logical &&, ||,
  //////////////////////////////////////////////////////////////////////////
  template<class vfunctor,class lobj,class robj>
  inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
  {
    Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
      ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
    }
    return ret;
  }
  template<class vfunctor,class lobj,class robj>
  inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
  {
    Lattice<vInteger> ret(lhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<lhs._grid->oSites(); ss++){
      ret._odata[ss]=op(lhs._odata[ss],rhs);
    }
    return ret;
  }
  template<class vfunctor,class lobj,class robj>
  inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
  {
    Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
    for(int ss=0;ss<rhs._grid->oSites(); ss++){
      ret._odata[ss]=op(lhs,rhs._odata[ss]); // lhs is the site scalar here; only rhs is indexed per site
    }
    return ret;
  }

  // Less than
  template<class lobj,class robj>
  inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(vlt<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(vlt<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vlt<lobj,robj>(),lhs,rhs);
  }

  // Less than equal
  template<class lobj,class robj>
  inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(vle<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(vle<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vle<lobj,robj>(),lhs,rhs);
  }

  // Greater than
  template<class lobj,class robj>
  inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(vgt<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(vgt<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vgt<lobj,robj>(),lhs,rhs);
  }

  // Greater than equal
  template<class lobj,class robj>
  inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(vge<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(vge<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vge<lobj,robj>(),lhs,rhs);
  }

  // equal
  template<class lobj,class robj>
  inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(veq<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(veq<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(veq<lobj,robj>(),lhs,rhs);
  }

  // not equal
  template<class lobj,class robj>
  inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
    return LLComparison(vne<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
    return LSComparison(vne<lobj,robj>(),lhs,rhs);
  }
  template<class lobj,class robj>
  inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
    return SLComparison(vne<lobj,robj>(),lhs,rhs);
  }

}
#endif
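// Illustrative usage sketch (assumed field typedefs, not part of this commit): the overloads
// above return a Lattice<vInteger> mask that can feed the predicated where() defined later
// in lib/Grid_predicated.h.
//
//   LatticeReal a(&grid), b(&grid), c(&grid);   // hypothetical typedefs over Lattice<...>
//   LatticeInteger mask = (a < b);              // site-wise comparison
//   c = where(mask, a, b);                      // pick the smaller field value per site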
104
lib/Grid_config.h
Normal file
@ -0,0 +1,104 @@
/* Grid_config.h.  Generated from Grid_config.h.in by configure.  */
/* Grid_config.h.in.  Generated from configure.ac by autoheader.  */

/* AVX */
#define AVX1 1

/* AVX2 */
/* #undef AVX2 */

/* AVX512 */
/* #undef AVX512 */

/* GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1

/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */

/* Define to 1 if you have the `gettimeofday' function. */
#define HAVE_GETTIMEOFDAY 1

/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1

/* Define to 1 if you have the <malloc.h> header file. */
/* #undef HAVE_MALLOC_H */

/* Define to 1 if you have the <malloc/malloc.h> header file. */
#define HAVE_MALLOC_MALLOC_H 1

/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1

/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1

/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1

/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1

/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1

/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1

/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1

/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1

/* Name of package */
#define PACKAGE "grid"

/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"

/* Define to the full name of this package. */
#define PACKAGE_NAME "Grid"

/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Grid 1.0"

/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "grid"

/* Define to the home page for this package. */
#define PACKAGE_URL ""

/* Define to the version of this package. */
#define PACKAGE_VERSION "1.0"

/* SSE4 */
/* #undef SSE4 */

/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

/* Version number of package */
#define VERSION "1.0"

/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
   #define below would cause a syntax error. */
/* #undef _UINT32_T */

/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
   #define below would cause a syntax error. */
/* #undef _UINT64_T */

/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */

/* Define to the type of an unsigned integer type of width exactly 32 bits if
   such a type exists and the standard includes do not define it. */
/* #undef uint32_t */

/* Define to the type of an unsigned integer type of width exactly 64 bits if
   such a type exists and the standard includes do not define it. */
/* #undef uint64_t */
103
lib/Grid_config.h.in
Normal file
@ -0,0 +1,103 @@
/* Grid_config.h.in.  Generated from configure.ac by autoheader.  */

/* AVX */
#undef AVX1

/* AVX2 */
#undef AVX2

/* AVX512 */
#undef AVX512

/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI

/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE

/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY

/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H

/* Define to 1 if you have the <malloc.h> header file. */
#undef HAVE_MALLOC_H

/* Define to 1 if you have the <malloc/malloc.h> header file. */
#undef HAVE_MALLOC_MALLOC_H

/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H

/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H

/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H

/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H

/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H

/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H

/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H

/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H

/* Name of package */
#undef PACKAGE

/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT

/* Define to the full name of this package. */
#undef PACKAGE_NAME

/* Define to the full name and version of this package. */
#undef PACKAGE_STRING

/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME

/* Define to the home page for this package. */
#undef PACKAGE_URL

/* Define to the version of this package. */
#undef PACKAGE_VERSION

/* SSE4 */
#undef SSE4

/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

/* Version number of package */
#undef VERSION

/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
   #define below would cause a syntax error. */
#undef _UINT32_T

/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
   #define below would cause a syntax error. */
#undef _UINT64_T

/* Define to `unsigned int' if <sys/types.h> does not define. */
#undef size_t

/* Define to the type of an unsigned integer type of width exactly 32 bits if
   such a type exists and the standard includes do not define it. */
#undef uint32_t

/* Define to the type of an unsigned integer type of width exactly 64 bits if
   such a type exists and the standard includes do not define it. */
#undef uint64_t
16
lib/Grid_cshift.h
Normal file
@ -0,0 +1,16 @@
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_
#include <Grid_cshift_common.h>

#ifdef GRID_COMMS_NONE
#include <Grid_cshift_none.h>
#endif

#ifdef GRID_COMMS_FAKE
#include <Grid_cshift_fake.h>
#endif

#ifdef GRID_COMMS_MPI
#include <Grid_cshift_mpi.h>
#endif
#endif
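// Illustrative usage sketch (assumed field type, not part of this commit): a circular
// shift of a lattice field by one site in direction mu, dispatched to whichever comms
// implementation the configure step selected above.
//
//   LatticeComplex shifted = Cshift(field, mu, 1);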
326
lib/Grid_cshift_common.h
Normal file
@ -0,0 +1,326 @@
|
||||
#ifndef _GRID_CSHIFT_COMMON_H_
|
||||
#define _GRID_CSHIFT_COMMON_H_
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
buffer[bo++]=rhs._odata[so+o+b];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[bo]=rhs._odata[so+o+b];
|
||||
bo++;
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
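// Note (added for clarity): cbmask selects which checkerboard(s) of the plane are gathered.
// The values 0x1 and 0x2 appear to pick one parity each, while 0x3 takes every site in the
// plane; this matches the one-pass/two-pass logic used by Cshift_local further down.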
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there *is* need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb & cbmask ) {
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there *is* need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
merge(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
merge(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// local to node block strided copies
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*rhs._grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Local to node Cshift
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
|
||||
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
|
||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Cshift_local(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
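// Note (added for clarity): when the shift moves the two checkerboards by different amounts
// (sshift[0] != sshift[1]) each parity is shifted in its own pass with cbmask 0x1 and 0x2;
// otherwise a single 0x3 pass covers both parities at once.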
|
||||
|
||||
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid = rhs._grid;
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int gd = grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
int permute_dim =grid->PermuteDim(dimension);
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
|
||||
int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
|
||||
|
||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
263
lib/Grid_cshift_mpi.h
Normal file
@ -0,0 +1,263 @@
|
||||
#ifndef _GRID_MPI_CSHIFT_H_
|
||||
#define _GRID_MPI_CSHIFT_H_
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
int simd_layout = rhs._grid->_simd_layout[dimension];
|
||||
int comm_dim = rhs._grid->_processors[dimension] >1 ;
|
||||
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
|
||||
if ( !comm_dim ) {
|
||||
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
||||
} else if ( splice_dim ) {
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift);
|
||||
} else {
|
||||
Cshift_comms(ret,rhs,dimension,shift);
|
||||
}
|
||||
return ret;
|
||||
}
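// Note (added for clarity): the dispatch above distinguishes three cases -- no processor
// decomposition in this dimension (purely local shift), a dimension that is both SIMD-split
// and communicated (extract/merge path in Cshift_comms_simd), and plain MPI comms.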
|
||||
|
||||
friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
|
||||
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
|
||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Cshift_comms(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
|
||||
friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
|
||||
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
|
||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=rhs._grid;
|
||||
Lattice<vobj> temp(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int simd_layout = rhs._grid->_simd_layout[dimension];
|
||||
int comm_dim = rhs._grid->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
if (!offnode) {
|
||||
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
} else {
|
||||
|
||||
int words = send_buf.size();
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
||||
}
|
||||
}
|
||||
}
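// Note (added for clarity): for each reduced-dimension slice that falls off-node, the routine
// gathers the face into send_buf, exchanges it with the neighbour rank returned by
// ShiftedRanks, and scatters recv_buf into the destination plane of ret.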
|
||||
|
||||
|
||||
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=rhs._grid;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
int bytes = buffer_size*words*sizeof(scalar_type);
|
||||
|
||||
std::vector<scalar_type *> pointers(Nsimd); //
|
||||
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
int cb = (cbmask==0x2)? 1 : 0;
|
||||
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||
std::vector<int> icoor(grid->Nd());
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int comm_any = 0;
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
int shifted_x = x+s*rd+sshift;
|
||||
comm_offnode[s] = shifted_x >= ld;
|
||||
comm_any = comm_any | comm_offnode[s];
|
||||
comm_proc[s] = shifted_x/ld;
|
||||
}
|
||||
|
||||
int o = 0;
|
||||
int bo = x*grid->_ostride[dimension];
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
if ( comm_any ) {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
}
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
|
||||
int s;
|
||||
grid->iCoorFromIindex(icoor,i);
|
||||
s = icoor[dimension];
|
||||
|
||||
if(comm_offnode[s]){
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf_extract[i][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
||||
|
||||
} else {
|
||||
|
||||
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Permute by swizzling pointers in merge
|
||||
int permute_slice=0;
|
||||
int lshift=sshift%ld;
|
||||
int wrap =lshift/rd;
|
||||
int num =lshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
int toggle_bit = (Nsimd>>(permute_type+1));
|
||||
int PermuteMap;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( permute_slice ) {
|
||||
PermuteMap=i^toggle_bit;
|
||||
pointers[i] = rpointers[PermuteMap];
|
||||
} else {
|
||||
pointers[i] = rpointers[i];
|
||||
}
|
||||
}
|
||||
|
||||
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
|
||||
|
||||
} else {
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
12
lib/Grid_cshift_none.h
Normal file
@ -0,0 +1,12 @@
#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_

friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
  Lattice<vobj> ret(rhs._grid);
  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
  Cshift_local(ret,rhs,dimension,shift);
  return ret;
}

#endif
91
lib/Grid_init.cc
Executable file
@ -0,0 +1,91 @@
/****************************************************************************/
/* PAB: Signal magic. Processor state dump is x86-64 specific               */
/****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <signal.h>

#include "Grid.h"

#undef __X86_64
namespace Grid {

  void Grid_init(int *argc,char ***argv)
  {
#ifdef GRID_COMMS_MPI
    MPI_Init(argc,argv);
#endif
    Grid_debug_handler_init();
  }
  void Grid_finalize(void)
  {
#ifdef GRID_COMMS_MPI
    MPI_Finalize();
#endif
  }
  double usecond(void) {
    struct timeval tv;
    gettimeofday(&tv,NULL);
    return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
  }

  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  {
    printf("Caught signal %d\n",si->si_signo);
    printf("  mem address %llx\n",(uint64_t)si->si_addr);
    printf("         code %d\n",si->si_code);

#ifdef __X86_64
    ucontext_t * uc= (ucontext_t *)ptr;
    struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
    printf("  instruction %llx\n",(uint64_t)sc->rip);

#define REG(A)  printf("  %s %lx\n",#A, sc-> A);
    REG(rdi);
    REG(rsi);
    REG(rbp);
    REG(rbx);
    REG(rdx);
    REG(rax);
    REG(rcx);
    REG(rsp);
    REG(rip);

    REG(r8);
    REG(r9);
    REG(r10);
    REG(r11);
    REG(r12);
    REG(r13);
    REG(r14);
    REG(r15);
#endif

    fflush(stdout);

    if ( si->si_signo == SIGSEGV ) {
      printf("Grid_sa_signal_handler: Oops... this was a sigsegv you naughty naughty programmer. Goodbye\n");
      fflush(stdout);
      exit(-1);
    }
    return;
  };

  void Grid_debug_handler_init(void)
  {
    struct sigaction sa,osa;
    sigemptyset (&sa.sa_mask);
    sa.sa_sigaction = Grid_sa_signal_handler;
    sa.sa_flags     = SA_SIGINFO;
    sigaction(SIGSEGV,&sa,NULL);
    sigaction(SIGTRAP,&sa,NULL);
  }
}
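// Minimal driver sketch (illustrative only; the grid/lattice construction in the middle is
// elided because the constructor signatures are not shown in this commit): initialise comms
// and the signal handlers, do the work, then finalise.
//
//   int main(int argc,char **argv)
//   {
//     Grid::Grid_init(&argc,&argv);
//     // ... construct grids and lattice fields, run the calculation ...
//     Grid::Grid_finalize();
//     return 0;
//   }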
100
lib/Grid_math_type_mapper.h
Normal file
@ -0,0 +1,100 @@
#ifndef GRID_MATH_TYPE_MAPPER_H
#define GRID_MATH_TYPE_MAPPER_H

namespace Grid {

  //////////////////////////////////////////////////////////////////////////////////
  // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
  // Use of a helper class like this allows us to template specialise and "dress"
  // other classes such as RealD == double, ComplexD == std::complex<double> with these
  // traits.
  //
  // It is possible that we could do this more elegantly if I introduced a
  // queryable trait in iScalar, iMatrix and iVector and used the query on vtype in
  // place of the type mapper?
  //
  // Not sure how to do this, but probably could be done with a research effort
  // to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
  //
  //////////////////////////////////////////////////////////////////////////////////

  template <class T> class GridTypeMapper {
  public:
    typedef typename T::scalar_type scalar_type;
    typedef typename T::vector_type vector_type;
    typedef typename T::tensor_reduced tensor_reduced;
    enum { TensorLevel = T::TensorLevel };
  };

  //////////////////////////////////////////////////////////////////////////////////
  // Recursion stops with these template specialisations
  //////////////////////////////////////////////////////////////////////////////////
  template<> class GridTypeMapper<RealF> {
  public:
    typedef RealF scalar_type;
    typedef RealF vector_type;
    typedef RealF tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<RealD> {
  public:
    typedef RealD scalar_type;
    typedef RealD vector_type;
    typedef RealD tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexF> {
  public:
    typedef ComplexF scalar_type;
    typedef ComplexF vector_type;
    typedef ComplexF tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexD> {
  public:
    typedef ComplexD scalar_type;
    typedef ComplexD vector_type;
    typedef ComplexD tensor_reduced;
    enum { TensorLevel = 0 };
  };

  template<> class GridTypeMapper<vRealF> {
  public:
    typedef RealF  scalar_type;
    typedef vRealF vector_type;
    typedef vRealF tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vRealD> {
  public:
    typedef RealD  scalar_type;
    typedef vRealD vector_type;
    typedef vRealD tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexF> {
  public:
    typedef ComplexF  scalar_type;
    typedef vComplexF vector_type;
    typedef vComplexF tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexD> {
  public:
    typedef ComplexD  scalar_type;
    typedef vComplexD vector_type;
    typedef vComplexD tensor_reduced;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vInteger> {
  public:
    typedef Integer  scalar_type;
    typedef vInteger vector_type;
    typedef vInteger tensor_reduced;
    enum { TensorLevel = 0 };
  };

}

#endif
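// Worked example (added for clarity; the iScalar instantiation is hypothetical but follows
// the recursion described in the comment at the top of this header):
//
//   GridTypeMapper< iScalar<vComplexD> >::scalar_type  is ComplexD
//   GridTypeMapper< iScalar<vComplexD> >::vector_type  is vComplexD
//
// i.e. the primary template forwards the nested typedefs until one of the specialisations
// above terminates the recursion at a plain scalar or SIMD vector type.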
1542
lib/Grid_math_types.h
Normal file
File diff suppressed because it is too large
62
lib/Grid_predicated.h
Normal file
@ -0,0 +1,62 @@
#ifndef GRID_PREDICATED_H
#define GRID_PREDICATED_H

// Must implement the predicate gating the where() statement.
// Must be able to reduce the predicate down to a single vInteger per site.
// Must be able to require the type be iScalar x iScalar x ....
//      give a GetVtype method in iScalar
//      and blow away the tensor structures.
//
template<class vobj>
inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
  conformable(iftrue,ret);

  GridBase *grid=iftrue._grid;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  const int Nsimd = grid->Nsimd();
  const int words = sizeof(vobj)/sizeof(vector_type);

  std::vector<Integer> mask(Nsimd);
  std::vector<std::vector<scalar_type> > truevals (Nsimd,std::vector<scalar_type>(words) );
  std::vector<std::vector<scalar_type> > falsevals(Nsimd,std::vector<scalar_type>(words) );
  std::vector<scalar_type *> pointers(Nsimd);

#pragma omp parallel for
  for(int ss=0;ss<iftrue._grid->oSites(); ss++){

    for(int s=0;s<Nsimd;s++) pointers[s] = & truevals[s][0];
    extract(iftrue._odata[ss]   ,pointers);

    for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
    extract(iffalse._odata[ss]  ,pointers);

    extract(predicate._odata[ss],mask);

    for(int s=0;s<Nsimd;s++){
      if (mask[s]) pointers[s]=&truevals[s][0];
      else         pointers[s]=&falsevals[s][0];
    }

    merge(ret._odata[ss],pointers);
  }
}

template<class vobj>
inline Lattice<vobj> where(const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);

  Lattice<vobj> ret(iftrue._grid);

  where(ret,predicate,iftrue,iffalse);

  return ret;
}

#endif
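// Illustrative usage sketch (assumed field typedefs and scalar assignment, not part of this
// commit): clamp a field from below at zero using a comparison mask from Grid_comparison.h.
//
//   LatticeReal x(&grid), floor_field(&grid);
//   floor_field = 0.0;                                   // assumes scalar assignment exists
//   LatticeReal clamped = where(x > floor_field, x, floor_field);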
293
lib/Grid_simd.h
Normal file
@ -0,0 +1,293 @@
|
||||
#ifndef GRID_SIMD_H
|
||||
#define GRID_SIMD_H
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Define scalar and vector floating point types
|
||||
//
|
||||
// Scalar: RealF, RealD, ComplexF, ComplexD
|
||||
//
|
||||
// Vector: vRealF, vRealD, vComplexF, vComplexD
|
||||
//
|
||||
// Vector types are arch dependent
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#ifdef SSE4
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#if defined(AVX1) || defined (AVX2)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
|
||||
typedef float RealF;
|
||||
typedef double RealD;
|
||||
|
||||
typedef std::complex<RealF> ComplexF;
|
||||
typedef std::complex<RealD> ComplexD;
|
||||
|
||||
|
||||
inline RealF adj(const RealF & r){ return r; }
|
||||
inline RealF conj(const RealF & r){ return r; }
|
||||
inline RealF real(const RealF & r){ return r; }
|
||||
|
||||
inline RealD adj(const RealD & r){ return r; }
|
||||
inline RealD conj(const RealD & r){ return r; }
|
||||
inline RealD real(const RealD & r){ return r; }
|
||||
|
||||
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
|
||||
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
|
||||
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//Provide support functions for basic real and complex data types required by Grid
|
||||
//Single and double precision versions. Should be able to template this once only.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
|
||||
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
|
||||
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
|
||||
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
|
||||
inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
|
||||
// conj already supported for complex
|
||||
|
||||
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
|
||||
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
|
||||
//conj already supported for complex
|
||||
|
||||
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
|
||||
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
|
||||
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
|
||||
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
|
||||
|
||||
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
|
||||
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
|
||||
|
||||
class Zero{};
|
||||
static Zero zero;
|
||||
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
|
||||
template<> inline void zeroit(ComplexF &arg){ arg=0; };
|
||||
template<> inline void zeroit(ComplexD &arg){ arg=0; };
|
||||
template<> inline void zeroit(RealF &arg){ arg=0; };
|
||||
template<> inline void zeroit(RealD &arg){ arg=0; };
|
||||
|
||||
|
||||
#if defined (SSE4)
|
||||
typedef __m128 fvec;
|
||||
typedef __m128d dvec;
|
||||
typedef __m128 cvec;
|
||||
typedef __m128d zvec;
|
||||
typedef __m128i ivec;
|
||||
#endif
|
||||
#if defined (AVX1) || defined (AVX2)
|
||||
typedef __m256 fvec;
|
||||
typedef __m256d dvec;
|
||||
typedef __m256 cvec;
|
||||
typedef __m256d zvec;
|
||||
typedef __m256i ivec;
|
||||
#endif
|
||||
#if defined (AVX512)
|
||||
typedef __m512 fvec;
|
||||
typedef __m512d dvec;
|
||||
typedef __m512 cvec;
|
||||
typedef __m512d zvec;
|
||||
typedef __m512i ivec;
|
||||
#endif
|
||||
#if defined (QPX)
|
||||
typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
|
||||
typedef float cvec __attribute__ ((vector_size (16)));
|
||||
|
||||
typedef vector4double dvec;
|
||||
typedef vector4double zvec;
|
||||
#endif
|
||||
#if defined (AVX1) || defined (AVX2) || defined (AVX512)
|
||||
inline void v_prefetch0(int size, const char *ptr){
|
||||
for(int i=0;i<size;i+=64){ // Define L1 linesize above// What about SSE?
|
||||
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
|
||||
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
inline void v_prefetch0(int size, const char *ptr){};
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Generic extract/merge/permute
|
||||
/////////////////////////////////////////////////////////////////
|
||||
template<class vsimd,class scalar>
|
||||
inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
|
||||
// FIXME: bounce off stack is painful
|
||||
// temporary hack while I figure out better way.
|
||||
// There are intrinsics to do this work without the storage.
|
||||
int Nextr=extracted.size();
|
||||
int Nsimd=vsimd::Nsimd();
|
||||
int s=Nsimd/Nextr;
|
||||
|
||||
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
|
||||
vstore(y,&buf[0]);
|
||||
for(int i=0;i<Nextr;i++){
|
||||
*extracted[i] = buf[i*s];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
template<class vsimd,class scalar>
|
||||
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
||||
int Nextr=extracted.size();
|
||||
int Nsimd=vsimd::Nsimd();
|
||||
int s=Nsimd/Nextr;
|
||||
|
||||
std::vector<scalar> buf(Nsimd);
|
||||
for(int i=0;i<Nextr;i++){
|
||||
for(int ii=0;ii<s;ii++){
|
||||
buf[i*s+ii]=*extracted[i];
|
||||
}
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,&buf[0]);
|
||||
};
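// Note (added for clarity): Gextract spreads the Nsimd lanes of a SIMD vector across the
// supplied scalar pointers with stride s = Nsimd/Nextr, and Gmerge is its inverse; the
// cshift and predication code elsewhere in this commit uses these to address individual lanes.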
|
||||
template<class vsimd,class scalar>
|
||||
inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
|
||||
// FIXME: bounce off stack is painful
|
||||
// temporary hack while I figure out better way.
|
||||
// There are intrinsics to do this work without the storage.
|
||||
int Nextr=extracted.size();
|
||||
int Nsimd=vsimd::Nsimd();
|
||||
int s=Nsimd/Nextr;
|
||||
|
||||
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
|
||||
|
||||
vstore(y,&buf[0]);
|
||||
|
||||
for(int i=0;i<Nextr;i++){
|
||||
extracted[i] = buf[i*s];
|
||||
}
|
||||
};
|
||||
template<class vsimd,class scalar>
|
||||
inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
|
||||
int Nextr=extracted.size();
|
||||
int Nsimd=vsimd::Nsimd();
|
||||
int s=Nsimd/Nextr;
|
||||
|
||||
std::vector<scalar> buf(Nsimd);
|
||||
for(int i=0;i<Nextr;i++){
|
||||
for(int ii=0;ii<s;ii++){
|
||||
buf[i*s+ii]=extracted[i];
|
||||
}
|
||||
}
|
||||
vset(y,&buf[0]);
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
||||
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||
// Permute 4 possible on half precision @512bit vectors.
|
||||
//////////////////////////////////////////////////////////
|
||||
template<class vsimd>
|
||||
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
|
||||
switch (perm){
|
||||
#if defined(AVX1)||defined(AVX2)
|
||||
// 8x32 bits=>3 permutes
|
||||
case 2: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// 16 floats=> permutes
|
||||
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
|
||||
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef klij opmn
|
||||
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
|
||||
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
|
||||
case 3: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
|
||||
case 2: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
|
||||
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error not implemented
|
||||
#endif
|
||||
default: assert(0); break;
|
||||
}
|
||||
};
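// Worked example (added for clarity, AVX case): _mm256_permute2f128_ps with control 0x01
// exchanges the two 128-bit halves, so lanes A B C D E F G H become E F G H A B C D; the
// shuffle cases rearrange within each 128-bit half, at pair (case 2) and pair-of-pairs
// (case 1) granularity.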
|
||||
};
|
||||
|
||||
#include <Grid_vInteger.h>
|
||||
#include <Grid_vRealF.h>
|
||||
#include <Grid_vRealD.h>
|
||||
#include <Grid_vComplexF.h>
|
||||
#include <Grid_vComplexD.h>
|
||||
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// NB: Template the following on "type Complex" and then implement *,+,- for
|
||||
// ComplexF, ComplexD, RealF, RealD above to
|
||||
// get full generality of binops with scalars.
|
||||
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
// Default precision
|
||||
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
|
||||
typedef RealD Real;
|
||||
typedef vRealD vReal;
|
||||
typedef vComplexD vComplex;
|
||||
typedef std::complex<Real> Complex;
|
||||
#else
|
||||
typedef RealF Real;
|
||||
typedef vRealF vReal;
|
||||
typedef vComplexF vComplex;
|
||||
typedef std::complex<Real> Complex;
|
||||
#endif
|
||||
}
|
||||
#endif
|
351
lib/Grid_stencil.h
Normal file
@ -0,0 +1,351 @@
|
||||
#ifndef GRID_STENCIL_H
|
||||
#define GRID_STENCIL_H
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so need
|
||||
// additional stencil support.
|
||||
//
|
||||
// Stencil based code will pre-exchange haloes and use a table lookup for neighbours.
|
||||
// This will be done with generality to allow easier efficient implementations.
|
||||
// Overlap of comms and compute could be semi-automated by tabulating off-node connected,
|
||||
// and
|
||||
//
|
||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||
//
|
||||
// Grid could create a neighbour index table for a given stencil.
|
||||
//
|
||||
// Could also implement CovariantCshift, to fuse the loops and enhance performance.
|
||||
//
|
||||
//
|
||||
// General stencil computation:
|
||||
//
|
||||
// Generic services
|
||||
// 0) Prebuild neighbour tables
|
||||
// 1) Compute sizes of all haloes/comms buffers; allocate them.
|
||||
//
|
||||
// 2) Gather all faces, and communicate.
|
||||
// 3) Loop over result sites, giving nbr index/offnode info for each
|
||||
//
|
||||
// Could take a
|
||||
// SpinProjectFaces
|
||||
// start comms
|
||||
// complete comms
|
||||
// Reconstruct Umu
|
||||
//
|
||||
// Approach.
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
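//
// Illustrative usage (a sketch only; "grid", "source" and the checkerboard value 0 are placeholders,
// and vobj stands for a concrete lattice site type): a nearest-neighbour stencil on a 4d grid would
// use one forward and one backward point per dimension,
//
//   std::vector<int> directions   ({0,1,2,3, 0, 1, 2, 3});
//   std::vector<int> displacements({1,1,1,1,-1,-1,-1,-1});
//   CartesianStencil stencil(&grid,8,0,directions,displacements);
//
//   std::vector<vobj,alignedAllocator<vobj> > comm_buf;
//   stencil.HaloExchange(source,comm_buf);
//
// after which _offsets/_is_local/_permute record, for each point and site, where the neighbour
// lives (lattice or comm_buf) and whether a SIMD permute is needed.
//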
|
||||
|
||||
namespace Grid {
|
||||
|
||||
struct CommsRequest {
|
||||
int words;
|
||||
int unified_buffer_offset;
|
||||
int tag;
|
||||
int to_rank;
|
||||
int from_rank;
|
||||
} ;
|
||||
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
public:
|
||||
|
||||
int _checkerboard;
|
||||
int _npoints; // Move to template param?
|
||||
GridBase * _grid;
|
||||
|
||||
// npoints of these
|
||||
std::vector<int> _directions;
|
||||
std::vector<int> _distances;
|
||||
std::vector<int> _comm_buf_size;
|
||||
std::vector<int> _permute_type;
|
||||
|
||||
// npoints x Osites() of these
|
||||
std::vector<std::vector<int> > _offsets;
|
||||
std::vector<std::vector<int> > _is_local;
|
||||
std::vector<std::vector<int> > _permute;
|
||||
|
||||
int _unified_buffer_size;
|
||||
int _request_count;
|
||||
|
||||
std::vector<CommsRequest> CommsRequests;
|
||||
|
||||
CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances);
|
||||
|
||||
|
||||
// Add to tables for the various cases; is this mistaken? Entries are only local if there is 1 processor in the dimension.
|
||||
// Can this be avoided with simpler coding of comms?
|
||||
void Local (int point, int dimension,int shift,int cbmask);
|
||||
void Comms (int point, int dimension,int shift,int cbmask);
|
||||
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
|
||||
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
|
||||
|
||||
// Could allow a functional munging of the halo to another type during the comms.
|
||||
// this could implement the 16bit/32bit/64bit compression.
|
||||
template<class vobj> void HaloExchange(Lattice<vobj> &source,
|
||||
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
|
||||
{
|
||||
// conformable(source._grid,_grid);
|
||||
assert(source._grid==_grid);
|
||||
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
||||
int u_comm_offset=0;
|
||||
|
||||
// Gather all comms buffers
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
for(int point = 0 ; point < _npoints; point++) {
|
||||
|
||||
printf("Point %d \n",point);fflush(stdout);
|
||||
int dimension = _directions[point];
|
||||
int displacement = _distances[point];
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
int shift = (displacement+fd)%fd;
|
||||
|
||||
int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
|
||||
assert (checkerboard== _checkerboard);
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
// Gather phase
|
||||
int sshift [2];
|
||||
if ( comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
if (splice_dim) {
|
||||
printf("splice 0x3 \n");fflush(stdout);
|
||||
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
|
||||
} else {
|
||||
printf("NO splice 0x3 \n");fflush(stdout);
|
||||
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
|
||||
}
|
||||
} else {
|
||||
if(splice_dim){
|
||||
printf("splice 0x1,2 \n");fflush(stdout);
|
||||
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
|
||||
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
|
||||
} else {
|
||||
printf("NO splice 0x1,2 \n");fflush(stdout);
|
||||
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
|
||||
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
|
||||
int &u_comm_offset)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=_grid;
|
||||
assert(rhs._grid==_grid);
|
||||
// conformable(_grid,rhs._grid);
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
|
||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
|
||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
if (offnode) {
|
||||
|
||||
printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
|
||||
int words = send_buf.size();
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
|
||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||
|
||||
printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
|
||||
|
||||
printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
|
||||
for(int i=0;i<buffer_size;i++){
|
||||
u_comm_buf[u_comm_offset+i]=recv_buf[i];
|
||||
}
|
||||
u_comm_offset+=buffer_size;
|
||||
printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class vobj>
|
||||
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
|
||||
int &u_comm_offset)
|
||||
{
|
||||
const int Nsimd = _grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=_grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
/* FIXME ALTERNATE BUFFER DETERMINATION */
|
||||
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
int bytes = buffer_size*words*sizeof(scalar_type);
|
||||
|
||||
std::vector<scalar_type *> pointers(Nsimd); //
|
||||
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
int cb = (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||
std::vector<int> icoor(_grid->Nd());
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int comm_any = 0;
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
int shifted_x = x+s*rd+sshift;
|
||||
comm_offnode[s] = shifted_x >= ld;
|
||||
comm_any = comm_any | comm_offnode[s];
|
||||
comm_proc[s] = shifted_x/ld;
|
||||
}
|
||||
|
||||
int o = 0;
|
||||
int bo = x*_grid->_ostride[dimension];
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
if ( comm_any ) {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
}
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int s;
|
||||
_grid->iCoorFromIindex(icoor,i);
|
||||
s = icoor[dimension];
|
||||
|
||||
if(comm_offnode[s]){
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
_grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
_grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf_extract[i][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
||||
|
||||
} else {
|
||||
|
||||
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Permute by swizzling pointers in merge
|
||||
int permute_slice=0;
|
||||
int lshift=sshift%ld;
|
||||
int wrap =lshift/rd;
|
||||
int num =lshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
int toggle_bit = (Nsimd>>(permute_type+1));
|
||||
int PermuteMap;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( permute_slice ) {
|
||||
PermuteMap=i^toggle_bit;
|
||||
pointers[i] = rpointers[PermuteMap];
|
||||
} else {
|
||||
pointers[i] = rpointers[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Here we don't want to scatter, just place into a buffer.
|
||||
for(int i=0;i<buffer_size;i++){
|
||||
merge(u_comm_buf[u_comm_offset+i],pointers);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
258
lib/Grid_stencil_common.cc
Normal file
@ -0,0 +1,258 @@
|
||||
#include "Grid.h"
|
||||
|
||||
namespace Grid {
|
||||
|
||||
CartesianStencil::CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances)
|
||||
: _offsets(npoints),
|
||||
_is_local(npoints),
|
||||
_comm_buf_size(npoints),
|
||||
_permute_type(npoints),
|
||||
_permute(npoints)
|
||||
{
|
||||
_npoints = npoints;
|
||||
_grid = grid;
|
||||
_directions = directions;
|
||||
_distances = distances;
|
||||
_unified_buffer_size=0;
|
||||
_request_count =0;
|
||||
CommsRequests.resize(0);
|
||||
|
||||
int osites = _grid->oSites();
|
||||
|
||||
for(int i=0;i<npoints;i++){
|
||||
|
||||
int point = i;
|
||||
|
||||
_offsets[i].resize( osites);
|
||||
_is_local[i].resize(osites);
|
||||
_permute[i].resize( osites);
|
||||
|
||||
int dimension = directions[i];
|
||||
int displacement = distances[i];
|
||||
int shift = displacement;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
_permute_type[point]=_grid->PermuteType(dimension);
|
||||
|
||||
_checkerboard = checkerboard;
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
int sshift[2];
|
||||
|
||||
// Underlying approach. For each local site build
|
||||
// up a table containing the npoint "neighbours" and whether they
|
||||
// live in lattice or a comms buffer.
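// Sketch of the intended lookup (illustrative; "lattice_odata" is a placeholder name):
//   if ( _is_local[point][osite] ) neighbour = lattice_odata[ _offsets[point][osite] ];
//   else                           neighbour = u_comm_buf [ _offsets[point][osite] ];
// _permute[point][osite] then says whether a SIMD-lane permute is still required.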
|
||||
if ( !comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Local(point,dimension,shift,0x3);
|
||||
} else {
|
||||
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
} else { // All permute extract done in comms phase prior to Stencil application
|
||||
// So tables are the same whether comm_dim or splice_dim
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Comms(point,dimension,shift,0x3);
|
||||
} else {
|
||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CartesianStencil::Local (int point, int dimension,int shift,int cbmask)
|
||||
{
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int gd = _grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
// the permute type
|
||||
int permute_dim =_grid->PermuteDim(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * _grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
|
||||
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
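// Worked example (illustrative): rd=4, sshift=5 -> wrap=1, num=1;
// slices x=0,1,2 (x < rd-num) take permute_slice=wrap=1, while x=3 takes 1-wrap=0.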
|
||||
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianStencil::Comms (int point,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=_grid;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||
// send to one or more remote nodes.
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
if (!offnode) {
|
||||
|
||||
int permute_slice=0;
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
CommsRequest cr;
|
||||
|
||||
cr.tag = _request_count++;
|
||||
cr.words = words;
|
||||
cr.unified_buffer_offset = _unified_buffer_size;
|
||||
_unified_buffer_size += words;
|
||||
grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
|
||||
|
||||
CommsRequests.push_back(cr);
|
||||
|
||||
ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_offsets [point][lo+o+b]=ro+o+b;
|
||||
_is_local[point][lo+o+b]=1;
|
||||
_permute [point][lo+o+b]=permute;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
_offsets [point][lo+o+b]=ro+o+b;
|
||||
_is_local[point][lo+o+b]=1;
|
||||
_permute [point][lo+o+b]=permute;
|
||||
}
|
||||
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_offsets [point][so+o+b]=offset+(bo++);
|
||||
_is_local[point][so+o+b]=0;
|
||||
_permute [point][so+o+b]=0;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
_offsets [point][so+o+b]=offset+(bo++);
|
||||
_is_local[point][so+o+b]=0;
|
||||
_permute [point][so+o+b]=0;
|
||||
}
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
12
lib/Grid_summation.h
Normal file
@ -0,0 +1,12 @@
|
||||
#ifndef GRID_SUMMATION_H
#define GRID_SUMMATION_H

// FIXME: stub only; the block summation is not yet implemented and simply returns.
template<class vobj>
inline void sumBlocks(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{
  GridBase * fine   = fineData._grid;
  GridBase * coarse = coarseData._grid;

  return;
}
#endif
|
346
lib/Grid_vComplexD.h
Normal file
@ -0,0 +1,346 @@
|
||||
#ifndef VCOMPLEXD_H
|
||||
#define VCOMPLEXD_H
|
||||
#include "Grid.h"
|
||||
#include "Grid_vComplexF.h"
|
||||
|
||||
namespace Grid {
|
||||
class vComplexD {
|
||||
public:
|
||||
zvec v;
|
||||
public:
|
||||
typedef zvec vector_type;
|
||||
typedef ComplexD scalar_type;
|
||||
|
||||
vComplexD & operator = ( Zero & z){
|
||||
vzero(*this);
|
||||
return (*this);
|
||||
}
|
||||
vComplexD(){};
|
||||
vComplexD(ComplexD a){
|
||||
vsplat(*this,a);
|
||||
};
|
||||
vComplexD(double a){
|
||||
vsplat(*this,ComplexD(a));
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// mac, mult, sub, add, adj
|
||||
// Should do an AVX2 version with mac.
|
||||
///////////////////////////////////////////////
|
||||
friend inline void mac (vComplexD * __restrict__ y,const vComplexD * __restrict__ a,const vComplexD *__restrict__ x) {*y = (*a)*(*x)+(*y);};
|
||||
friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
|
||||
friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
|
||||
friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
|
||||
friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
|
||||
|
||||
//////////////////////////////////
|
||||
// Initialise to 1,0,i
|
||||
//////////////////////////////////
|
||||
friend inline void vone (vComplexD &ret){ vsplat(ret,1.0,0.0);}
|
||||
friend inline void vzero (vComplexD &ret){ vsplat(ret,0.0,0.0);}
|
||||
friend inline void vcomplex_i(vComplexD &ret){ vsplat(ret,0.0,1.0);}
|
||||
|
||||
////////////////////////////////////
|
||||
// Arithmetic operator overloads +,-,*
|
||||
////////////////////////////////////
|
||||
friend inline vComplexD operator + (vComplexD a, vComplexD b)
|
||||
{
|
||||
vComplexD ret;
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_add_pd(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_add_pd(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_add_pd(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = vec_add(a.v,b.v);
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
friend inline vComplexD operator - (vComplexD a, vComplexD b)
|
||||
{
|
||||
vComplexD ret;
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_sub_pd(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_sub_pd(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_sub_pd(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = vec_sub(a.v,b.v);
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
friend inline vComplexD operator * (vComplexD a, vComplexD b)
|
||||
{
|
||||
vComplexD ret;
|
||||
|
||||
// Multiplication of (ak+i bk)*(ck+i dk)
|
||||
// a + i b can be stored as a data structure
|
||||
//From intel optimisation reference guide
|
||||
/*
|
||||
movsldup xmm0, Src1; load real parts into the destination,
|
||||
; a1, a1, a0, a0
|
||||
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
|
||||
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
|
||||
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
|
||||
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
|
||||
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
|
||||
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
|
||||
VSHUFPD (VEX.256 encoded version)
|
||||
IF IMM0[0] = 0
|
||||
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
|
||||
IF IMM0[1] = 0
|
||||
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
|
||||
IF IMM0[2] = 0
|
||||
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
|
||||
IF IMM0[3] = 0
|
||||
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
|
||||
*/
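// In short (sketch of the identity the vector code below implements):
//   (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
// ymm0 holds {ar*br, ar*bi}, ymm1 holds {ai*bi, ai*br}; addsub subtracts in the
// even (real) lanes and adds in the odd (imaginary) lanes.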
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
zvec ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, ar,ar b'00,00
|
||||
ymm0 = _mm256_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
|
||||
ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5); // ymm1 <- br,bi b'01,01
|
||||
ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF); // ymm2 <- ai,ai b'11,11
|
||||
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
ret.v= _mm256_addsub_pd(ymm0,ymm1);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
zvec ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
|
||||
ymm0 = _mm_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
|
||||
ymm1 = _mm_shuffle_pd(b.v,b.v,0x1); // ymm1 <- br,bi b01
|
||||
ymm2 = _mm_shuffle_pd(a.v,a.v,0x3); // ymm2 <- ai,ai b11
|
||||
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
ret.v= _mm_addsub_pd(ymm0,ymm1);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
/* This is from
|
||||
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
|
||||
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
|
||||
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
|
||||
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
|
||||
* booktitle = {Proceedings of the International Conference on Supercomputing},
|
||||
* series = {ICS '11},
|
||||
* year = {2011},
|
||||
* isbn = {978-1-4503-0102-2},
|
||||
* location = {Tucson, Arizona, USA},
|
||||
* pages = {265--274},
|
||||
* numpages = {10},
|
||||
* url = {http://doi.acm.org/10.1145/1995896.1995938},
|
||||
* doi = {10.1145/1995896.1995938},
|
||||
* acmid = {1995938},
|
||||
* publisher = {ACM},
|
||||
* address = {New York, NY, USA},
|
||||
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
|
||||
* }
|
||||
*/
|
||||
zvec vzero,ymm0,ymm1,real,imag;
|
||||
vzero = _mm512_setzero();
|
||||
ymm0 = _mm512_swizzle_pd(a.v, _MM_SWIZ_REG_CDAB); //
|
||||
real = _mm512_mask_or_epi64(a.v, 0xAAAA,vzero, ymm0);
|
||||
imag = _mm512_mask_sub_pd(a.v, 0x5555,vzero, ymm0);
|
||||
ymm1 = _mm512_mul_pd(real, b.v);
|
||||
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
|
||||
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
|
||||
/* Imag OK */
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = vec_mul(a.v,b.v);
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// General permute; assumes vector length is same across
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
// add the vector width as a template param for BG/Q for example
|
||||
////////////////////////////////////////////////////////////////////
|
||||
friend inline void permute(vComplexD &y,vComplexD b,int perm)
|
||||
{
|
||||
Gpermute<vComplexD>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||
{
|
||||
Gmerge<vComplexD,ComplexD >(y,extracted);
|
||||
}
|
||||
friend inline void extract(const vComplexD &y,std::vector<ComplexD *> &extracted)
|
||||
{
|
||||
Gextract<vComplexD,ComplexD>(y,extracted);
|
||||
}
|
||||
friend inline void merge(vComplexD &y,std::vector<ComplexD > &extracted)
|
||||
{
|
||||
Gmerge<vComplexD,ComplexD >(y,extracted);
|
||||
}
|
||||
friend inline void extract(const vComplexD &y,std::vector<ComplexD > &extracted)
|
||||
{
|
||||
Gextract<vComplexD,ComplexD>(y,extracted);
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
// Splat
|
||||
///////////////////////
|
||||
friend inline void vsplat(vComplexD &ret,ComplexD c){
|
||||
double a= real(c);
|
||||
double b= imag(c);
|
||||
vsplat(ret,a,b);
|
||||
}
|
||||
|
||||
|
||||
friend inline void vsplat(vComplexD &ret,double rl,double ig){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_pd(ig,rl,ig,rl);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_set_pd(ig,rl);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = {ig,rl,ig,rl};
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline void vset(vComplexD &ret,ComplexD *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_set_pd(a[0].imag(),a[0].real());
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
// Note v has a0 a1 a2 a3
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag()};
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
_mm256_store_pd((double *)a,ret.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
_mm_store_pd((double *)a,ret.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_store_pd((double *)a,ret.v);
|
||||
//Note v has a3 a2 a1 a0
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
friend inline void vprefetch(const vComplexD &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
////////////////////////
|
||||
// Conjugate
|
||||
////////////////////////
|
||||
friend inline vComplexD conj(const vComplexD &in){
|
||||
vComplexD ret ; vzero(ret);
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
|
||||
__m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
|
||||
ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_addsub_pd(ret.v,in.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// Xeon does not have fmaddsub or addsub
|
||||
// with mask 0xa (1010), v[0] -v[1] v[2] -v[3] ....
|
||||
ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);
|
||||
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
// REDUCE FIXME must be a cleaner implementation
|
||||
friend inline ComplexD Reduce(const vComplexD & in)
|
||||
{
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
|
||||
__attribute__ ((aligned(32))) double c_[4];
|
||||
_mm256_store_pd(c_,in.v);
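// Lanes alternate re,im,re,im, so even entries accumulate the real part and odd entries the imaginary part.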
|
||||
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend inline vComplexD operator -(const vComplexD &r) {
|
||||
vComplexD ret;
|
||||
vzero(ret);
|
||||
ret = ret - r;
|
||||
return ret;
|
||||
}
|
||||
// *=,+=,-= operators
|
||||
inline vComplexD &operator *=(const vComplexD &r) {
|
||||
*this = (*this)*r;
|
||||
return *this;
|
||||
}
|
||||
inline vComplexD &operator +=(const vComplexD &r) {
|
||||
*this = *this+r;
|
||||
return *this;
|
||||
}
|
||||
inline vComplexD &operator -=(const vComplexD &r) {
|
||||
*this = *this-r;
|
||||
return *this;
|
||||
}
|
||||
|
||||
public:
|
||||
static int Nsimd(void) { return sizeof(zvec)/sizeof(double)/2;}
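// e.g. a 256-bit AVX zvec holds 4 doubles, giving Nsimd()==2 complex doubles per vector (illustrative).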
|
||||
};
|
||||
|
||||
|
||||
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
|
||||
|
||||
|
||||
typedef vComplexD vDComplex;
|
||||
inline void zeroit(vComplexD &z){ vzero(z);}
|
||||
|
||||
inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r)
|
||||
{
|
||||
return l*r;
|
||||
}
|
||||
inline vComplexD trace(const vComplexD &arg){
|
||||
return arg;
|
||||
}
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
//// Generic routine to promote object<complex> -> object<vcomplex>
|
||||
//// Supports the array reordering transformation that gives me SIMD utilisation
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
template<template<class> class object>
|
||||
inline object<vComplex> splat(object<Complex >s){
|
||||
object<vComplex> ret;
|
||||
vComplex * v_ptr = (vComplex *)& ret;
|
||||
Complex * s_ptr = (Complex *) &s;
|
||||
for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
|
||||
vsplat(*(v_ptr++),*(s_ptr++));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
*/
|
||||
}
|
||||
#endif
|
384
lib/Grid_vComplexF.h
Normal file
@ -0,0 +1,384 @@
|
||||
#ifndef VCOMPLEXF
|
||||
#define VCOMPLEXF
|
||||
#include "Grid.h"
|
||||
|
||||
namespace Grid {
|
||||
|
||||
/*
|
||||
inline void Print(const char *A,cvec c) {
|
||||
float *fp=(float *)&c;
|
||||
printf(A);
|
||||
printf(" %le %le %le %le %le %le %le %le\n",
|
||||
fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
|
||||
}
|
||||
*/
|
||||
|
||||
class vComplexF {
|
||||
// protected:
|
||||
|
||||
public:
|
||||
cvec v;
|
||||
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
|
||||
public:
|
||||
typedef cvec vector_type;
|
||||
typedef ComplexF scalar_type;
|
||||
|
||||
vComplexF & operator = ( Zero & z){
|
||||
vzero(*this);
|
||||
return (*this);
|
||||
}
|
||||
vComplexF(){};
|
||||
vComplexF(ComplexF a){
|
||||
vsplat(*this,a);
|
||||
};
|
||||
vComplexF(double a){
|
||||
vsplat(*this,ComplexF(a));
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// mac, mult, sub, add, adj
|
||||
// Should do an AVX2 version with mac.
|
||||
///////////////////////////////////////////////
|
||||
friend inline void mac (vComplexF * __restrict__ y,const vComplexF * __restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
|
||||
|
||||
//////////////////////////////////
|
||||
// Initialise to 1,0,i
|
||||
//////////////////////////////////
|
||||
friend inline void vone(vComplexF &ret) { vsplat(ret,1.0,0.0); }
|
||||
friend inline void vzero(vComplexF &ret) { vsplat(ret,0.0,0.0); }
|
||||
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
|
||||
|
||||
////////////////////////////////////
|
||||
// Arithmetic operator overloads +,-,*
|
||||
////////////////////////////////////
|
||||
friend inline vComplexF operator + (vComplexF a, vComplexF b)
|
||||
{
|
||||
vComplexF ret;
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_add_ps(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_add_ps(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_add_ps(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
friend inline vComplexF operator - (vComplexF a, vComplexF b)
|
||||
{
|
||||
vComplexF ret;
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_sub_ps(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_sub_ps(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_sub_ps(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
friend inline vComplexF operator * (vComplexF a, vComplexF b)
|
||||
{
|
||||
vComplexF ret;
|
||||
|
||||
// Multiplication of (ak+i bk)*(ck+i dk)
|
||||
// a + i b can be stored as a data structure
|
||||
//From intel optimisation reference
|
||||
/*
|
||||
movsldup xmm0, Src1; load real parts into the destination,
|
||||
; a1, a1, a0, a0
|
||||
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
|
||||
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
|
||||
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
|
||||
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
|
||||
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
|
||||
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
|
||||
*/
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
cvec ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm256_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
|
||||
// FIXME AVX2 could MAC
|
||||
ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
ret.v= _mm256_addsub_ps(ymm0,ymm1);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
cvec ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
|
||||
ymm1 = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
ret.v= _mm_addsub_ps(ymm0,ymm1);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
//
|
||||
cvec vzero,ymm0,ymm1,real, imag;
|
||||
vzero = _mm512_setzero();
|
||||
ymm0 = _mm512_swizzle_ps(a.v, _MM_SWIZ_REG_CDAB); //
|
||||
real = _mm512_mask_or_epi32(a.v, 0xAAAA,vzero, ymm0);
|
||||
imag = _mm512_mask_sub_ps(a.v, 0x5555,vzero, ymm0);
|
||||
ymm1 = _mm512_mul_ps(real, b.v);
|
||||
ymm0 = _mm512_swizzle_ps(b.v, _MM_SWIZ_REG_CDAB); // OK
|
||||
ret.v = _mm512_fmadd_ps(ymm0,imag,ymm1);
|
||||
|
||||
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = vec_mul(a.v,b.v);
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
friend inline void vset(vComplexF &ret, ComplexF *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
// Splat
|
||||
///////////////////////
|
||||
friend inline void vsplat(vComplexF &ret,ComplexF c){
|
||||
float a= real(c);
|
||||
float b= imag(c);
|
||||
vsplat(ret,a,b);
|
||||
}
|
||||
|
||||
friend inline void vstore(const vComplexF &ret, ComplexF *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
_mm256_store_ps((float *)a,ret.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
_mm_store_ps((float *)a,ret.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_store_ps((float *)a,ret.v);
|
||||
//Note v has a3 a2 a1 a0
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
friend inline void vprefetch(const vComplexF &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
friend inline void vsplat(vComplexF &ret,float a,float b){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_set_ps(a,b,a,b);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = {a,b,a,b};
|
||||
#endif
|
||||
}
|
||||
friend inline ComplexF Reduce(const vComplexF & in)
|
||||
{
|
||||
#ifdef SSE4
|
||||
#error
|
||||
#endif
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
// FIXME this is inefficient; it spills to memory and does the reduction with scalar adds
|
||||
__attribute__ ((aligned(32))) float c_[8];
|
||||
_mm256_store_ps(c_,in.v);
|
||||
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
|
||||
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
|
||||
vComplexF va;
|
||||
vsplat(va,a);
|
||||
return va*b;
|
||||
}
|
||||
friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
|
||||
return a*b;
|
||||
}
|
||||
|
||||
/*
|
||||
template<class real>
|
||||
friend inline vComplexF operator * (vComplexF b,const real &a){
|
||||
vComplexF va;
|
||||
Complex ca(a,0);
|
||||
vsplat(va,ca);
|
||||
return va*b;
|
||||
}
|
||||
template<class real>
|
||||
friend inline vComplexF operator * (const real &a,vComplexF b){
|
||||
return a*b;
|
||||
}
|
||||
|
||||
friend inline vComplexF operator + (const Complex &a, vComplexF b){
|
||||
vComplexF va;
|
||||
vsplat(va,a);
|
||||
return va+b;
|
||||
}
|
||||
friend inline vComplexF operator + (vComplexF b,const Complex &a){
|
||||
return a+b;
|
||||
}
|
||||
template<class real>
|
||||
friend inline vComplexF operator + (vComplexF b,const real &a){
|
||||
vComplexF va;
|
||||
Complex ca(a,0);
|
||||
vsplat(va,ca);
|
||||
return va+b;
|
||||
}
|
||||
template<class real>
|
||||
friend inline vComplexF operator + (const real &a,vComplexF b){
|
||||
return a+b;
|
||||
}
|
||||
friend inline vComplexF operator - (const Complex &a, vComplexF b){
|
||||
vComplexF va;
|
||||
vsplat(va,a);
|
||||
return va-b;
|
||||
}
|
||||
friend inline vComplexF operator - (vComplexF b,const Complex &a){
|
||||
vComplexF va;
|
||||
vsplat(va,a);
|
||||
return b-va;
|
||||
}
|
||||
template<class real>
|
||||
friend inline vComplexF operator - (vComplexF b,const real &a){
|
||||
vComplexF va;
|
||||
Complex ca(a,0);
|
||||
vsplat(va,ca);
|
||||
return b-va;
|
||||
}
|
||||
template<class real>
|
||||
friend inline vComplexF operator - (const real &a,vComplexF b){
|
||||
vComplexF va;
|
||||
Complex ca(a,0);
|
||||
vsplat(va,ca);
|
||||
return va-b;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Conjugate
|
||||
///////////////////////
|
||||
|
||||
friend inline vComplexF conj(const vComplexF &in){
|
||||
vComplexF ret ; vzero(ret);
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
cvec tmp;
|
||||
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
|
||||
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_addsub_ps(ret.v,in.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend inline vComplexF operator -(const vComplexF &r) {
|
||||
vComplexF ret;
|
||||
vzero(ret);
|
||||
ret = ret - r;
|
||||
return ret;
|
||||
}
|
||||
// *=,+=,-= operators
|
||||
inline vComplexF &operator *=(const vComplexF &r) {
|
||||
*this = (*this)*r;
|
||||
return *this;
|
||||
}
|
||||
inline vComplexF &operator +=(const vComplexF &r) {
|
||||
*this = *this+r;
|
||||
return *this;
|
||||
}
|
||||
inline vComplexF &operator -=(const vComplexF &r) {
|
||||
*this = *this-r;
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend inline void permute(vComplexF &y,vComplexF b,int perm)
|
||||
{
|
||||
Gpermute<vComplexF>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||
{
|
||||
Gmerge<vComplexF,ComplexF >(y,extracted);
|
||||
}
|
||||
friend inline void extract(const vComplexF &y,std::vector<ComplexF *> &extracted)
|
||||
{
|
||||
Gextract<vComplexF,ComplexF>(y,extracted);
|
||||
}
|
||||
friend inline void merge(vComplexF &y,std::vector<ComplexF > &extracted)
|
||||
{
|
||||
Gmerge<vComplexF,ComplexF >(y,extracted);
|
||||
}
|
||||
friend inline void extract(const vComplexF &y,std::vector<ComplexF > &extracted)
|
||||
{
|
||||
Gextract<vComplexF,ComplexF>(y,extracted);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
|
||||
{
|
||||
return conj(l)*r;
|
||||
}
|
||||
|
||||
inline void zeroit(vComplexF &z){ vzero(z);}
|
||||
|
||||
inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
|
||||
{
|
||||
return l*r;
|
||||
}
|
||||
inline vComplexF trace(const vComplexF &arg){
|
||||
return arg;
|
||||
}
|
||||
}
|
||||
#endif
|
259
lib/Grid_vInteger.h
Normal file
@ -0,0 +1,259 @@
|
||||
#ifndef VINTEGER_H
|
||||
#define VINTEGER_H
|
||||
|
||||
#include "Grid.h"
|
||||
|
||||
namespace Grid {
|
||||
|
||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
|
||||
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
|
||||
|
||||
typedef uint32_t Integer;
|
||||
|
||||
class vInteger {
|
||||
protected:
|
||||
|
||||
public:
|
||||
|
||||
ivec v;
|
||||
|
||||
typedef ivec vector_type;
|
||||
typedef Integer scalar_type;
|
||||
|
||||
vInteger(){};
|
||||
vInteger & operator = (const Zero & z){
|
||||
vzero(*this);
|
||||
return (*this);
|
||||
}
|
||||
vInteger(Integer a){
|
||||
vsplat(*this,a);
|
||||
};
|
||||
////////////////////////////////////
|
||||
// Arithmetic operator overloads +,-,*
|
||||
////////////////////////////////////
|
||||
friend inline vInteger operator + ( vInteger a, vInteger b)
|
||||
{
|
||||
vInteger ret;
|
||||
#if defined (AVX1)
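// AVX1 lacks 256-bit integer arithmetic, so split into two 128-bit halves,
// add with SSE, and recombine.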
|
||||
__m128i a0,a1;
|
||||
__m128i b0,b1;
|
||||
a0 = _mm256_extractf128_si256(a.v,0);
|
||||
b0 = _mm256_extractf128_si256(b.v,0);
|
||||
a1 = _mm256_extractf128_si256(a.v,1);
|
||||
b1 = _mm256_extractf128_si256(b.v,1);
|
||||
a0 = _mm_add_epi32(a0,b0);
|
||||
a1 = _mm_add_epi32(a1,b1);
|
||||
ret.v = _mm256_set_m128i(a1,a0);
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
ret.v = _mm256_add_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_add_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_add_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
// Implement as array of ints is only option
|
||||
#error
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
friend inline vInteger operator - ( vInteger a, vInteger b)
|
||||
{
|
||||
vInteger ret;
|
||||
#if defined (AVX1)
|
||||
__m128i a0,a1;
|
||||
__m128i b0,b1;
|
||||
a0 = _mm256_extractf128_si256(a.v,0);
|
||||
b0 = _mm256_extractf128_si256(b.v,0);
|
||||
a1 = _mm256_extractf128_si256(a.v,1);
|
||||
b1 = _mm256_extractf128_si256(b.v,1);
|
||||
a0 = _mm_sub_epi32(a0,b0);
|
||||
a1 = _mm_sub_epi32(a1,b1);
|
||||
ret.v = _mm256_set_m128i(a1,a0);
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
ret.v = _mm256_sub_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_sub_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_sub_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
// Implement as array of ints is only option
|
||||
#error
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
friend inline vInteger operator * ( vInteger a, vInteger b)
|
||||
{
|
||||
vInteger ret;
|
||||
#if defined (AVX1)
|
||||
__m128i a0,a1;
|
||||
__m128i b0,b1;
|
||||
a0 = _mm256_extractf128_si256(a.v,0);
|
||||
b0 = _mm256_extractf128_si256(b.v,0);
|
||||
a1 = _mm256_extractf128_si256(a.v,1);
|
||||
b1 = _mm256_extractf128_si256(b.v,1);
|
||||
a0 = _mm_mul_epi32(a0,b0);
|
||||
a1 = _mm_mul_epi32(a1,b1);
|
||||
ret.v = _mm256_set_m128i(a1,a0);
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
ret.v = _mm256_mul_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_mul_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_mul_epi32(a.v,b.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
// Implement as array of ints is only option
|
||||
#error
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// mult, sub, add, adj,conj, mac functions
|
||||
///////////////////////////////////////////////
|
||||
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
|
||||
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
|
||||
friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);}
|
||||
friend inline void mac (vInteger &y,const vInteger a,const vInteger x){
|
||||
y = a*x+y;
|
||||
}
|
||||
|
||||
//////////////////////////////////
|
||||
// Initialise to 1,0,i
|
||||
//////////////////////////////////
|
||||
friend inline void vone (vInteger &ret){vsplat(ret,1);}
|
||||
friend inline void vzero(vInteger &ret){vsplat(ret,0);}
|
||||
friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);}
|
||||
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Broadcast a value across Nsimd copies.
|
||||
/////////////////////////////////////////////////////
|
||||
friend inline void vsplat(vInteger &ret,scalar_type a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set1_epi32(a);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_set1_epi32(a);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_set1_epi32(a);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error
|
||||
#endif
|
||||
}
|
||||
friend inline void vset(vInteger &ret,scalar_type *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
ret.v = _mm_set_epi32(a[3],a[2],a[1],a[0]); // _mm_set_epi32 takes the highest element first, so a[0] lands in lane 0
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
|
||||
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline void vstore(const vInteger &ret, Integer *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
_mm256_store_si256((__m256i*)a,ret.v);
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
_mm_store_si128((__m128i *)a,ret.v);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
_mm512_store_si512(a,ret.v);
|
||||
#endif
|
||||
#ifdef QPX
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline void vprefetch(const vInteger &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend inline vInteger operator -(const vInteger &r) {
|
||||
vInteger ret;
|
||||
vzero(ret);
|
||||
ret = ret - r;
|
||||
return ret;
|
||||
}
|
||||
friend inline Integer Reduce(const vInteger & in)
|
||||
{
|
||||
// unimplemented
|
||||
assert(0);
|
||||
}
|
||||
// *=,+=,-= operators
|
||||
inline vInteger &operator *=(const vInteger &r) {
|
||||
*this = (*this)*r;
|
||||
return *this;
|
||||
}
|
||||
inline vInteger &operator +=(const vInteger &r) {
|
||||
*this = *this+r;
|
||||
return *this;
|
||||
}
|
||||
inline vInteger &operator -=(const vInteger &r) {
|
||||
*this = *this-r;
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend inline void permute(vInteger &y,const vInteger b,int perm)
|
||||
{
|
||||
Gpermute<vInteger>(y,b,perm);
|
||||
}
|
||||
friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gmerge<vInteger,Integer>(y,extracted);
|
||||
}
|
||||
friend inline void extract(const vInteger &y,std::vector<Integer *> &extracted)
|
||||
{
|
||||
Gextract<vInteger,Integer>(y,extracted);
|
||||
}
|
||||
friend inline void merge(vInteger &y,std::vector<Integer> &extracted)
|
||||
{
|
||||
Gmerge<vInteger,Integer>(y,extracted);
|
||||
}
|
||||
friend inline void extract(const vInteger &y,std::vector<Integer> &extracted)
|
||||
{
|
||||
Gextract<vInteger,Integer>(y,extracted);
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
|
||||
};
|
||||
|
||||
inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
|
||||
|
||||
inline void zeroit(vInteger &z){ vzero(z);}
|
||||
|
||||
inline vInteger outerProduct(const vInteger &l, const vInteger& r)
|
||||
{
|
||||
return l*r;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
259
lib/Grid_vRealD.h
Normal file
@ -0,0 +1,259 @@
|
||||
#ifndef VREALD_H
#define VREALD_H

#include "Grid.h"

namespace Grid {
class vRealD {
public:
  dvec v; // dvec is double precision vector

public:
  typedef dvec  vector_type;
  typedef RealD scalar_type;

  vRealD(){};
  vRealD(RealD a){
    vsplat(*this,a);
  };

  friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
  friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
  friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
  friend inline vRealD adj(const vRealD &in) { return in; }
  friend inline vRealD conj(const vRealD &in){ return in; }

  friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4)
    y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
    // accelerates multiply accumulate, but not general multiply add
    y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef AVX512
    // double precision fused multiply-add
    y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef QPX
    y.v = vec_madd(a.v,x.v,y.v);
#endif
  }
  //////////////////////////////////
  // Initialise to 1,0
  //////////////////////////////////
  friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
  friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}

  ////////////////////////////////////
  // Arithmetic operator overloads +,-,*
  ////////////////////////////////////
  friend inline vRealD operator + (vRealD a, vRealD b)
  {
    vRealD ret;
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
    ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
    ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
    ret.v = vec_add(a.v,b.v);
#endif
    return ret;
  };
  friend inline vRealD operator - (vRealD a, vRealD b)
  {
    vRealD ret;
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
    ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
    ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
    ret.v = vec_sub(a.v,b.v);
#endif
    return ret;
  };

  friend inline vRealD operator * (vRealD a, vRealD b)
  {
    vRealD ret;
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_mul_pd(a.v,b.v);
#endif
#ifdef SSE4
    ret.v = _mm_mul_pd(a.v,b.v);
#endif
#ifdef AVX512
    ret.v = _mm512_mul_pd(a.v,b.v);
#endif
#ifdef QPX
    ret.v = vec_mul(a.v,b.v);
#endif
    return ret;
  };

  ////////////////////////////////////////////////////////////////////
  // General permute; assumes vector length is same across
  // all subtypes; may not be a good assumption, but could
  // add the vector width as a template param for BG/Q for example
  ////////////////////////////////////////////////////////////////////
  friend inline void permute(vRealD &y,vRealD b,int perm)
  {
    Gpermute<vRealD>(y,b,perm);
  }
  friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
  {
    Gmerge<vRealD,RealD >(y,extracted);
  }
  friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
  {
    Gextract<vRealD,RealD>(y,extracted);
  }
  friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
  {
    Gmerge<vRealD,RealD >(y,extracted);
  }
  friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
  {
    Gextract<vRealD,RealD>(y,extracted);
  }

  friend inline void vsplat(vRealD &ret,double a){
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_set_pd(a,a,a,a);
#endif
#ifdef SSE4
    ret.v = _mm_set_pd(a,a);
#endif
#ifdef AVX512
    ret.v = _mm512_set1_pd(a);
#endif
#ifdef QPX
    ret.v = {a,a,a,a};
#endif
  }
  friend inline void vset(vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
    ret.v = _mm_set_pd(a[0],a[1]);
#endif
#ifdef AVX512
    ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    // Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
    ret.v = {a[0],a[1],a[2],a[3]};
#endif
  }

  friend inline void vstore(const vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
    _mm256_store_pd(a,ret.v);
#endif
#ifdef SSE4
    _mm_store_pd(a,ret.v);
#endif
#ifdef AVX512
    _mm512_store_pd(a,ret.v);
    // Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
    assert(0);
#endif
  }
  friend inline void vprefetch(const vRealD &v)
  {
    _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
  }
  // Unary negation
  friend inline vRealD operator -(const vRealD &r) {
    vRealD ret;
    vzero(ret);
    ret = ret - r;
    return ret;
  }

  friend inline RealD Reduce(const vRealD & in)
  {
#if defined (AVX1) || defined(AVX2)
    typedef union {
      uint64_t l;
      double   d;
    } my_conv_t;
    my_conv_t converter;
    // more reduce_add
    /*
      __attribute__ ((aligned(32))) double c_[16];
      __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
      __m256d hadd = _mm256_hadd_pd(in.v,tmp);               // hadd = 1+0,3+2,3+2,1+0
      tmp  = _mm256_permute2f128_pd(hadd,hadd,0x01);         // tmp = 3+2,1+0,1+0,3+2
      hadd = _mm256_hadd_pd(tmp,tmp);                        // tmp = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
      _mm256_store_pd(c_,hadd);
      return c_[0];
    */
    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
    __m256d hadd = _mm256_hadd_pd(in.v,tmp);               // hadd = 1+0,3+2,3+2,1+0
    hadd = _mm256_hadd_pd(hadd,hadd);                      // hadd = 1+0+3+2...
    converter.l = _mm256_extract_epi64(hadd,0);
    return converter.d;
#endif
#ifdef AVX512
    return _mm512_reduce_add_pd(in.v);
    /*
      __attribute__ ((aligned(32))) double c_[8];
      _mm512_store_pd(c_,in.v);
      return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
    */
#endif
#ifdef QPX
#endif
  }
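  // Worked example of the horizontal sum above (annotation/sketch): with
  // in = (d3,d2,d1,d0) in high-to-low lane order,
  //   tmp  = permute2f128(in,in,0x01)  -> (d1,d0,d3,d2)
  //   hadd = hadd_pd(in,tmp)           -> (d1+d0, d3+d2, d3+d2, d1+d0)
  //   hadd = hadd_pd(hadd,hadd)        -> every lane holds d0+d1+d2+d3
  // and the bit pattern of lane 0 is then pulled out as a 64-bit integer
  // and reinterpreted as a double through the union above.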

  // *=,+=,-= operators
  inline vRealD &operator *=(const vRealD &r) {
    *this = (*this)*r;
    return *this;
  }
  inline vRealD &operator +=(const vRealD &r) {
    *this = *this+r;
    return *this;
  }
  inline vRealD &operator -=(const vRealD &r) {
    *this = *this-r;
    return *this;
  }

public:
  static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
};

inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);}

inline vRealD outerProduct(const vRealD &l, const vRealD& r)
{
  return l*r;
}
inline vRealD trace(const vRealD &arg){
  return arg;
}
inline vRealD real(const vRealD &arg){
  return arg;
}

}
#endif
279
lib/Grid_vRealF.h
Normal file
@ -0,0 +1,279 @@
#ifndef VREALF_H
#define VREALF_H

#include "Grid.h"

namespace Grid {
class vRealF {
public:
  fvec v;

public:
  typedef fvec  vector_type;
  typedef RealF scalar_type;

  vRealF(){};
  vRealF(RealF a){
    vsplat(*this,a);
  };
  ////////////////////////////////////
  // Arithmetic operator overloads +,-,*
  ////////////////////////////////////
  friend inline vRealF operator + ( vRealF a, vRealF b)
  {
    vRealF ret;
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE4
    ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
    ret.v = _mm512_add_ps(a.v,b.v);
#endif
#ifdef QPX
    vector4double aa,bb,cc;
    aa = vec_lda(0,(float *)&a);
    bb = vec_lda(0,(float *)&b);
    cc = vec_add(aa,bb);
    vec_sta(cc,0,(float *)&ret.v);
#endif
    return ret;
  };

  friend inline vRealF operator - ( vRealF a, vRealF b)
  {
    vRealF ret;
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE4
    ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
    ret.v = _mm512_sub_ps(a.v,b.v);
#endif
#ifdef QPX
    vector4double aa,bb,cc;
    aa = vec_lda(0,(float *)&a);
    bb = vec_lda(0,(float *)&b);
    cc = vec_sub(aa,bb);
    vec_sta(cc,0,(float *)&ret.v);
#endif
    return ret;
  };

  friend inline vRealF operator * ( vRealF a, vRealF b)
  {
    vRealF ret;
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_mul_ps(a.v,b.v);
#endif
#ifdef SSE4
    ret.v = _mm_mul_ps(a.v,b.v);
#endif
#ifdef AVX512
    ret.v = _mm512_mul_ps(a.v,b.v);
#endif
#ifdef QPX
    vector4double aa,bb,cc; // QPX single we are forced to load as this promotes single mem->double regs.
    aa = vec_lda(0,(float *)&a);
    bb = vec_lda(0,(float *)&b);
    cc = vec_mul(aa,bb);
    vec_sta(cc,0,(float *)&ret.v);
#endif
    return ret;
  };

  ///////////////////////////////////////////////
  // mult, sub, add, adj,conj, mac functions
  ///////////////////////////////////////////////
  friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
  friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
  friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
  friend inline vRealF adj(const vRealF &in) { return in; }
  friend inline vRealF conj(const vRealF &in){ return in; }

  friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
#if defined (AVX1) || defined (SSE4)
    y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
    // accelerates multiply accumulate, but not general multiply add
    y.v = _mm256_fmadd_ps(a.v,x.v,y.v);
#endif
#ifdef AVX512
    y.v = _mm512_fmadd_ps(a.v,x.v,y.v);
#endif
#ifdef QPX
    vector4double aa,xx,yy; // QPX single we are forced to load as this promotes single mem->double regs.
    aa = vec_lda(0,(float *)&a.v);
    xx = vec_lda(0,(float *)&x.v);
    yy = vec_lda(0,(float *)&y.v);
    yy = vec_madd(aa,xx,yy);
    vec_sta(yy,0,(float *)&y.v);
#endif
  }
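  // Lane-wise, mac above computes y[i] = a[i]*x[i] + y[i] for each SIMD lane;
  // the AVX2/AVX512 branches fuse this into a single fmadd, while the
  // AVX1/SSE4 branch falls back to a separate multiply and add.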

  //////////////////////////////////
  // Initialise to 1,0
  //////////////////////////////////
  friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
  friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}

  ////////////////////////////////////////////////////////////////////
  // General permute; assumes vector length is same across
  // all subtypes; may not be a good assumption, but could
  // add the vector width as a template param for BG/Q for example
  ////////////////////////////////////////////////////////////////////
  friend inline void permute(vRealF &y,vRealF b,int perm)
  {
    Gpermute<vRealF>(y,b,perm);
  }
  friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
  {
    Gmerge<vRealF,RealF >(y,extracted);
  }
  friend inline void extract(const vRealF &y,std::vector<RealF *> &extracted)
  {
    Gextract<vRealF,RealF>(y,extracted);
  }
  friend inline void merge(vRealF &y,std::vector<RealF> &extracted)
  {
    Gmerge<vRealF,RealF >(y,extracted);
  }
  friend inline void extract(const vRealF &y,std::vector<RealF> &extracted)
  {
    Gextract<vRealF,RealF>(y,extracted);
  }

  /////////////////////////////////////////////////////
  // Broadcast a value across Nsimd copies.
  /////////////////////////////////////////////////////
  friend inline void vsplat(vRealF &ret,float a){
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
#endif
#ifdef SSE4
    ret.v = _mm_set_ps(a,a,a,a);
#endif
#ifdef AVX512
    //ret.v = _mm512_set_ps(a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a);
    ret.v = _mm512_set1_ps(a);
#endif
#ifdef QPX
    ret.v = {a,a,a,a};
#endif
  }

  friend inline void vset(vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
    ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
    ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
#endif
#ifdef AVX512
    ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
                           a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    // Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
    ret.v = {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
#endif
  }

  ////////////////////////////////////////////////////////////////////////
  // FIXME: gonna remove these load/store, get, set, prefetch
  ////////////////////////////////////////////////////////////////////////
  friend inline void vstore(const vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
    _mm256_store_ps(a,ret.v);
#endif
#ifdef SSE4
    _mm_store_ps(a,ret.v);
#endif
#ifdef AVX512
    _mm512_store_ps(a,ret.v);
    // Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
    assert(0);
#endif
  }

  friend inline void vprefetch(const vRealF &v)
  {
    _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
  }
  // Unary negation
  friend inline vRealF operator -(const vRealF &r) {
    vRealF ret;
    vzero(ret);
    ret = ret - r;
    return ret;
  }
  friend inline RealF Reduce(const vRealF & in)
  {
#if defined (AVX1) || defined(AVX2)
    __attribute__ ((aligned(32))) float c_[16];
    __m256 tmp  = _mm256_permute2f128_ps(in.v,in.v,0x01);
    __m256 hadd = _mm256_hadd_ps(in.v,tmp);
    tmp  = _mm256_permute2f128_ps(hadd,hadd,0x01);
    hadd = _mm256_hadd_ps(tmp,tmp);
    _mm256_store_ps(c_,hadd);
    return (float)c_[0];
#endif
#ifdef AVX512
    return _mm512_reduce_add_ps(in.v);
    /*
      __attribute__ ((aligned(64))) float c_[16];
      _mm512_store_ps(c_,in.v);
      return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
            +c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
    */
#endif
#ifdef QPX
#endif
  }

  // *=,+=,-= operators
  inline vRealF &operator *=(const vRealF &r) {
    *this = (*this)*r;
    return *this;
  }
  inline vRealF &operator +=(const vRealF &r) {
    *this = *this+r;
    return *this;
  }
  inline vRealF &operator -=(const vRealF &r) {
    *this = *this-r;
    return *this;
  }
public:
  static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}

inline vRealF outerProduct(const vRealF &l, const vRealF& r)
{
  return l*r;
}
inline vRealF trace(const vRealF &arg){
  return arg;
}
inline vRealF real(const vRealF &arg){
  return arg;
}

}
#endif
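A hedged sketch of the extract/merge round trip provided above (names are illustrative; it assumes the destination vector is pre-sized to Nsimd() and that the scalar overloads shown above fill it lane by lane):

// lanes_demo.cc -- sketch, not part of the library
#include "Grid.h"
using namespace Grid;

void lanes_demo(void){
  vRealF v(2.0);                               // broadcast 2.0 to every lane
  std::vector<RealF> lanes(vRealF::Nsimd());
  extract(v,lanes);                            // lanes[i] now holds the i-th SIMD lane
  for(int i=0;i<vRealF::Nsimd();i++) lanes[i] += i;
  merge(v,lanes);                              // repack the modified scalars
  RealF total = Reduce(v);                     // horizontal sum across all lanes
}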
44
lib/Makefile.am
Normal file
@ -0,0 +1,44 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/

extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_communicator_mpi.cc
extra_sources+=Grid_stencil_common.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_communicator_fake.cc
extra_sources+=Grid_stencil_common.cc
endif

#
# Libraries
#
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)

#
# Include files
#
include_HEADERS = Grid_config.h\
	Grid.h\
	Grid_simd.h\
	Grid_vComplexD.h\
	Grid_vComplexF.h\
	Grid_vRealD.h\
	Grid_vRealF.h\
	Grid_Cartesian.h\
	Grid_Lattice.h\
	Grid_Communicator.h\
	Grid_QCD.h\
	Grid_aligned_allocator.h\
	Grid_cshift.h\
	Grid_cshift_common.h\
	Grid_cshift_mpi.h\
	Grid_cshift_none.h\
	Grid_stencil.h\
	Grid_math_types.h
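To close the loop on the build layout, a hedged sketch of a small program that would compile against the installed headers and link against libGrid.a. Only Grid_init, Grid_finalize, usecond and the vRealF interface declared earlier are assumed; the file name and compile line are illustrative.

// main_demo.cc -- sketch; e.g. g++ main_demo.cc -lGrid (plus the usual include/library paths)
#include <Grid.h>
using namespace Grid;

int main(int argc,char **argv)
{
  Grid_init(&argc,&argv);

  double t0 = usecond();
  vRealF a(1.0), b(2.0), c;
  c = a*b + b;                      // lane-wise single precision arithmetic
  RealF total = Reduce(c);          // expect 4.0 * vRealF::Nsimd()
  double t1 = usecond();

  std::cout << "sum " << total << " in " << (t1-t0) << " usec" << std::endl;

  Grid_finalize();
  return 0;
}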