mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Bringing in LatticeInteger with the idea of implemented predicated
assignment, subsets etc. c.f the QDP++ "where" syntax
This commit is contained in:
parent
d5eee231e0
commit
e06a11ee5e
281
Grid_Cartesian.h
281
Grid_Cartesian.h
@ -6,126 +6,236 @@
|
|||||||
namespace Grid{
|
namespace Grid{
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Grid Support. Following will go into Grid.h.
|
// Grid Support.
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Cartesian grids
|
//
|
||||||
// Grid::Grid
|
// Cartesian grid inheritance
|
||||||
// Grid::GridCartesian
|
// Grid::GridBase
|
||||||
// Grid::GridCartesianRedBlack
|
// |
|
||||||
|
// __________|___________
|
||||||
|
// | |
|
||||||
|
// Grid::GridCartesian Grid::GridCartesianRedBlack
|
||||||
|
//
|
||||||
|
// TODO: document the following as an API guaranteed public interface
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Rough map of functionality against QDP++ Layout
|
||||||
|
*
|
||||||
|
* Param | Grid | QDP++
|
||||||
|
* -----------------------------------------
|
||||||
|
* | |
|
||||||
|
* void | oSites, iSites, lSites | sitesOnNode
|
||||||
|
* void | gSites | vol
|
||||||
|
* | |
|
||||||
|
* gcoor | oIndex, iIndex | linearSiteIndex // no virtual node in QDP
|
||||||
|
* lcoor | |
|
||||||
|
*
|
||||||
|
* void | CheckerBoarded | - // No checkerboarded in QDP
|
||||||
|
* void | FullDimensions | lattSize
|
||||||
|
* void | GlobalDimensions | lattSize // No checkerboarded in QDP
|
||||||
|
* void | LocalDimensions | subgridLattSize
|
||||||
|
* void | VirtualLocalDimensions | subgridLattSize // no virtual node in QDP
|
||||||
|
* | |
|
||||||
|
* int x 3 | oiSiteRankToGlobal | siteCoords
|
||||||
|
* | ProcessorCoorLocalCoorToGlobalCoor |
|
||||||
|
* | |
|
||||||
|
* vector<int> | GlobalCoorToRankIndex | nodeNumber(coord)
|
||||||
|
* vector<int> | GlobalCoorToProcessorCoorLocalCoor| nodeCoord(coord)
|
||||||
|
* | |
|
||||||
|
* void | Processors | logicalSize // returns cart array shape
|
||||||
|
* void | ThisRank | nodeNumber(); // returns this node rank
|
||||||
|
* void | ThisProcessorCoor | // returns this node coor
|
||||||
|
* void | isBoss(void) | primaryNode();
|
||||||
|
* | |
|
||||||
|
* | RankFromProcessorCoor | getLogicalCoorFrom(node)
|
||||||
|
* | ProcessorCoorFromRank | getNodeNumberFrom(logical_coord)
|
||||||
|
*/
|
||||||
|
|
||||||
class SimdGrid : public CartesianCommunicator {
|
class GridBase : public CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
// Give Lattice access
|
||||||
|
template<class object> friend class Lattice;
|
||||||
|
|
||||||
SimdGrid(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
||||||
|
|
||||||
// Give Lattice access
|
|
||||||
template<class object> friend class Lattice;
|
|
||||||
|
|
||||||
//protected:
|
//protected:
|
||||||
|
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
||||||
|
// and one generator per site.
|
||||||
|
// std::default_random_engine generator;
|
||||||
|
// static std::mt19937 generator( 9 );
|
||||||
|
|
||||||
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
//////////////////////////////////////////////////////////////////////
|
||||||
// and one generator per site.
|
// Commicator provides information on the processor grid
|
||||||
//std::default_random_engine generator;
|
//////////////////////////////////////////////////////////////////////
|
||||||
// static std::mt19937 generator( 9 );
|
|
||||||
|
|
||||||
// Grid information.
|
|
||||||
|
|
||||||
// Commicator provides
|
|
||||||
// unsigned long _ndimension;
|
// unsigned long _ndimension;
|
||||||
// std::vector<int> _processors; // processor grid
|
// std::vector<int> _processors; // processor grid
|
||||||
// int _processor; // linear processor rank
|
// int _processor; // linear processor rank
|
||||||
// std::vector<int> _processor_coor; // linear processor rank
|
// std::vector<int> _processor_coor; // linear processor rank
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Physics Grid information.
|
||||||
std::vector<int> _simd_layout; // Which dimensions get relayed out over simd lanes.
|
std::vector<int> _simd_layout; // Which dimensions get relayed out over simd lanes.
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
|
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
|
||||||
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
|
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
|
||||||
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
|
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
|
||||||
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
|
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
|
||||||
|
std::vector<int> _ostride; // Outer stride for each dimension
|
||||||
|
std::vector<int> _istride; // Inner stride i.e. within simd lane
|
||||||
|
int _osites; // _isites*_osites = product(dimensions).
|
||||||
|
int _isites;
|
||||||
|
std::vector<int> _slice_block; // subslice information
|
||||||
|
std::vector<int> _slice_stride;
|
||||||
|
std::vector<int> _slice_nblock;
|
||||||
|
|
||||||
|
// Might need these at some point
|
||||||
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
|
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
|
||||||
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> _ostride; // Outer stride for each dimension
|
|
||||||
std::vector<int> _istride; // Inner stride i.e. within simd lane
|
|
||||||
|
|
||||||
int _osites; // _isites*_osites = product(dimensions).
|
|
||||||
int _isites;
|
|
||||||
|
|
||||||
// subslice information
|
|
||||||
std::vector<int> _slice_block;
|
|
||||||
std::vector<int> _slice_stride;
|
|
||||||
std::vector<int> _slice_nblock;
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// These routines are key. Subdivide the linearised cartesian index into
|
////////////////////////////////////////////////////////////////
|
||||||
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
|
// Checkerboarding interface is virtual and overridden by
|
||||||
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
|
// GridCartesian / GridRedBlackCartesian
|
||||||
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
|
////////////////////////////////////////////////////////////////
|
||||||
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
|
virtual int CheckerBoarded(int dim)=0;
|
||||||
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
|
virtual int CheckerBoard(std::vector<int> site)=0;
|
||||||
// lanes are operated upon simultaneously.
|
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
|
||||||
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
||||||
inline int oIndexReduced(std::vector<int> &rcoor)
|
inline int CheckerBoardFromOindex (int Oindex){
|
||||||
{
|
|
||||||
int idx=0;
|
|
||||||
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*rcoor[d];
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
virtual int oIndex(std::vector<int> &coor)
|
|
||||||
{
|
|
||||||
int idx=0;
|
|
||||||
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
inline int iIndex(std::vector<int> &rcoor)
|
|
||||||
{
|
|
||||||
int idx=0;
|
|
||||||
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
inline int iCoordFromIsite(int lane,int mu)
|
|
||||||
{
|
|
||||||
std::vector<int> coor(_ndimension);
|
|
||||||
for(int d=0;d<_ndimension;d++){
|
|
||||||
coor[d] = lane % _simd_layout[d];
|
|
||||||
lane = lane / _simd_layout[d];
|
|
||||||
}
|
|
||||||
return coor[mu];
|
|
||||||
}
|
|
||||||
|
|
||||||
inline int oSites(void) { return _osites; };
|
|
||||||
inline int iSites(void) { return _isites; };
|
|
||||||
|
|
||||||
inline int CheckerBoardFromOsite (int Osite){
|
|
||||||
std::vector<int> ocoor;
|
std::vector<int> ocoor;
|
||||||
CoordFromOsite(ocoor,Osite);
|
oCoorFromOindex(ocoor,Oindex);
|
||||||
int ss=0;
|
int ss=0;
|
||||||
for(int d=0;d<_ndimension;d++){
|
for(int d=0;d<_ndimension;d++){
|
||||||
ss=ss+ocoor[d];
|
ss=ss+ocoor[d];
|
||||||
}
|
}
|
||||||
return ss&0x1;
|
return ss&0x1;
|
||||||
}
|
}
|
||||||
inline void CoordFromOsite (std::vector<int>& coor,int Osite){
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Local layout calculations
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// These routines are key. Subdivide the linearised cartesian index into
|
||||||
|
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
|
||||||
|
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
|
||||||
|
//
|
||||||
|
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
|
||||||
|
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
|
||||||
|
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
|
||||||
|
// lanes are operated upon simultaneously.
|
||||||
|
|
||||||
|
virtual int oIndex(std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
int idx=0;
|
||||||
|
// Works with either global or local coordinates
|
||||||
|
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
inline int oIndexReduced(std::vector<int> &ocoor)
|
||||||
|
{
|
||||||
|
int idx=0;
|
||||||
|
// ocoor is already reduced so can eliminate the modulo operation
|
||||||
|
// for fast indexing and inline the routine
|
||||||
|
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
|
||||||
coor.resize(_ndimension);
|
coor.resize(_ndimension);
|
||||||
for(int d=0;d<_ndimension;d++){
|
for(int d=0;d<_ndimension;d++){
|
||||||
coor[d] = Osite % _rdimensions[d];
|
coor[d] = Oindex % _rdimensions[d];
|
||||||
Osite = Osite / _rdimensions[d];
|
Oindex = Oindex / _rdimensions[d];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual int CheckerBoarded(int dim)=0;
|
//////////////////////////////////////////////////////////
|
||||||
virtual int CheckerBoard(std::vector<int> site)=0;
|
// SIMD lane addressing
|
||||||
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
|
//////////////////////////////////////////////////////////
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
inline int iIndex(std::vector<int> &lcoor)
|
||||||
|
{
|
||||||
|
int idx=0;
|
||||||
|
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
|
||||||
|
{
|
||||||
|
coor.resize(_ndimension);
|
||||||
|
for(int d=0;d<_ndimension;d++){
|
||||||
|
coor[d] = lane % _simd_layout[d];
|
||||||
|
lane = lane / _simd_layout[d];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Array sizing queries
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
inline int iSites(void) { return _isites; };
|
||||||
|
inline int Nsimd(void) { return _isites; };// Synonymous with iSites
|
||||||
|
inline int oSites(void) { return _osites; };
|
||||||
|
inline int lSites(void) { return _isites*_osites; };
|
||||||
|
inline int gSites(void) { return _isites*_osites*_Nprocessors; };
|
||||||
|
inline int Nd (void) { return _ndimension;};
|
||||||
|
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
|
||||||
|
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
|
||||||
|
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
|
||||||
|
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Global addressing
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
|
||||||
|
{
|
||||||
|
gcoor.resize(_ndimension);
|
||||||
|
std::vector<int> coor(_ndimension);
|
||||||
|
|
||||||
|
ProcessorCoorFromRank(rank,coor);
|
||||||
|
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]&coor[mu];
|
||||||
|
|
||||||
|
iCoorFromIindex(coor,i_idx);
|
||||||
|
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]&coor[mu];
|
||||||
|
|
||||||
|
oCoorFromOindex (coor,o_idx);
|
||||||
|
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
|
||||||
|
|
||||||
|
}
|
||||||
|
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
|
||||||
|
{
|
||||||
|
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
|
||||||
|
if(CheckerBoarded(0)){
|
||||||
|
fcoor[0] = fcoor[0]*2+cb;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
|
||||||
|
{
|
||||||
|
gcoor.resize(_ndimension);
|
||||||
|
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
|
||||||
|
}
|
||||||
|
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
|
||||||
|
{
|
||||||
|
pcoor.resize(_ndimension);
|
||||||
|
lcoor.resize(_ndimension);
|
||||||
|
for(int mu=0;mu<_ndimension;mu++){
|
||||||
|
pcoor[mu] = gcoor[mu]/_ldimensions[mu];
|
||||||
|
lcoor[mu] = gcoor[mu]%_ldimensions[mu];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
|
||||||
|
{
|
||||||
|
std::vector<int> pcoor;
|
||||||
|
std::vector<int> lcoor;
|
||||||
|
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
|
||||||
|
rank = RankFromProcessorCoor(pcoor);
|
||||||
|
i_idx= iIndex(lcoor);
|
||||||
|
o_idx= oIndex(lcoor);
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class GridCartesian: public SimdGrid {
|
class GridCartesian: public GridBase {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -141,7 +251,7 @@ public:
|
|||||||
GridCartesian(std::vector<int> &dimensions,
|
GridCartesian(std::vector<int> &dimensions,
|
||||||
std::vector<int> &simd_layout,
|
std::vector<int> &simd_layout,
|
||||||
std::vector<int> &processor_grid
|
std::vector<int> &processor_grid
|
||||||
) : SimdGrid(processor_grid)
|
) : GridBase(processor_grid)
|
||||||
{
|
{
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Grid information
|
// Grid information
|
||||||
@ -201,13 +311,11 @@ public:
|
|||||||
block = block*_rdimensions[d];
|
block = block*_rdimensions[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
assert( _isites == vComplex::Nsimd());
|
|
||||||
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// Specialise this for red black grids storing half the data like a chess board.
|
// Specialise this for red black grids storing half the data like a chess board.
|
||||||
class GridRedBlackCartesian : public SimdGrid
|
class GridRedBlackCartesian : public GridBase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim){
|
||||||
@ -230,7 +338,7 @@ public:
|
|||||||
|
|
||||||
// Probably faster with table lookup;
|
// Probably faster with table lookup;
|
||||||
// or by looping over x,y,z and multiply rather than computing checkerboard.
|
// or by looping over x,y,z and multiply rather than computing checkerboard.
|
||||||
int ocb=CheckerBoardFromOsite(osite);
|
int ocb=CheckerBoardFromOindex(osite);
|
||||||
|
|
||||||
if ( (source_cb+ocb)&1 ) {
|
if ( (source_cb+ocb)&1 ) {
|
||||||
return (shift)/2;
|
return (shift)/2;
|
||||||
@ -248,7 +356,7 @@ public:
|
|||||||
};
|
};
|
||||||
GridRedBlackCartesian(std::vector<int> &dimensions,
|
GridRedBlackCartesian(std::vector<int> &dimensions,
|
||||||
std::vector<int> &simd_layout,
|
std::vector<int> &simd_layout,
|
||||||
std::vector<int> &processor_grid) : SimdGrid(processor_grid)
|
std::vector<int> &processor_grid) : GridBase(processor_grid)
|
||||||
{
|
{
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Grid information
|
// Grid information
|
||||||
@ -310,7 +418,6 @@ public:
|
|||||||
block = block*_rdimensions[d];
|
block = block*_rdimensions[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
assert ( _isites == vComplex::Nsimd());
|
|
||||||
};
|
};
|
||||||
protected:
|
protected:
|
||||||
virtual int oIndex(std::vector<int> &coor)
|
virtual int oIndex(std::vector<int> &coor)
|
||||||
|
@ -22,23 +22,63 @@ class CartesianCommunicator {
|
|||||||
MPI_Comm communicator;
|
MPI_Comm communicator;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Constructor
|
||||||
CartesianCommunicator(std::vector<int> &pdimensions_in);
|
CartesianCommunicator(std::vector<int> &pdimensions_in);
|
||||||
|
|
||||||
|
// Wraps MPI_Cart routines
|
||||||
void ShiftedRanks(int dim,int shift,int & source, int & dest);
|
void ShiftedRanks(int dim,int shift,int & source, int & dest);
|
||||||
int Rank(std::vector<int> coor);
|
int RankFromProcessorCoor(std::vector<int> &coor);
|
||||||
int MyRank(void);
|
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
|
||||||
void GlobalSumF(float &);
|
|
||||||
void GlobalSumFVector(float *,int N);
|
|
||||||
|
|
||||||
void GlobalSumF(double &);
|
/////////////////////////////////
|
||||||
void GlobalSumFVector(double *,int N);
|
// Grid information queries
|
||||||
|
/////////////////////////////////
|
||||||
|
int IsBoss(void) { return _processor==0; };
|
||||||
|
int ThisRank(void) { return _processor; };
|
||||||
|
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
|
||||||
|
const std::vector<int> & ProcessorGrid(void) { return _processors; };
|
||||||
|
int ProcessorCount(void) { return _Nprocessors; };
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Reduction
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
void GlobalSum(float &);
|
||||||
|
void GlobalSumVector(float *,int N);
|
||||||
|
|
||||||
|
void GlobalSum(double &);
|
||||||
|
void GlobalSumVector(double *,int N);
|
||||||
|
|
||||||
|
template<class obj> void GlobalSumObj(obj &o){
|
||||||
|
|
||||||
|
typedef typename obj::scalar_type scalar_type;
|
||||||
|
int words = sizeof(obj)/sizeof(scalar_type);
|
||||||
|
|
||||||
|
scalar_type * ptr = (scalar_type *)& o;
|
||||||
|
GlobalSum(ptr,words);
|
||||||
|
}
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Face exchange
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
void SendToRecvFrom(void *xmit,
|
void SendToRecvFrom(void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
void *recv,
|
void *recv,
|
||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Barrier
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
void Barrier(void);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Broadcast a buffer and composite larger
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
void Broadcast(int root,void* data, int bytes);
|
||||||
|
template<class obj> void Broadcast(int root,obj &data)
|
||||||
|
{
|
||||||
|
Broadcast(root,(void *)&data,sizeof(data));
|
||||||
|
};
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ template<class vobj>
|
|||||||
class Lattice
|
class Lattice
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SimdGrid *_grid;
|
GridBase *_grid;
|
||||||
int checkerboard;
|
int checkerboard;
|
||||||
std::vector<vobj,alignedAllocator<vobj> > _odata;
|
std::vector<vobj,alignedAllocator<vobj> > _odata;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
@ -28,7 +28,7 @@ public:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
Lattice(SimdGrid *grid) : _grid(grid) {
|
Lattice(GridBase *grid) : _grid(grid) {
|
||||||
_odata.reserve(_grid->oSites());
|
_odata.reserve(_grid->oSites());
|
||||||
assert((((uint64_t)&_odata[0])&0xF) ==0);
|
assert((((uint64_t)&_odata[0])&0xF) ==0);
|
||||||
checkerboard=0;
|
checkerboard=0;
|
||||||
@ -95,56 +95,67 @@ public:
|
|||||||
template<class sobj>
|
template<class sobj>
|
||||||
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
||||||
|
|
||||||
typedef typename vobj::scalar_type stype;
|
GridBase *grid=l._grid;
|
||||||
typedef typename vobj::vector_type vtype;
|
|
||||||
|
|
||||||
assert( l.checkerboard == l._grid->CheckerBoard(site));
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
int o_index = l._grid->oIndex(site);
|
int Nsimd = grid->Nsimd();
|
||||||
int i_index = l._grid->iIndex(site);
|
|
||||||
|
|
||||||
stype *v_ptr = (stype *)&l._odata[o_index];
|
assert( l.checkerboard== l._grid->CheckerBoard(site));
|
||||||
stype *s_ptr = (stype *)&s;
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
v_ptr = v_ptr + 2*i_index;
|
|
||||||
|
int rank,odx,idx;
|
||||||
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
|
// Optional to broadcast from node 0.
|
||||||
|
grid->Broadcast(0,s);
|
||||||
|
|
||||||
|
std::vector<sobj> buf(Nsimd);
|
||||||
|
std::vector<scalar_type *> pointers(Nsimd);
|
||||||
|
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
|
||||||
|
|
||||||
|
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||||
|
extract(l._odata[odx],pointers);
|
||||||
|
|
||||||
|
buf[idx] = s;
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
|
||||||
|
merge(l._odata[odx],pointers);
|
||||||
|
|
||||||
for(int i=0;i<sizeof(sobj);i+=2*sizeof(stype)){
|
|
||||||
v_ptr[0] = s_ptr[0];
|
|
||||||
v_ptr[1] = s_ptr[1];
|
|
||||||
v_ptr+=2*vtype::Nsimd();
|
|
||||||
s_ptr+=2;
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// Peek a scalar object from the SIMD array
|
// Peek a scalar object from the SIMD array
|
||||||
template<class sobj>
|
template<class sobj>
|
||||||
friend void peekSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
|
friend void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
||||||
|
|
||||||
typedef typename vobj::scalar_type stype;
|
GridBase *grid=l._grid;
|
||||||
typedef typename vobj::vector_type vtype;
|
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.checkerboard== l._grid->CheckerBoard(site));
|
assert( l.checkerboard== l._grid->CheckerBoard(site));
|
||||||
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
int o_index = l._grid->oIndex(site);
|
int rank,odx,idx;
|
||||||
int i_index = l._grid->iIndex(site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
std::vector<sobj> buf(Nsimd);
|
||||||
|
std::vector<scalar_type *> pointers(Nsimd);
|
||||||
|
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
|
||||||
|
|
||||||
stype *v_ptr = (stype *)&l._odata[o_index];
|
extract(l._odata[odx],pointers);
|
||||||
stype *s_ptr = (stype *)&s;
|
|
||||||
v_ptr = v_ptr + 2*i_index;
|
s = buf[idx];
|
||||||
|
grid->Broadcast(rank,s);
|
||||||
|
|
||||||
for(int i=0;i<sizeof(sobj);i+=2*sizeof(stype)){
|
|
||||||
s_ptr[0] = v_ptr[0];
|
|
||||||
s_ptr[1] = v_ptr[1];
|
|
||||||
v_ptr+=2*vtype::Nsimd();
|
|
||||||
s_ptr+=2;
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Randomise
|
// FIXME Randomise; deprecate this
|
||||||
friend void random(Lattice<vobj> &l){
|
friend void random(Lattice<vobj> &l){
|
||||||
|
|
||||||
Real *v_ptr = (Real *)&l._odata[0];
|
Real *v_ptr = (Real *)&l._odata[0];
|
||||||
size_t v_len = l._grid->oSites()*sizeof(vobj);
|
size_t v_len = l._grid->oSites()*sizeof(vobj);
|
||||||
size_t d_len = v_len/sizeof(Real);
|
size_t d_len = v_len/sizeof(Real);
|
||||||
@ -155,8 +166,8 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// FIXME for debug; deprecate this
|
||||||
friend void lex_sites(Lattice<vobj> &l){
|
friend void lex_sites(Lattice<vobj> &l){
|
||||||
|
|
||||||
Real *v_ptr = (Real *)&l._odata[0];
|
Real *v_ptr = (Real *)&l._odata[0];
|
||||||
size_t o_len = l._grid->oSites();
|
size_t o_len = l._grid->oSites();
|
||||||
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
||||||
@ -183,7 +194,6 @@ public:
|
|||||||
for(int i=0;i<d_len;i++){
|
for(int i=0;i<d_len;i++){
|
||||||
v_ptr[i]= drand48();
|
v_ptr[i]= drand48();
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -241,6 +251,7 @@ public:
|
|||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
|
inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs._grid);
|
Lattice<vobj> ret(lhs._grid);
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
@ -259,7 +270,7 @@ public:
|
|||||||
std::vector<int> coor;
|
std::vector<int> coor;
|
||||||
int cbos;
|
int cbos;
|
||||||
|
|
||||||
full._grid->CoordFromOsite(coor,ss);
|
full._grid->oCoorFromOindex(coor,ss);
|
||||||
cbos=half._grid->CheckerBoard(coor);
|
cbos=half._grid->CheckerBoard(coor);
|
||||||
|
|
||||||
if (cbos==cb) {
|
if (cbos==cb) {
|
||||||
@ -277,7 +288,7 @@ public:
|
|||||||
std::vector<int> coor;
|
std::vector<int> coor;
|
||||||
int cbos;
|
int cbos;
|
||||||
|
|
||||||
full._grid->CoordFromOsite(coor,ss);
|
full._grid->oCoorFromOindex(coor,ss);
|
||||||
cbos=half._grid->CheckerBoard(coor);
|
cbos=half._grid->CheckerBoard(coor);
|
||||||
|
|
||||||
if (cbos==cb) {
|
if (cbos==cb) {
|
||||||
@ -351,7 +362,6 @@ public:
|
|||||||
template<class left,class right>
|
template<class left,class right>
|
||||||
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
|
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
|
||||||
{
|
{
|
||||||
std::cerr <<"Oscalar * Lattice calling mult"<<std::endl;
|
|
||||||
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
|
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
@ -383,7 +393,6 @@ public:
|
|||||||
template<class left,class right>
|
template<class left,class right>
|
||||||
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
|
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
|
||||||
{
|
{
|
||||||
std::cerr <<"Lattice * Oscalar calling mult"<<std::endl;
|
|
||||||
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
|
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||||
|
14
Grid_QCD.h
14
Grid_QCD.h
@ -21,7 +21,13 @@ namespace QCD {
|
|||||||
|
|
||||||
typedef iSinglet<Complex > TComplex; // This is painful. Tensor singlet complex type.
|
typedef iSinglet<Complex > TComplex; // This is painful. Tensor singlet complex type.
|
||||||
typedef iSinglet<vComplex > vTComplex;
|
typedef iSinglet<vComplex > vTComplex;
|
||||||
typedef iSinglet<Real > TReal; // This is painful. Tensor singlet complex type.
|
typedef iSinglet<Real > TReal; // This is painful. Tensor singlet complex type.
|
||||||
|
|
||||||
|
|
||||||
|
typedef iSinglet<vIntegerF > vTIntegerF;
|
||||||
|
typedef iSinglet<vIntegerD > vTIntegerD;
|
||||||
|
typedef iSinglet<vIntegerC > vTIntegerC;
|
||||||
|
typedef iSinglet<vIntegerZ > vTIntegerZ;
|
||||||
|
|
||||||
typedef iSpinMatrix<Complex > SpinMatrix;
|
typedef iSpinMatrix<Complex > SpinMatrix;
|
||||||
typedef iColourMatrix<Complex > ColourMatrix;
|
typedef iColourMatrix<Complex > ColourMatrix;
|
||||||
@ -40,9 +46,13 @@ namespace QCD {
|
|||||||
typedef iColourVector<vComplex > vColourVector;
|
typedef iColourVector<vComplex > vColourVector;
|
||||||
typedef iSpinColourVector<vComplex > vSpinColourVector;
|
typedef iSpinColourVector<vComplex > vSpinColourVector;
|
||||||
|
|
||||||
|
|
||||||
typedef Lattice<vTComplex> LatticeComplex;
|
typedef Lattice<vTComplex> LatticeComplex;
|
||||||
|
|
||||||
|
typedef Lattice<vTIntegerF> LatticeIntegerF; // Predicates for "where"
|
||||||
|
typedef Lattice<vTIntegerD> LatticeIntegerD;
|
||||||
|
typedef Lattice<vTIntegerC> LatticeIntegerC;
|
||||||
|
typedef Lattice<vTIntegerZ> LatticeIntegerZ;
|
||||||
|
|
||||||
typedef Lattice<vColourMatrix> LatticeColourMatrix;
|
typedef Lattice<vColourMatrix> LatticeColourMatrix;
|
||||||
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
|
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
|
||||||
typedef Lattice<vSpinColourMatrix> LatticePropagator;
|
typedef Lattice<vSpinColourMatrix> LatticePropagator;
|
||||||
|
@ -45,7 +45,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
|||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
buffer[bo]=rhs._odata[so+o+b];
|
buffer[bo]=rhs._odata[so+o+b];
|
||||||
bo++;
|
bo++;
|
||||||
@ -90,7 +90,7 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
|
|||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
extract(rhs._odata[so+o+b],pointers);
|
extract(rhs._odata[so+o+b],pointers);
|
||||||
}
|
}
|
||||||
@ -135,7 +135,7 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
|
|||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
rhs._odata[so+o+b]=buffer[bo++];
|
rhs._odata[so+o+b]=buffer[bo++];
|
||||||
}
|
}
|
||||||
@ -179,7 +179,7 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
|
|||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
merge(rhs._odata[so+o+b],pointers);
|
merge(rhs._odata[so+o+b],pointers);
|
||||||
}
|
}
|
||||||
@ -224,7 +224,7 @@ friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int
|
|||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||||
@ -267,7 +267,7 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
|
|||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||||
|
@ -7,10 +7,12 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
|||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
const int Nsimd = vector_type::Nsimd();
|
|
||||||
|
|
||||||
Lattice<vobj> ret(rhs._grid);
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
GridBase *grid=rhs._grid;
|
||||||
|
const int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
int fd = rhs._grid->_fdimensions[dimension];
|
int fd = rhs._grid->_fdimensions[dimension];
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
//int ld = rhs._grid->_ldimensions[dimension];
|
//int ld = rhs._grid->_ldimensions[dimension];
|
||||||
@ -43,7 +45,7 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
|||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,7 +81,7 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
|||||||
o +=rhs._grid->_slice_stride[dimension];
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -73,7 +73,7 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
SimdGrid *grid=rhs._grid;
|
GridBase *grid=rhs._grid;
|
||||||
Lattice<vobj> temp(rhs._grid);
|
Lattice<vobj> temp(rhs._grid);
|
||||||
|
|
||||||
int fd = rhs._grid->_fdimensions[dimension];
|
int fd = rhs._grid->_fdimensions[dimension];
|
||||||
@ -130,8 +130,8 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
|
|
||||||
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
{
|
{
|
||||||
const int Nsimd = vector_type::Nsimd();
|
GridBase *grid=rhs._grid;
|
||||||
SimdGrid *grid=rhs._grid;
|
const int Nsimd = grid->Nsimd();
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
@ -173,6 +173,7 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
|
|
||||||
std::vector<int> comm_offnode(simd_layout);
|
std::vector<int> comm_offnode(simd_layout);
|
||||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||||
|
std::vector<int> icoor(grid->Nd());
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
@ -197,7 +198,10 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
int s = grid->iCoordFromIsite(i,dimension);
|
|
||||||
|
int s;
|
||||||
|
grid->iCoorFromIindex(icoor,i);
|
||||||
|
s = icoor[dimension];
|
||||||
|
|
||||||
if(comm_offnode[s]){
|
if(comm_offnode[s]){
|
||||||
|
|
||||||
@ -232,7 +236,7 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = 1-wrap;
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
if ( permute_slice ) {
|
if ( permute_slice ) {
|
||||||
pointers[i] = rpointers[permute_map[permute_type][i]];
|
pointers[i] = rpointers[permute_map[permute_type][i]];
|
||||||
} else {
|
} else {
|
||||||
|
51
Grid_main.cc
51
Grid_main.cc
@ -19,9 +19,9 @@ int main (int argc, char ** argv)
|
|||||||
std::vector<int> simd_layout(4);
|
std::vector<int> simd_layout(4);
|
||||||
|
|
||||||
std::vector<int> mpi_layout(4);
|
std::vector<int> mpi_layout(4);
|
||||||
mpi_layout[0]=4;
|
mpi_layout[0]=2;
|
||||||
mpi_layout[1]=2;
|
mpi_layout[1]=1;
|
||||||
mpi_layout[2]=4;
|
mpi_layout[2]=1;
|
||||||
mpi_layout[3]=2;
|
mpi_layout[3]=2;
|
||||||
|
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -34,7 +34,7 @@ int main (int argc, char ** argv)
|
|||||||
omp_set_num_threads(omp);
|
omp_set_num_threads(omp);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for(int lat=16;lat<=16;lat+=40){
|
for(int lat=8;lat<=16;lat+=40){
|
||||||
latt_size[0] = lat;
|
latt_size[0] = lat;
|
||||||
latt_size[1] = lat;
|
latt_size[1] = lat;
|
||||||
latt_size[2] = lat;
|
latt_size[2] = lat;
|
||||||
@ -166,6 +166,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
|
|
||||||
// Lattice SU(3) x SU(3)
|
// Lattice SU(3) x SU(3)
|
||||||
|
Fine.Barrier();
|
||||||
FooBar = Foo * Bar;
|
FooBar = Foo * Bar;
|
||||||
|
|
||||||
// Lattice 12x12 GEMM
|
// Lattice 12x12 GEMM
|
||||||
@ -179,37 +180,47 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
flops = ncall*1.0*volume*(8*Nc*Nc*Nc);
|
flops = ncall*1.0*volume*(8*Nc*Nc*Nc);
|
||||||
bytes = ncall*1.0*volume*Nc*Nc *2*3*sizeof(Grid::Real);
|
bytes = ncall*1.0*volume*Nc*Nc *2*3*sizeof(Grid::Real);
|
||||||
printf("%f flop and %f bytes\n",flops,bytes/ncall);
|
if ( Fine.IsBoss() ) {
|
||||||
|
printf("%f flop and %f bytes\n",flops,bytes/ncall);
|
||||||
|
}
|
||||||
FooBar = Foo * Bar;
|
FooBar = Foo * Bar;
|
||||||
|
Fine.Barrier();
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
mult(FooBar,Foo,Bar); // this is better
|
Fine.Barrier();
|
||||||
|
mult(FooBar,Foo,Bar); // this is better
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
|
Fine.Barrier();
|
||||||
|
if ( Fine.IsBoss() ) {
|
||||||
#ifdef OMP
|
#ifdef OMP
|
||||||
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
|
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
|
||||||
#endif
|
#endif
|
||||||
printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
||||||
printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
||||||
|
}
|
||||||
mult(FooBar,Foo,Bar);
|
mult(FooBar,Foo,Bar);
|
||||||
FooBar = Foo * Bar;
|
FooBar = Foo * Bar;
|
||||||
|
|
||||||
bytes = ncall*1.0*volume*Nc*Nc *2*5*sizeof(Grid::Real);
|
bytes = ncall*1.0*volume*Nc*Nc *2*5*sizeof(Grid::Real);
|
||||||
|
Fine.Barrier();
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
mult(FooBar,Foo,Cshift(Bar,1,-2));
|
Fine.Barrier();
|
||||||
|
mult(FooBar,Foo,Cshift(Bar,1,-1));
|
||||||
//mult(FooBar,Foo,Bar);
|
//mult(FooBar,Foo,Bar);
|
||||||
//FooBar = Foo * Bar; // this is bad
|
//FooBar = Foo * Bar; // this is bad
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
|
Fine.Barrier();
|
||||||
|
|
||||||
FooBar = Foo * Bar;
|
FooBar = Foo * Bar;
|
||||||
|
|
||||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
|
if ( Fine.IsBoss() ) {
|
||||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
printf("Cshift Mult: NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
|
||||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
||||||
|
printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
||||||
|
}
|
||||||
// pickCheckerboard(0,rFoo,FooBar);
|
// pickCheckerboard(0,rFoo,FooBar);
|
||||||
// pickCheckerboard(1,bFoo,FooBar);
|
// pickCheckerboard(1,bFoo,FooBar);
|
||||||
// setCheckerboard(FooBar,rFoo);
|
// setCheckerboard(FooBar,rFoo);
|
||||||
@ -225,12 +236,12 @@ int main (int argc, char ** argv)
|
|||||||
pickCheckerboard(0,rFoo,Foo); // Pick out red or black checkerboards
|
pickCheckerboard(0,rFoo,Foo); // Pick out red or black checkerboards
|
||||||
pickCheckerboard(1,bFoo,Foo);
|
pickCheckerboard(1,bFoo,Foo);
|
||||||
|
|
||||||
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
|
if ( Fine.IsBoss() ) {
|
||||||
|
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
|
||||||
|
}
|
||||||
Shifted = Cshift(Foo,dir,shift); // Shift everything
|
Shifted = Cshift(Foo,dir,shift); // Shift everything
|
||||||
|
|
||||||
std::cout << "Shifting even source parities to odd result"<<std::endl;
|
|
||||||
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
|
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
|
||||||
std::cout << "Shifting odd parities to even result"<<std::endl;
|
|
||||||
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
|
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
|
||||||
|
|
||||||
ShiftedCheck=zero;
|
ShiftedCheck=zero;
|
||||||
@ -332,8 +343,10 @@ int main (int argc, char ** argv)
|
|||||||
nrm = nrm + real(conj(diff)*diff);
|
nrm = nrm + real(conj(diff)*diff);
|
||||||
}}
|
}}
|
||||||
}}}}
|
}}}}
|
||||||
|
if( Fine.IsBoss() ){
|
||||||
|
std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
|
||||||
|
}
|
||||||
}}
|
}}
|
||||||
std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
|
|
||||||
|
|
||||||
} // loop for lat
|
} // loop for lat
|
||||||
} // loop for omp
|
} // loop for omp
|
||||||
|
@ -66,7 +66,7 @@ namespace Grid {
|
|||||||
template<class vtype> class iScalar
|
template<class vtype> class iScalar
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SIMDalign vtype _internal;
|
vtype _internal;
|
||||||
|
|
||||||
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||||
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||||
@ -113,7 +113,7 @@ public:
|
|||||||
template<class vtype,int N> class iVector
|
template<class vtype,int N> class iVector
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SIMDalign vtype _internal[N];
|
vtype _internal[N];
|
||||||
|
|
||||||
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||||
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||||
@ -170,7 +170,7 @@ public:
|
|||||||
template<class vtype,int N> class iMatrix
|
template<class vtype,int N> class iMatrix
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SIMDalign vtype _internal[N][N];
|
vtype _internal[N][N];
|
||||||
|
|
||||||
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||||
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||||
@ -908,21 +908,5 @@ inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._inte
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
/////////////////////////////////////////////////////////////////////////
|
|
||||||
// Generic routine to promote object<complex> -> object<vcomplex>
|
|
||||||
// Supports the array reordering transformation that gives me SIMD utilisation
|
|
||||||
/////////////////////////////////////////////////////////////////////////
|
|
||||||
/*
|
|
||||||
template<template<class> class object>
|
|
||||||
inline object<vComplex> splat(object<Complex >s){
|
|
||||||
object<vComplex> ret;
|
|
||||||
vComplex * v_ptr = (vComplex *)& ret;
|
|
||||||
Complex * s_ptr = (Complex *) &s;
|
|
||||||
for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
|
|
||||||
vsplat(*(v_ptr++),*(s_ptr++));
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
29
Grid_mpi.cc
29
Grid_mpi.cc
@ -28,18 +28,18 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
|||||||
assert(Size==_Nprocessors);
|
assert(Size==_Nprocessors);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::GlobalSumF(float &f){
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSumFVector(float *f,int N)
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
{
|
{
|
||||||
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSumF(double &d)
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
{
|
{
|
||||||
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSumFVector(double *d,int N)
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
{
|
{
|
||||||
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
}
|
}
|
||||||
@ -48,12 +48,17 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
|
|||||||
{
|
{
|
||||||
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||||
}
|
}
|
||||||
int CartesianCommunicator::Rank(std::vector<int> coor)
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||||
{
|
{
|
||||||
int rank;
|
int rank;
|
||||||
MPI_Cart_rank (communicator, &coor[0], &rank);
|
MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||||
return rank;
|
return rank;
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
coor.resize(_ndimension);
|
||||||
|
MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||||
|
}
|
||||||
|
|
||||||
// Basic Halo comms primitive
|
// Basic Halo comms primitive
|
||||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
@ -71,5 +76,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Barrier(void)
|
||||||
|
{
|
||||||
|
MPI_Barrier(communicator);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
119
Grid_simd.h
119
Grid_simd.h
@ -37,6 +37,29 @@
|
|||||||
// Fourier transform equivalent.
|
// Fourier transform equivalent.
|
||||||
//
|
//
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// SIMD Alignment controls
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
|
||||||
|
#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
|
||||||
|
#else
|
||||||
|
#define ALIGN_DIRECTIVE(A) __declspec(align(A))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef SSE2
|
||||||
|
#include <pmmintrin.h>
|
||||||
|
#define SIMDalign ALIGN_DIRECTIVE(16)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(AVX1) || defined (AVX2)
|
||||||
|
#include <immintrin.h>
|
||||||
|
#define SIMDalign ALIGN_DIRECTIVE(32)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
#include <immintrin.h>
|
||||||
|
#define SIMDalign ALIGN_DIRECTIVE(64)
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
@ -49,6 +72,8 @@ namespace Grid {
|
|||||||
typedef std::complex<Real> Complex;
|
typedef std::complex<Real> Complex;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
inline RealF adj(const RealF & r){ return r; }
|
inline RealF adj(const RealF & r){ return r; }
|
||||||
inline RealF conj(const RealF & r){ return r; }
|
inline RealF conj(const RealF & r){ return r; }
|
||||||
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||||
@ -97,47 +122,27 @@ namespace Grid {
|
|||||||
template<> inline void ZeroIt(RealD &arg){ arg=0; };
|
template<> inline void ZeroIt(RealD &arg){ arg=0; };
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
// SIMD Alignment controls
|
|
||||||
////////////////////////////////////////////////////////////
|
|
||||||
#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
|
|
||||||
#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
|
|
||||||
#else
|
|
||||||
#define ALIGN_DIRECTIVE(A) __declspec(align(A))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef SSE2
|
|
||||||
#include <pmmintrin.h>
|
|
||||||
#define SIMDalign ALIGN_DIRECTIVE(16)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(AVX1) || defined (AVX2)
|
|
||||||
#include <immintrin.h>
|
|
||||||
#define SIMDalign ALIGN_DIRECTIVE(32)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef AVX512
|
|
||||||
#include <immintrin.h>
|
|
||||||
#define SIMDalign ALIGN_DIRECTIVE(64)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined (SSE2)
|
#if defined (SSE2)
|
||||||
typedef __m128 fvec;
|
typedef __m128 fvec;
|
||||||
typedef __m128d dvec;
|
typedef __m128d dvec;
|
||||||
typedef __m128 cvec;
|
typedef __m128 cvec;
|
||||||
typedef __m128d zvec;
|
typedef __m128d zvec;
|
||||||
|
typedef __m128i ivec;
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX1) || defined (AVX2)
|
#if defined (AVX1) || defined (AVX2)
|
||||||
typedef __m256 fvec;
|
typedef __m256 fvec;
|
||||||
typedef __m256d dvec;
|
typedef __m256d dvec;
|
||||||
typedef __m256 cvec;
|
typedef __m256 cvec;
|
||||||
typedef __m256d zvec;
|
typedef __m256d zvec;
|
||||||
|
typedef __m256i ivec;
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX512)
|
#if defined (AVX512)
|
||||||
typedef __m512 fvec;
|
typedef __m512 fvec;
|
||||||
typedef __m512d dvec;
|
typedef __m512d dvec;
|
||||||
typedef __m512 cvec;
|
typedef __m512 cvec;
|
||||||
typedef __m512d zvec;
|
typedef __m512d zvec;
|
||||||
|
typedef __m512i ivec;
|
||||||
#endif
|
#endif
|
||||||
#if defined (QPX)
|
#if defined (QPX)
|
||||||
typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
|
typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
|
||||||
@ -159,10 +164,76 @@ namespace Grid {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Generic extract/merge/permute
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
template<class vsimd,class scalar,int Nsimd>
|
||||||
|
inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
scalar buf[Nsimd];
|
||||||
|
vstore(y,buf);
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
*extracted[i] = buf[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class vsimd,class scalar,int Nsimd>
|
||||||
|
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
|
||||||
|
scalar buf[Nsimd];
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
buf[i]=*extracted[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
vset(y,buf);
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////
|
||||||
|
// Permute
|
||||||
|
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||||
|
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
||||||
|
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
||||||
|
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||||
|
// Permute 4 possible on half precision @512bit vectors.
|
||||||
|
//////////////////////////////////////////////////////////
|
||||||
|
// Should be able to make the permute/extract/merge independent of the
|
||||||
|
// vector subtype and reduce the volume of code.
|
||||||
|
template<class vsimd>
|
||||||
|
inline void Gpermute(vsimd &y,vsimd b,int perm){
|
||||||
|
switch (perm){
|
||||||
|
#if defined(AVX1)||defined(AVX2)
|
||||||
|
// 8x32 bits=>3 permutes
|
||||||
|
case 2: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
||||||
|
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
|
||||||
|
case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
|
||||||
|
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
// 16 floats=> permutes
|
||||||
|
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
|
||||||
|
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
|
||||||
|
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
|
||||||
|
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
|
||||||
|
case 3: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
|
||||||
|
case 2: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
|
||||||
|
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||||
|
case 0: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
#error not implemented
|
||||||
|
#endif
|
||||||
|
default: assert(0); break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#include <Grid_vRealF.h>
|
#include <Grid_vRealF.h>
|
||||||
#include <Grid_vRealD.h>
|
#include <Grid_vRealD.h>
|
||||||
#include <Grid_vComplexF.h>
|
#include <Grid_vComplexF.h>
|
||||||
#include <Grid_vComplexD.h>
|
#include <Grid_vComplexD.h>
|
||||||
|
#include <Grid_vInteger.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
307
Grid_vInteger.h
Normal file
307
Grid_vInteger.h
Normal file
@ -0,0 +1,307 @@
|
|||||||
|
#ifndef VINTEGER_H
|
||||||
|
#define VINTEGER_H
|
||||||
|
|
||||||
|
#include "Grid.h"
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
|
||||||
|
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
|
||||||
|
|
||||||
|
typedef uint32_t Integer;
|
||||||
|
|
||||||
|
class vInteger {
|
||||||
|
protected:
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
ivec v;
|
||||||
|
|
||||||
|
typedef ivec vector_type;
|
||||||
|
typedef Integer scalar_type;
|
||||||
|
|
||||||
|
vInteger(){};
|
||||||
|
////////////////////////////////////
|
||||||
|
// Arithmetic operator overloads +,-,*
|
||||||
|
////////////////////////////////////
|
||||||
|
friend inline vInteger operator + ( vInteger a, vInteger b)
|
||||||
|
{
|
||||||
|
vInteger ret;
|
||||||
|
#if defined (AVX1)
|
||||||
|
__m128i a0,a1;
|
||||||
|
__m128i b0,b1;
|
||||||
|
a0 = _mm256_extractf128_si256(a.v,0);
|
||||||
|
b0 = _mm256_extractf128_si256(b.v,0);
|
||||||
|
a1 = _mm256_extractf128_si256(a.v,1);
|
||||||
|
b1 = _mm256_extractf128_si256(b.v,1);
|
||||||
|
a0 = _mm_add_epi32(a0,b0);
|
||||||
|
a1 = _mm_add_epi32(a1,b1);
|
||||||
|
ret.v = _mm256_set_m128i(a1,a0);
|
||||||
|
#endif
|
||||||
|
#if defined (AVX2)
|
||||||
|
ret.v = _mm256_add_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
ret.v = _mm_add_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
ret.v = _mm512_add_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
// Implement as array of ints is only option
|
||||||
|
#error
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
friend inline vInteger operator - ( vInteger a, vInteger b)
|
||||||
|
{
|
||||||
|
vInteger ret;
|
||||||
|
#if defined (AVX1)
|
||||||
|
__m128i a0,a1;
|
||||||
|
__m128i b0,b1;
|
||||||
|
a0 = _mm256_extractf128_si256(a.v,0);
|
||||||
|
b0 = _mm256_extractf128_si256(b.v,0);
|
||||||
|
a1 = _mm256_extractf128_si256(a.v,1);
|
||||||
|
b1 = _mm256_extractf128_si256(b.v,1);
|
||||||
|
a0 = _mm_sub_epi32(a0,b0);
|
||||||
|
a1 = _mm_sub_epi32(a1,b1);
|
||||||
|
ret.v = _mm256_set_m128i(a1,a0);
|
||||||
|
#endif
|
||||||
|
#if defined (AVX2)
|
||||||
|
ret.v = _mm256_sub_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
ret.v = _mm_sub_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
ret.v = _mm512_sub_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
// Implement as array of ints is only option
|
||||||
|
#error
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
friend inline vInteger operator * ( vInteger a, vInteger b)
|
||||||
|
{
|
||||||
|
vInteger ret;
|
||||||
|
#if defined (AVX1)
|
||||||
|
__m128i a0,a1;
|
||||||
|
__m128i b0,b1;
|
||||||
|
a0 = _mm256_extractf128_si256(a.v,0);
|
||||||
|
b0 = _mm256_extractf128_si256(b.v,0);
|
||||||
|
a1 = _mm256_extractf128_si256(a.v,1);
|
||||||
|
b1 = _mm256_extractf128_si256(b.v,1);
|
||||||
|
a0 = _mm_mul_epi32(a0,b0);
|
||||||
|
a1 = _mm_mul_epi32(a1,b1);
|
||||||
|
ret.v = _mm256_set_m128i(a1,a0);
|
||||||
|
#endif
|
||||||
|
#if defined (AVX2)
|
||||||
|
ret.v = _mm256_mul_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
ret.v = _mm_mul_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
ret.v = _mm512_mul_epi32(a.v,b.v);
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
// Implement as array of ints is only option
|
||||||
|
#error
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// mult, sub, add, adj,conj, mac functions
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
|
||||||
|
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
|
||||||
|
friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);}
|
||||||
|
friend inline void mac (vInteger &y,const vInteger a,const vInteger x){
|
||||||
|
y = a*x+y;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////
|
||||||
|
// Initialise to 1,0,i
|
||||||
|
//////////////////////////////////
|
||||||
|
friend inline void vone (vInteger &ret){vsplat(ret,1);}
|
||||||
|
friend inline void vzero(vInteger &ret){vsplat(ret,0);}
|
||||||
|
friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);}
|
||||||
|
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
// Broadcast a value across Nsimd copies.
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
friend inline void vsplat(vInteger &ret,scalar_type a){
|
||||||
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
|
ret.v = _mm256_set1_epi32(a);
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
ret.v = _mm_set1_epi32(a);
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
ret.v = _mm512_set1_epi32(a);
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
#error
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
friend inline void vset(vInteger &ret,scalar_type *a){
|
||||||
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
|
ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]);
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
|
||||||
|
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
#error
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
friend inline void vstore(vInteger &ret, Integer *a){
|
||||||
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
|
_mm256_store_si256((__m256i*)a,ret.v);
|
||||||
|
#endif
|
||||||
|
#ifdef SSE2
|
||||||
|
_mm_store_si128(a,ret.v);
|
||||||
|
#endif
|
||||||
|
#ifdef AVX512
|
||||||
|
_mm512_store_si512(a,ret.v);
|
||||||
|
#endif
|
||||||
|
#ifdef QPX
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
friend inline void vprefetch(const vInteger &v)
|
||||||
|
{
|
||||||
|
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||||
|
}
|
||||||
|
// Unary negation
|
||||||
|
friend inline vInteger operator -(const vInteger &r) {
|
||||||
|
vInteger ret;
|
||||||
|
vzero(ret);
|
||||||
|
ret = ret - r;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
friend inline Integer Reduce(const vInteger & in)
|
||||||
|
{
|
||||||
|
// unimplemented
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
// *=,+=,-= operators
|
||||||
|
inline vInteger &operator *=(const vInteger &r) {
|
||||||
|
*this = (*this)*r;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
inline vInteger &operator +=(const vInteger &r) {
|
||||||
|
*this = *this+r;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
inline vInteger &operator -=(const vInteger &r) {
|
||||||
|
*this = *this-r;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
public:
|
||||||
|
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
||||||
|
};
|
||||||
|
|
||||||
|
inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
|
||||||
|
|
||||||
|
inline void zeroit(vInteger &z){ vzero(z);}
|
||||||
|
|
||||||
|
inline vInteger outerProduct(const vInteger &l, const vInteger& r)
|
||||||
|
{
|
||||||
|
return l*r;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class vIntegerF : public vInteger
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);}
|
||||||
|
|
||||||
|
friend inline void permute(vIntegerF &y,vIntegerF b,int perm)
|
||||||
|
{
|
||||||
|
Gpermute<vIntegerF>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
class vIntegerD : public vInteger
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);}
|
||||||
|
|
||||||
|
friend inline void permute(vIntegerD &y,vIntegerD b,int perm)
|
||||||
|
{
|
||||||
|
Gpermute<vIntegerD>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vIntegerD &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vIntegerD &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
class vIntegerC : public vInteger
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);}
|
||||||
|
|
||||||
|
friend inline void permute(vIntegerC &y,vIntegerC b,int perm)
|
||||||
|
{
|
||||||
|
Gpermute<vIntegerC>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vIntegerC &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vIntegerC &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class vIntegerZ : public vInteger
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);}
|
||||||
|
|
||||||
|
friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm)
|
||||||
|
{
|
||||||
|
Gpermute<vIntegerZ>(y,b,perm);
|
||||||
|
}
|
||||||
|
friend inline void merge(vIntegerZ &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gmerge<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
|
||||||
|
}
|
||||||
|
friend inline void extract(vIntegerZ &y,std::vector<Integer *> &extracted)
|
||||||
|
{
|
||||||
|
Gextract<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
3
TODO
3
TODO
@ -1,8 +1,9 @@
|
|||||||
* Make peekSite, pokeSite work in an MPI context.
|
* FIXME audit
|
||||||
|
|
||||||
* Conditional execution Subset, where etc...
|
* Conditional execution Subset, where etc...
|
||||||
* Coordinate information, integers etc...
|
* Coordinate information, integers etc...
|
||||||
* Integer type padding/union to vector.
|
* Integer type padding/union to vector.
|
||||||
|
* LatticeCoordinate[mu]
|
||||||
|
|
||||||
* Optimise the extract/merge SIMD routines
|
* Optimise the extract/merge SIMD routines
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user