1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Fixing the Checkerboarding cshift.

Implemented "fake" communications in preparation for the leap to MPI.
This commit is contained in:
Peter Boyle 2015-03-29 20:35:37 +01:00
parent 82eeb9d07e
commit 196fd203e2
23 changed files with 1976 additions and 343 deletions

View File

@ -0,0 +1,239 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 46;
objects = {
/* Begin PBXBuildFile section */
0BF5DA461AC370840045EA34 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0BF5DA451AC370840045EA34 /* main.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
0BF5DA401AC370840045EA34 /* CopyFiles */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = /usr/share/man/man1/;
dstSubfolderSpec = 0;
files = (
);
runOnlyForDeploymentPostprocessing = 1;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
0BF5DA421AC370840045EA34 /* Grid */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Grid; sourceTree = BUILT_PRODUCTS_DIR; };
0BF5DA451AC370840045EA34 /* main.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
0BF5DA3F1AC370840045EA34 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
0BF5DA391AC370840045EA34 = {
isa = PBXGroup;
children = (
0BF5DA441AC370840045EA34 /* Grid */,
0BF5DA431AC370840045EA34 /* Products */,
);
sourceTree = "<group>";
};
0BF5DA431AC370840045EA34 /* Products */ = {
isa = PBXGroup;
children = (
0BF5DA421AC370840045EA34 /* Grid */,
);
name = Products;
sourceTree = "<group>";
};
0BF5DA441AC370840045EA34 /* Grid */ = {
isa = PBXGroup;
children = (
0BF5DA451AC370840045EA34 /* main.cpp */,
);
path = Grid;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
0BF5DA411AC370840045EA34 /* Grid */ = {
isa = PBXNativeTarget;
buildConfigurationList = 0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */;
buildPhases = (
0BF5DA3E1AC370840045EA34 /* Sources */,
0BF5DA3F1AC370840045EA34 /* Frameworks */,
0BF5DA401AC370840045EA34 /* CopyFiles */,
);
buildRules = (
);
dependencies = (
);
name = Grid;
productName = Grid;
productReference = 0BF5DA421AC370840045EA34 /* Grid */;
productType = "com.apple.product-type.tool";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
0BF5DA3A1AC370840045EA34 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0610;
ORGANIZATIONNAME = "University of Edinburgh";
TargetAttributes = {
0BF5DA411AC370840045EA34 = {
CreatedOnToolsVersion = 6.1.1;
};
};
};
buildConfigurationList = 0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */;
compatibilityVersion = "Xcode 3.2";
developmentRegion = English;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = 0BF5DA391AC370840045EA34;
productRefGroup = 0BF5DA431AC370840045EA34 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
0BF5DA411AC370840045EA34 /* Grid */,
);
};
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
0BF5DA3E1AC370840045EA34 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
0BF5DA461AC370840045EA34 /* main.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
0BF5DA471AC370840045EA34 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_DYNAMIC_NO_PIC = NO;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_SYMBOLS_PRIVATE_EXTERN = NO;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.10;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = macosx;
};
name = Debug;
};
0BF5DA481AC370840045EA34 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = YES;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.10;
MTL_ENABLE_DEBUG_INFO = NO;
SDKROOT = macosx;
};
name = Release;
};
0BF5DA4A1AC370840045EA34 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
0BF5DA4B1AC370840045EA34 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */ = {
isa = XCConfigurationList;
buildConfigurations = (
0BF5DA471AC370840045EA34 /* Debug */,
0BF5DA481AC370840045EA34 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */ = {
isa = XCConfigurationList;
buildConfigurations = (
0BF5DA4A1AC370840045EA34 /* Debug */,
0BF5DA4B1AC370840045EA34 /* Release */,
);
defaultConfigurationIsVisible = 0;
};
/* End XCConfigurationList section */
};
rootObject = 0BF5DA3A1AC370840045EA34 /* Project object */;
}

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:Grid.xcodeproj">
</FileRef>
</Workspace>

View File

@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
LastUpgradeVersion = "0610"
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
</BuildAction>
<TestAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
shouldUseLaunchSchemeArgsEnv = "YES"
buildConfiguration = "Debug">
<Testables>
</Testables>
</TestAction>
<LaunchAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
buildConfiguration = "Debug"
ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
allowLocationSimulation = "YES">
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
<ProfileAction
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
buildConfiguration = "Release"
debugDocumentVersioning = "YES">
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>Grid.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>0</integer>
</dict>
</dict>
<key>SuppressBuildableAutocreation</key>
<dict>
<key>0BF5DA411AC370840045EA34</key>
<dict>
<key>primary</key>
<true/>
</dict>
</dict>
</dict>
</plist>

View File

@ -1,10 +1,9 @@
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H
#include "Grid.h"
#include <Grid.h>
#include <Grid_Communicator.h>
namespace dpo{
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support. Following will go into Grid.h.
@ -13,28 +12,47 @@ namespace dpo{
// dpo::Grid
// dpo::GridCartesian
// dpo::GridCartesianRedBlack
class Grid {
class SimdGrid : public CartesianCommunicator {
public:
SimdGrid(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
// Give Lattice access
template<class object> friend class Lattice;
//protected:
// Lattice wide random support. not yet fully implemented. Need seed strategy
// and one generator per site.
//std::default_random_engine generator;
// Lattice wide random support. not yet fully implemented. Need seed strategy
// and one generator per site.
//std::default_random_engine generator;
// static std::mt19937 generator( 9 );
// Grid information.
unsigned long _ndimension;
std::vector<int> _layout; // Which dimensions get relayed out over simd lanes.
std::vector<int> _dimensions; // Dimensions of array
std::vector<int> _rdimensions;// Reduced dimensions with simd lane images removed
// Commicator provides
// unsigned long _ndimension;
// std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor rank
std::vector<int> _simd_layout; // Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// Global dimensions of array with cb removed
std::vector<int> _gdimensions;// Global dimensions of array
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
std::vector<int> _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
@ -70,15 +88,33 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
return idx;
}
inline int oSites(void) { return _osites; };
inline int iSites(void) { return _isites; };
inline int CheckerboardFromOsite (int Osite){
std::vector<int> ocoor;
CoordFromOsite(ocoor,Osite);
int ss=0;
for(int d=0;d<_ndimension;d++){
ss=ss+ocoor[d];
}
return ss&0x1;
}
inline void CoordFromOsite (std::vector<int>& coor,int Osite){
coor.resize(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = Osite % _rdimensions[d];
Osite = Osite / _rdimensions[d];
}
}
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
};
class GridCartesian: public Grid {
class GridCartesian: public SimdGrid {
public:
virtual int CheckerBoard(std::vector<int> site){
return 0;
@ -86,19 +122,24 @@ public:
virtual int CheckerBoardDestination(int cb,int shift){
return 0;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift){
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
GridCartesian(std::vector<int> &dimensions,std::vector<int> layout)
GridCartesian(std::vector<int> &dimensions,
std::vector<int> &simd_layout,
std::vector<int> &processor_grid
) : SimdGrid(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_dimensions.resize(_ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_layout.resize(_ndimension);
_simd_layout.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
@ -106,21 +147,26 @@ public:
_osites = 1;
_isites = 1;
for(int d=0;d<_ndimension;d++){
_dimensions[d] = dimensions[d];
_layout[d] = layout[d];
// Use a reduced simd grid
_rdimensions[d]= _dimensions[d]/_layout[d]; //<-- _layout[d] is zero
_osites *= _rdimensions[d];
_isites *= _layout[d];
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
//FIXME check for exact division
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_layout[d-1];
}
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}
///////////////////////
@ -148,46 +194,57 @@ public:
}
};
};
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public Grid
class GridRedBlackCartesian : public SimdGrid
{
public:
virtual int CheckerBoard(std::vector<int> site){
return site[0]&0x1;
return (site[0]+site[1]+site[2]+site[3])&0x1;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift){
if ( dim == 0 ){
int fulldim =2*_dimensions[0];
shift = (shift+fulldim)%fulldim;
if ( source_cb ) {
// Shift 0,1 -> 0
return (shift+1)/2;
} else {
// Shift 0->0, 1 -> 1, 2->1
return (shift)/2;
}
} else {
return shift;
}
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != 0) return shift;
int fulldim =_fdimensions[0];
shift = (shift+fulldim)%fulldim;
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
int ocb=CheckerboardFromOsite(osite);
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
virtual int CheckerBoardDestination(int source_cb,int shift){
if ((shift+2*_dimensions[0])&0x1) {
if ((shift+_fdimensions[0])&0x1) {
return 1-source_cb;
} else {
return source_cb;
}
};
GridRedBlackCartesian(std::vector<int> &dimensions,std::vector<int> layout)
GridRedBlackCartesian(std::vector<int> &dimensions,
std::vector<int> &simd_layout,
std::vector<int> &processor_grid) : SimdGrid(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_dimensions.resize(_ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_layout.resize(_ndimension);
_simd_layout.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
@ -195,14 +252,17 @@ public:
_osites = 1;
_isites = 1;
for(int d=0;d<_ndimension;d++){
_dimensions[d] = dimensions[d];
_dimensions[0] = dimensions[0]/2;
_layout[d] = layout[d];
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
_ldimensions[d] = _gdimensions[d]/_processors[d];
// Use a reduced simd grid
_rdimensions[d]= _dimensions[d]/_layout[d];
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
_osites *= _rdimensions[d];
_isites *= _layout[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
@ -210,7 +270,7 @@ public:
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_layout[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}

39
Grid_Communicator.h Normal file
View File

@ -0,0 +1,39 @@
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
///////////////////////////////////
// Processor layout information
///////////////////////////////////
namespace dpo {
class CartesianCommunicator {
public:
// Communicator should know nothing of the physics grid, only processor grid.
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#ifdef GRID_COMMS_MPI
MPI_Comm communicator;
#endif
CartesianCommunicator(std::vector<int> &pdimensions_in);
int Rank(std::vector<int> coor);
void GlobalSumF(float &);
void GlobalSumFVector(float *,int N);
void GlobalSumF(double &);
void GlobalSumFVector(double *,int N);
void SendToRecvFrom(void *xmit,
std::vector<int> to_coordinate,
void *recv,
std::vector<int> from_coordinate,
int bytes);
};
}
#endif

View File

@ -3,19 +3,31 @@
#include "Grid.h"
namespace dpo {
// Permute the pointers 32bitx16 = 512
static int permute_map[4][16] = {
{ 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
{ 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
{ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
{ 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
};
template<class vobj>
class Lattice
{
public:
Grid *_grid;
SimdGrid *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
public:
Lattice(Grid *grid) : _grid(grid) {
Lattice(SimdGrid *grid) : _grid(grid) {
_odata.reserve(_grid->oSites());
if ( ((uint64_t)&_odata[0])&0xF) {
exit(-1);
@ -23,6 +35,19 @@ public:
checkerboard=0;
}
#ifdef GRID_COMMS_NONE
#include <Grid_none_cshift.h>
#endif
#ifdef GRID_COMMS_FAKE
#include <Grid_fake_cshift.h>
#endif
#ifdef GRID_COMMS_MPI
#include <Grid_mpi_cshift.h>
#endif
// overloading dpo::conformable but no conformable in dpo ...?:w
template<class obj1,class obj2>
@ -69,212 +94,6 @@ public:
return ret;
};
#if 0
// Collapse doesn't appear to work the way I think it should in icpc
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int sx,so,o;
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over perp slices.
// Threading considerations:
// Need to map thread_num to
//
// x_min,x_max for Loop-A
// n_min,n_max for Loop-B
// b_min,b_max for Loop-C
// In a way that maximally load balances.
//
// Optimal:
// There are rd*n_block*block items of work.
// These serialise as item "w"
// b=w%block; w=w/block
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
//
// Logic:
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
//
// rd=5 , threads = 8;
// 0 1 2 3 4 5 6 7
// 0 0 0 1 1 1 1 1
for(int x=0;x<rd;x++){ // Loop A
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension];
so =sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
#if 0
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
vComplex *optr = (vComplex *)&ret._odata[o];
vComplex *iptr = (vComplex *)&rhs._odata[so];
for(int b=0;b<num;b++){
permute(optr[b],iptr[b],permute_type);
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#else
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<num;b++){
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
permute(optr[b],iptr[b],permute_type);
}
}
} else {
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
int oo = o+ n*rhs._grid->_slice_stride[dimension];
int soo=so+ n*rhs._grid->_slice_stride[dimension];
ret._odata[oo+i]=rhs._odata[soo+i];
}
}
}
#endif
}
return ret;
}
#else
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over all work
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
#pragma omp parallel for
for(int ww=0;ww<work;ww++){
// can optimise this if know w moves congtiguously for a given thread.
// b=(b+1);
// if (b==_slice_block) {b=0; n=n+1;}
// if (n==_slice_nblock) { n=0; x=x+1}
//
// Perhaps a five cycle iterator, or so.
int w=ww;
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
int x = w;
int sx,so,o;
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
vComplex *optr = (vComplex *)&ret._odata[o+b];
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
const char *pf = (const char *)iptr;
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
for(int i=0;i<internal;i++){
permute(optr[i],iptr[i],permute_type);
}
} else {
const char *pf = (const char *) &rhs._odata[so+b];
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
ret._odata[o+b]=rhs._odata[so+b];
}
}
return ret;
}
#endif
template<class sobj>
inline Lattice<vobj> & operator = (const sobj & r){
#pragma omp parallel for
@ -288,14 +107,15 @@ public:
template<class sobj>
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
if ( l._grid.checkerboard != l._grid->Checkerboard(site)){
printf("Poking wrong checkerboard\n");
exit(EXIT_FAILURE);
if ( l.checkerboard != l._grid->CheckerBoard(site)){
printf("Poking wrong checkerboard\n");
exit(EXIT_FAILURE);
}
int o_index = l._grid->oIndex(site);
int i_index = l._grid->iIndex(site);
// BUGGY. This assumes complex real
Real *v_ptr = (Real *)&l._odata[o_index];
Real *s_ptr = (Real *)&s;
v_ptr = v_ptr + 2*i_index;
@ -348,6 +168,22 @@ public:
}
};
friend void lex_sites(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t o_len = l._grid->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
};
friend void gaussian(Lattice<vobj> &l){
// Zero mean, unit variance.
std::normal_distribution<double> distribution(0.0,1.0);
@ -427,22 +263,41 @@ public:
return ret;
};
// remove and insert a half checkerboard
friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.checkerboard = cb;
int ssh=0;
#pragma omp parallel for
for(int ss=0;ss<half._grid->oSites();ss++){
half._odata[ss] = full._odata[ss*2+cb];
}
half.checkerboard = cb;
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
full._grid->CoordFromOsite(coor,ss);
cbos=half._grid->CheckerBoard(coor);
if (cbos==cb) {
half._odata[ssh] = full._odata[ss];
ssh++;
}
}
}
friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.checkerboard;
int cb = half.checkerboard;
int ssh=0;
#pragma omp parallel for
for(int ss=0;ss<half._grid->oSites();ss++){
full._odata[ss*2+cb]=half._odata[ss];
}
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
full._grid->CoordFromOsite(coor,ss);
cbos=half._grid->CheckerBoard(coor);
if (cbos==cb) {
full._odata[ss]=half._odata[ssh];
ssh++;
}
}
}
}; // class Lattice

View File

@ -5,7 +5,6 @@ namespace QCD {
static const int Nc=3;
static const int Ns=4;
static const int CbRed =0;
static const int CbBlack=1;

View File

@ -10,6 +10,15 @@
/* AVX512 */
/* #undef AVX512 */
/* GRID_COMMS_FAKE */
#define GRID_COMMS_FAKE 1
/* GRID_COMMS_MPI */
/* #undef GRID_COMMS_MPI */
/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */
/* Define to 1 if you have the `gettimeofday' function. */
#define HAVE_GETTIMEOFDAY 1
@ -61,6 +70,9 @@
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "grid"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.0"

34
Grid_fake.cc Normal file
View File

@ -0,0 +1,34 @@
#include "Grid.h"
namespace dpo {
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = _processors.size();
_processor_coor.resize(_ndimension);
_processors = processors;
// Require 1^N processor grid for fake
for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);
_processor = 0;// I am the one. The only one..
for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
}
void CartesianCommunicator::GlobalSumF(float &){}
void CartesianCommunicator::GlobalSumFVector(float *,int N){}
void CartesianCommunicator::GlobalSumF(double &){}
void CartesianCommunicator::GlobalSumFVector(double *,int N){}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
std::vector<int> to_coordinate,
void *recv,
std::vector<int> from_coordinate,
int bytes)
{
exit(-1);
bcopy(xmit,recv,bytes);
}
}

246
Grid_fake_cshift.h Normal file
View File

@ -0,0 +1,246 @@
#ifndef _GRID_FAKE_H_
#define _GRID_FAKE_H_
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
const int Nsimd = vector_type::Nsimd();
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
//int ld = rhs._grid->_ldimensions[dimension];
//int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
///////////////////////////////////////////////
// Move via a fake comms buffer
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<scalar_type *> pointers(Nsimd);
for(int x=0;x<rd;x++){
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
}
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
if ( permute_dim ) {
int o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
extract(rhs._odata[so+o+b],pointers);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
}
o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
merge(ret._odata[ro+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int co; // comm offset
int o;
co=0; o=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
comm_buf[co++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
// Step through a copy into a comms buffer and pull back in.
// Genuine fake implementation could calculate if loops back
co=0; o=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
ret._odata[ro+o+b]=comm_buf[co++];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
return ret;
}
/*
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_ldimensions[dimension];
int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift.
shift = (shift+gd)%gd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
// loop over all work
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
// Packed gather sequence is clean
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
typedef typename vobj::scalar_type scalar_t;
typedef typename vobj::vector_type vector_t;
const int ns=sizeof(vobj)/sizeof(scalar_t);
const int nv=sizeof(vobj)/sizeof(vector_t);
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
for(int x=0;x<rd;x++){
int sx = (x+shift)%rd;
int o = x*rhs._grid->_ostride[dimension];
int so =sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
if ( permute_slice ) {
exit(0);
// For fake communication ALWAYS extract and either merge one way or other
scalar_t * bptr = (scalar_t *) &comm_buf[0];
int bo=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
vector_t *optr = (vector_t *)&ret._odata[o];
vector_t *iptr = (vector_t *)&rhs._odata[so];
int skew = buffer_size*ns/2;
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
for(int n=0;n<nv;n++){// number of simd vecsscalars in a vector
extract(iptr[b*nv+n],&bptr[n],skew,permute_type);
}
}
o+=rhs._grid->_slice_stride[dimension];
// bo+=rhs._grid->_slice_stride[dimension]*ns/2;
}
} else {
int bo=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
comm_buf[bo++] =rhs._odata[so+i];
}
so+=rhs._grid->_slice_stride[dimension];
}
bo=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=comm_buf[bo++];
}
o+=rhs._grid->_slice_stride[dimension];
}
}
}
return ret;
};
*/
#endif

View File

@ -28,13 +28,13 @@ double usecond(void) {
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{
ucontext_t * uc= (ucontext_t *)ptr;
printf("Caught signal %d\n",si->si_signo);
printf(" mem address %llx\n",(uint64_t)si->si_addr);
printf(" code %d\n",si->si_code);
#ifdef __X86_64
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(uint64_t)sc->rip);

View File

@ -14,7 +14,7 @@ using namespace dpo::QCD;
int main (int argc, char ** argv)
{
#ifdef KNL
#if 0
struct sigaction sa,osa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= sa_action;
@ -27,6 +27,9 @@ int main (int argc, char ** argv)
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout(4);
for(int d=0;d<4;d++) mpi_layout[d]=1;
#ifdef AVX512
for(int omp=128;omp<236;omp+=16){
#else
@ -37,7 +40,7 @@ int main (int argc, char ** argv)
omp_set_num_threads(omp);
#endif
for(int lat=16;lat<=20;lat+=4){
for(int lat=4;lat<=16;lat+=40){
latt_size[0] = lat;
latt_size[1] = lat;
latt_size[2] = lat;
@ -62,11 +65,9 @@ int main (int argc, char ** argv)
simd_layout[2] = 1;
simd_layout[3] = 2;
#endif
GridCartesian Fine(latt_size,simd_layout);
GridRedBlackCartesian rbFine(latt_size,simd_layout);
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
LatticeColourMatrix Foo(&Fine);
LatticeColourMatrix Bar(&Fine);
@ -152,19 +153,23 @@ int main (int argc, char ** argv)
trscMat = trace(scMat); // Trace
FooBar = Bar;
int shift=1;
int dir=0;
/*
{
std::vector<int> coor(4);
for(int d=0;d<4;d++) coor[d] = 0;
peekSite(cmat,Foo,coor);
Foo = zero;
pokeSite(cmat,Foo,coor);
}
random(Foo);
pickCheckerboard(CbRed,rFoo,Foo); // Pick out red or black checkerboards
pickCheckerboard(CbBlack,bFoo,Foo);
*/
lex_sites(Foo);
//setCheckerboard(ShiftedCheck,rFoo);
//setCheckerboard(ShiftedCheck,bFoo);
Shifted = Cshift(Foo,dir,shift); // Shift everything
bShifted = Cshift(rFoo,dir,shift); // Shift red
rShifted = Cshift(bFoo,dir,shift); // Shift black
setCheckerboard(ShiftedCheck,bShifted); // Put them all together
setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
// Lattice SU(3) x SU(3)
FooBar = Foo * Bar;
@ -211,25 +216,69 @@ int main (int argc, char ** argv)
printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
pickCheckerboard(0,rFoo,FooBar);
pickCheckerboard(1,bFoo,FooBar);
setCheckerboard(FooBar,rFoo);
setCheckerboard(FooBar,bFoo);
// pickCheckerboard(0,rFoo,FooBar);
// pickCheckerboard(1,bFoo,FooBar);
// setCheckerboard(FooBar,rFoo);
// setCheckerboard(FooBar,bFoo);
double nrm=0;
for(int dir=0;dir<4;dir++){
for(int shift=0;shift<latt_size[dir];shift++){
pickCheckerboard(0,rFoo,Foo); // Pick out red or black checkerboards
pickCheckerboard(1,bFoo,Foo);
std::vector<int> coor(4);
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
Shifted = Cshift(Foo,dir,shift); // Shift everything
std::cout << "Shifting even parities"<<std::endl;
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
std::cout << "Shifting odd parities"<<std::endl;
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
ShiftedCheck=zero;
setCheckerboard(ShiftedCheck,bShifted); // Put them all together
setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
// Check results
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
std::complex<dpo::Real> diff;
std::vector<int> shiftcoor = coor;
shiftcoor[dir]=(shiftcoor[dir]-shift+latt_size[dir])%latt_size[dir];
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
std::vector<int> rl(4);
for(int dd=0;dd<4;dd++){
rl[dd] = latt_size[dd]/simd_layout[dd];
}
int lex = coor[0]%rl[0]
+ (coor[1]%rl[1])*rl[0]
+ (coor[2]%rl[2])*rl[0]*rl[1]
+ (coor[3]%rl[3])*rl[0]*rl[1]*rl[2];
lex +=
+1000*(coor[0]/rl[0])
+1000*(coor[1]/rl[1])*simd_layout[0]
+1000*(coor[2]/rl[2])*simd_layout[0]*simd_layout[1]
+1000*(coor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
int lex_coor = shiftcoor[0]%rl[0]
+ (shiftcoor[1]%rl[1])*rl[0]
+ (shiftcoor[2]%rl[2])*rl[0]*rl[1]
+ (shiftcoor[3]%rl[3])*rl[0]*rl[1]*rl[2];
lex_coor +=
+1000*(shiftcoor[0]/rl[0])
+1000*(shiftcoor[1]/rl[1])*simd_layout[0]
+1000*(shiftcoor[2]/rl[2])*simd_layout[0]*simd_layout[1]
+1000*(shiftcoor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
ColourMatrix foo;
ColourMatrix bar;
ColourMatrix shifted1;
@ -242,7 +291,6 @@ int main (int argc, char ** argv)
peekSite(shifted1,Shifted,coor);
peekSite(shifted2,Foo,shiftcoor);
peekSite(shifted3,ShiftedCheck,coor);
peekSite(foo,Foo,coor);
mdiff = shifted1-shifted2;
@ -258,13 +306,13 @@ int main (int argc, char ** argv)
diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c];
nn=real(conj(diff)*diff);
if ( nn > 0 )
cout<<"Shift fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
<< " "<< foo._internal._internal[r][c]<<endl;
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
else if(0)
cout<<"Shift pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
cout<<"Shift pass 1vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
<< " "<< foo._internal._internal[r][c]<<endl;
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
}}
for(int r=0;r<3;r++){
@ -272,13 +320,13 @@ int main (int argc, char ** argv)
diff =shifted3._internal._internal[r][c]-shifted2._internal._internal[r][c];
nn=real(conj(diff)*diff);
if ( nn > 0 )
cout<<"Shift rb fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
<< " "<< foo._internal._internal[r][c]<<endl;
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
else if(0)
cout<<"Shift rb pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
cout<<"Shift rb pass 3vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
<< " "<< foo._internal._internal[r][c]<<endl;
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
}}
peekSite(bar,Bar,coor);
@ -290,7 +338,7 @@ int main (int argc, char ** argv)
nrm = nrm + real(conj(diff)*diff);
}}
}}}}
}}
std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
} // loop for lat

View File

@ -1,6 +1,63 @@
#ifndef GRID_MATH_TYPES_H
#define GRID_MATH_TYPES_H
namespace dpo {
//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
//////////////////////////////////////////////////////////////////////////////////
template <class T> class GridTypeMapper {
public:
typedef typename T::scalar_type scalar_type;
typedef typename T::vector_type vector_type;
};
template<> class GridTypeMapper<RealF> {
public:
typedef RealF scalar_type;
typedef RealF vector_type;
};
template<> class GridTypeMapper<RealD> {
public:
typedef RealD scalar_type;
typedef RealD vector_type;
};
template<> class GridTypeMapper<ComplexF> {
public:
typedef ComplexF scalar_type;
typedef ComplexF vector_type;
};
template<> class GridTypeMapper<ComplexD> {
public:
typedef ComplexD scalar_type;
typedef ComplexD vector_type;
};
template<> class GridTypeMapper<vRealF> {
public:
typedef RealF scalar_type;
typedef vRealF vector_type;
};
template<> class GridTypeMapper<vRealD> {
public:
typedef RealD scalar_type;
typedef vRealD vector_type;
};
template<> class GridTypeMapper<vComplexF> {
public:
typedef ComplexF scalar_type;
typedef vComplexF vector_type;
};
template<> class GridTypeMapper<vComplexD> {
public:
typedef ComplexD scalar_type;
typedef vComplexD vector_type;
};
///////////////////////////////////////////////////
// Scalar, Vector, Matrix objects.
// These can be composed to form tensor products of internal indices.
@ -10,6 +67,10 @@ template<class vtype> class iScalar
{
public:
SIMDalign vtype _internal;
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
iScalar(){};
iScalar(Zero &z){ *this = zero; };
iScalar<vtype> & operator= (const Zero &hero){
@ -19,6 +80,15 @@ public:
friend void zeroit(iScalar<vtype> &that){
zeroit(that._internal);
}
friend void permute(iScalar<vtype> &out,iScalar<vtype> &in,int permutetype){
permute(out._internal,in._internal,permutetype);
}
friend void extract(iScalar<vtype> &in,std::vector<scalar_type *> &out){
extract(in._internal,out); // extract advances the pointers in out
}
friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
merge(in._internal,out); // extract advances the pointers in out
}
// Unary negation
friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
iScalar<vtype> ret;
@ -38,14 +108,16 @@ public:
*this = (*this)+r;
return *this;
}
};
template<class vtype,int N> class iVector
{
public:
SIMDalign vtype _internal[N];
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
iVector(Zero &z){ *this = zero; };
iVector() {};
iVector<vtype,N> & operator= (Zero &hero){
@ -57,6 +129,21 @@ public:
zeroit(that._internal[i]);
}
}
friend void permute(iVector<vtype,N> &out,iVector<vtype,N> &in,int permutetype){
for(int i=0;i<N;i++){
permute(out._internal[i],in._internal[i],permutetype);
}
}
friend void extract(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
for(int i=0;i<N;i++){
extract(in._internal[i],out);// extract advances pointers in out
}
}
friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
for(int i=0;i<N;i++){
merge(in._internal[i],out);// extract advances pointers in out
}
}
// Unary negation
friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
iVector<vtype,N> ret;
@ -84,6 +171,10 @@ template<class vtype,int N> class iMatrix
{
public:
SIMDalign vtype _internal[N][N];
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
iMatrix(Zero &z){ *this = zero; };
iMatrix() {};
iMatrix<vtype,N> & operator= (Zero &hero){
@ -96,6 +187,24 @@ public:
zeroit(that._internal[i][j]);
}}
}
friend void permute(iMatrix<vtype,N> &out,iMatrix<vtype,N> &in,int permutetype){
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
permute(out._internal[i][j],in._internal[i][j],permutetype);
}}
}
friend void extract(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
extract(in._internal[i][j],out);// extract advances pointers in out
}}
}
friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
merge(in._internal[i][j],out);// extract advances pointers in out
}}
}
// Unary negation
friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
iMatrix<vtype,N> ret;

85
Grid_mpi.cc Normal file
View File

@ -0,0 +1,85 @@
#include "Grid.h"
#include <mpi.h>
namespace dpo {
// Should error check all MPI calls.
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = _processors.size();
std::vector<int> periodic(_ndimension,1);
_processors = processors;
_processor_coords.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);
}
void CartesianCommunicator::GlobalSumF(float &f){
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumFVector(float *f,int N)
{
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumF(double &d)
{
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumFVector(double *d,int N)
{
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
int CartesianCommunicator::ShiftedRank(int dim,int shift)
{
int rank;
MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
return rank;
}
int CartesianCommunicator::Rank(std::vector<int> coor)
{
int rank;
MPI_Cart_rank (communicator, &coor[0], &rank);
return rank;
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
std::vector<int> &to_coordinate,
void *recv,
std::vector<int> &from_coordinate,
int bytes)
{
int dest = Rank(to_coordinate);
int from = Rank(from_coordinate);
MPI_Request x_req;
MPI_Request r_req;
MPI_Status OkeyDokey;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,dest,communicator,&x_req);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
MPI_Wait(&x_req,&OkeyDokey);
MPI_Wait(&r_req,&OkeyDokey);
MPI_Barrier();
}
}
#if 0
// Could possibly do a direct block strided send?
int MPI_Type_vector(
int count,
int blocklength,
int stride,
MPI_Datatype old_type,
MPI_Datatype *newtype_p
);
#endif

384
Grid_mpi_cshift.h Normal file
View File

@ -0,0 +1,384 @@
#ifndef _GRID_MPI_CSHIFT_H_
#define _GRID_MPI_CSHIFT_H_
//////////////////////////////////////////////
// Q. Split this into seperate sub functions?
//////////////////////////////////////////////
// CshiftCB_comms_splice
// CshiftCB_comms
// CshiftCB_local
// CshiftCB_local_permute
// Cshift_comms_splice
// Cshift_comms
// Cshift_local
// Cshift_local_permute
// Broadly I remain annoyed that the iteration is so painful
// for red black data layout, when simple block strided descriptors suffice for non-cb.
//
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
//
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so probably need
// additional stencil support.
//
// Could still do a templated syntax tree and make CSHIFT return lattice vector.
//
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
//
// Lattice <foo> could also allocate haloes which get used for stencil code.
//
// Grid could create a neighbour index table for a given stencil.
// Could also implement CovariantCshift.
//////////////////////////////////////////////////////
//Non checkerboarded support functions
//////////////////////////////////////////////////////
friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)
{
const int Nsimd = vector_type::Nsimd();
int rd = rhs._grid->_rdimensions[dimension];
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//
//////////////////////////////////////////////////////
//Checkerboarded support functions
//////////////////////////////////////////////////////
//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
const int Nsimd = vector_type::Nsimd();
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
//int ld = rhs._grid->_ldimensions[dimension];
//int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
// Logic for non-distributed dimension
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_to (simd_layout);
std::vector<int> comm_from (simd_layout);
std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane
std::vector<int> comm_simd_lane(simd_layout);// simd lane of neigbour plane
///////////////////////////////////////////////
// Move via a fake comms buffer
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<scalar_type *> pointers(Nsimd);
if ( permute_dim ) {
for(int x=0;x<rd;x++){
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
int o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else if ( splice_dim ) {
if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
}
for(int x=0;x<rd;x++){
///////////////////////////////////////////
// Extract one orthogonal slice at a time
///////////////////////////////////////////
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
for(int s=0;s<simd_layout;s++) {
// shift to "neighbour" takes us off node
// coordinates (rx, simd_lane) of neighbour
// how many nodes away is this shift
// where we should send to
// where we should receive from
int shifted_x = x+s*rd+shift;
comm_offnode[s] = shifted_x > ld;
comm_send_rx[s] = shifted_x%rd; // which slice geton the other node
comm_send_simd_lane [s] = shifted_x/rd; // which slice on the other node
comm_from[s] = shifted_x/ld;
comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];
}
////////////////////////////////////////////////
// Insert communication phase
////////////////////////////////////////////////
#if 0
} else if (comm_dim ) {
// Packed gather sequence is clean
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
// off node; communcate slice (ld==rd)
if ( x+shift > rd ) {
int sb=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
send_buf[sb++]=rhs._odata[so+i];
}
so+=rhs._grid->_slice_stride[dimension];
}
// Make a comm_fake them mimics comms in periodic case.
// scatter face
int rb=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[so+i]=recv_buf[rb++];
}
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#endif
////////////////////////////////////////////////
// Pull receive buffers and permuted buffers in
////////////////////////////////////////////////
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
}
o = 0; // relative offset to base
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
merge(ret._odata[ro+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else if ( comm_dim ) {
int co; // comm offset
int o;
co=0;
for(int x=0;x<rd;x++){
o=0;
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
comm_buf[co++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
// Step through a copy into a comms buffer and pull back in.
// Genuine fake implementation could calculate if loops back
co=0; o=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
ret._odata[ro+o+b]=comm_buf[co++];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else { // Local dimension, no permute required
for(int x=0;x<rd;x++){
int o=0;
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
return ret;
}
#endif

313
Grid_none_cshift.h Normal file
View File

@ -0,0 +1,313 @@
#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_
// Work out whether to permute
// ABCDEFGH -> AE BF CG DH permute wrap num
//
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
// Suppose 4way simd in one dim.
// ABCDEFGH -> AECG BFDH permute wrap num
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
// Generalisation to 8 way simd, 16 way simd required.
//
// Need log2 Nway masks. consisting of
// 1 bit 256 bit granule
// 2 bit 128 bit granule
// 4 bits 64 bit granule
// 8 bits 32 bit granules
//
// 15 bits....
// For optimisation:
//
// split into Cshift_none_rb_permute
// split into Cshift_none_rb_simple
//
// split into Cshift_none_permute
// split into Cshift_none_simple
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_ldimensions[dimension];
int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
for(int x=0;x<rd;x++){
int bo = x*rhs._grid->_ostride[dimension];
int o = 0;
if ( permute_dim ) {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
return ret;
}
#if 0
// Collapse doesn't appear to work the way I think it should in icpc
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int sx,so,o;
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over perp slices.
// Threading considerations:
// Need to map thread_num to
//
// x_min,x_max for Loop-A
// n_min,n_max for Loop-B
// b_min,b_max for Loop-C
// In a way that maximally load balances.
//
// Optimal:
// There are rd*n_block*block items of work.
// These serialise as item "w"
// b=w%block; w=w/block
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
//
// Logic:
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
//
// rd=5 , threads = 8;
// 0 1 2 3 4 5 6 7
// 0 0 0 1 1 1 1 1
for(int x=0;x<rd;x++){ // Loop A
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension];
so =sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
#if 0
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
vComplex *optr = (vComplex *)&ret._odata[o];
vComplex *iptr = (vComplex *)&rhs._odata[so];
for(int b=0;b<num;b++){
permute(optr[b],iptr[b],permute_type);
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#else
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<num;b++){
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
permute(optr[b],iptr[b],permute_type);
}
}
} else {
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
int oo = o+ n*rhs._grid->_slice_stride[dimension];
int soo=so+ n*rhs._grid->_slice_stride[dimension];
ret._odata[oo+i]=rhs._odata[soo+i];
}
}
}
#endif
}
return ret;
}
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over all work
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
#pragma omp parallel for
for(int ww=0;ww<work;ww++){
// can optimise this if know w moves congtiguously for a given thread.
// b=(b+1);
// if (b==_slice_block) {b=0; n=n+1;}
// if (n==_slice_nblock) { n=0; x=x+1}
//
// Perhaps a five cycle iterator, or so.
int w=ww;
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
int x = w;
int sx,so,o;
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
vComplex *optr = (vComplex *)&ret._odata[o+b];
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
const char *pf = (const char *)iptr;
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
for(int i=0;i<internal;i++){
permute(optr[i],iptr[i],permute_type);
}
} else {
const char *pf = (const char *) &rhs._odata[so+b];
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
ret._odata[o+b]=rhs._odata[so+b];
}
}
return ret;
}
#endif
#endif

View File

@ -47,7 +47,7 @@ namespace dpo {
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
@ -130,7 +130,7 @@ namespace dpo {
#if defined (AVX1) || defined (AVX2)
typedef __m256 fvec;
typedef __m256d dvec;
typedef __m256 cvec;
typedef __m256 cvec;
typedef __m256d zvec;
#endif
#if defined (AVX512)

View File

@ -8,6 +8,9 @@ namespace dpo {
protected:
zvec v;
public:
typedef zvec vector_type;
typedef ComplexD scalar_type;
vComplexD & operator = ( Zero & z){
vzero(*this);
return (*this);
@ -145,14 +148,43 @@ namespace dpo {
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
/* Imag OK */
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
/////////////////////////////////////////////////////////////////
// Extract
/////////////////////////////////////////////////////////////////
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vComplexD::Nsimd();
std::vector<ComplexD> buf(Nsimd);
vstore(y,&buf[0]);
for(int i=0;i<Nsimd;i++){
*extracted[i] = buf[i];
extracted[i]++;
}
};
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vComplexD::Nsimd();
std::vector<ComplexD> buf(Nsimd);
for(int i=0;i<Nsimd;i++){
buf[i]=*extracted[i];
extracted[i]++;
}
vset(y,&buf[0]);
};
/////////////////////////////////////////////////////////////////
// Permute
@ -165,7 +197,7 @@ namespace dpo {
// AB => BA i.e. ab cd =>cd ab
#endif
#ifdef SSE2
//No cases here
break;
#endif
#ifdef AVX512
// 4 complex=>2 permute
@ -205,7 +237,7 @@ namespace dpo {
#endif
}
friend inline void vset(vComplexD &ret, std::complex<double> *a){
friend inline void vset(vComplexD &ret,ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
@ -221,7 +253,7 @@ namespace dpo {
#endif
}
friend inline void vstore(vComplexD &ret, std::complex<double> *a){
friend inline void vstore(vComplexD &ret, ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd((double *)a,ret.v);
#endif
@ -273,10 +305,10 @@ friend inline void vstore(vComplexD &ret, std::complex<double> *a){
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
__attribute__ ((aligned(32))) double c_[4];
_mm256_store_pd(c_,in.v);
return std::complex<double>(c_[0]+c_[2],c_[1]+c_[3]);
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
#endif
#ifdef AVX512
return std::complex<double>(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
#endif
#ifdef QPX
#endif

View File

@ -8,6 +8,11 @@ namespace dpo {
cvec v;
public:
static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
public:
typedef cvec vector_type;
typedef ComplexF scalar_type;
vComplexF & operator = ( Zero & z){
vzero(*this);
return (*this);
@ -125,6 +130,39 @@ namespace dpo {
return ret;
};
/////////////////////////////////////////////////////////////////
// Extract
/////////////////////////////////////////////////////////////////
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
// Bounce off heap is painful
// temporary hack while I figure out the right interface
vComplexF vbuf;
ComplexF *buf = (ComplexF *)&vbuf;
vstore(y,&buf[0]);
for(int i=0;i<vComplexF::Nsimd();i++){
*extracted[i] = buf[i];
extracted[i]++;
}
};
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vComplexF::Nsimd();
vComplexF vbuf;
ComplexF *buf = (ComplexF *)&vbuf;
for(int i=0;i<Nsimd;i++){
buf[i]=*extracted[i];
extracted[i]++;
}
vset(y,&buf[0]);
};
/////////////////////////////////////////////////////////////////
// Permute
/////////////////////////////////////////////////////////////////
@ -185,7 +223,7 @@ namespace dpo {
vsplat(ret,a,b);
}
friend inline void vstore(vComplexF &ret, std::complex<float> *a){
friend inline void vstore(vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps((float *)a,ret.v);
#endif
@ -225,11 +263,11 @@ exit(-1);
#if defined (AVX1) || defined(AVX2)
__attribute__ ((aligned(32))) float c_[8];
_mm256_store_ps(c_,in.v);
return std::complex<float>(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
#endif
#ifdef AVX512
return std::complex<float>(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
#endif
#ifdef QPX
#endif
@ -321,9 +359,6 @@ exit(-1);
return *this;
}
public:
static int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
};
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }

View File

@ -7,7 +7,11 @@ namespace dpo{
class vRealD {
protected:
dvec v; // dvec is double precision vector
public:
typedef dvec vector_type;
typedef RealD scalar_type;
vRealD(){};
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
@ -94,6 +98,37 @@ namespace dpo{
#endif
return ret;
};
/////////////////////////////////////////////////////////////////
// Extract
/////////////////////////////////////////////////////////////////
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vRealD::Nsimd();
RealD buf[Nsimd];
vstore(y,buf);
for(int i=0;i<Nsimd;i++){
*extracted[i] = buf[i];
extracted[i]++;
}
};
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vRealD::Nsimd();
RealD buf[Nsimd];
for(int i=0;i<Nsimd;i++){
buf[i]=*extracted[i];
extracted[i]++;
}
vset(y,buf);
};
// Permute plans
// Permute 0 every ABCDEFGH -> BA DC FE HG

View File

@ -7,7 +7,12 @@ namespace dpo {
class vRealF {
protected:
fvec v;
public:
typedef fvec vector_type;
typedef RealF scalar_type;
vRealF(){};
////////////////////////////////////
// Arithmetic operator overloads +,-,*
@ -113,6 +118,37 @@ namespace dpo {
//////////////////////////////////
friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
/////////////////////////////////////////////////////////////////
// Extract
/////////////////////////////////////////////////////////////////
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vRealF::Nsimd();
RealF buf[Nsimd];
vstore(y,buf);
for(int i=0;i<Nsimd;i++){
*extracted[i] = buf[i];
extracted[i]++;
}
};
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
// Bounce off stack is painful
// temporary hack while I figure out the right interface
const int Nsimd = vRealF::Nsimd();
RealF buf[Nsimd];
for(int i=0;i<Nsimd;i++){
buf[i]=*extracted[i];
extracted[i]++;
}
vset(y,buf);
};
//////////////////////////////////////////////////////////
// Permute
@ -256,7 +292,7 @@ friend inline void vstore(vRealF &ret, float *a){
return *this;
}
public:
static int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}