mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Fixing the Checkerboarding cshift.
Implemented "fake" communications in preparation for the leap to MPI.
This commit is contained in:
parent
82eeb9d07e
commit
196fd203e2
239
Grid/Grid.xcodeproj/project.pbxproj
Normal file
239
Grid/Grid.xcodeproj/project.pbxproj
Normal file
@ -0,0 +1,239 @@
|
|||||||
|
// !$*UTF8*$!
|
||||||
|
{
|
||||||
|
archiveVersion = 1;
|
||||||
|
classes = {
|
||||||
|
};
|
||||||
|
objectVersion = 46;
|
||||||
|
objects = {
|
||||||
|
|
||||||
|
/* Begin PBXBuildFile section */
|
||||||
|
0BF5DA461AC370840045EA34 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0BF5DA451AC370840045EA34 /* main.cpp */; };
|
||||||
|
/* End PBXBuildFile section */
|
||||||
|
|
||||||
|
/* Begin PBXCopyFilesBuildPhase section */
|
||||||
|
0BF5DA401AC370840045EA34 /* CopyFiles */ = {
|
||||||
|
isa = PBXCopyFilesBuildPhase;
|
||||||
|
buildActionMask = 2147483647;
|
||||||
|
dstPath = /usr/share/man/man1/;
|
||||||
|
dstSubfolderSpec = 0;
|
||||||
|
files = (
|
||||||
|
);
|
||||||
|
runOnlyForDeploymentPostprocessing = 1;
|
||||||
|
};
|
||||||
|
/* End PBXCopyFilesBuildPhase section */
|
||||||
|
|
||||||
|
/* Begin PBXFileReference section */
|
||||||
|
0BF5DA421AC370840045EA34 /* Grid */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Grid; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
|
0BF5DA451AC370840045EA34 /* main.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = "<group>"; };
|
||||||
|
/* End PBXFileReference section */
|
||||||
|
|
||||||
|
/* Begin PBXFrameworksBuildPhase section */
|
||||||
|
0BF5DA3F1AC370840045EA34 /* Frameworks */ = {
|
||||||
|
isa = PBXFrameworksBuildPhase;
|
||||||
|
buildActionMask = 2147483647;
|
||||||
|
files = (
|
||||||
|
);
|
||||||
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
|
};
|
||||||
|
/* End PBXFrameworksBuildPhase section */
|
||||||
|
|
||||||
|
/* Begin PBXGroup section */
|
||||||
|
0BF5DA391AC370840045EA34 = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
0BF5DA441AC370840045EA34 /* Grid */,
|
||||||
|
0BF5DA431AC370840045EA34 /* Products */,
|
||||||
|
);
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
0BF5DA431AC370840045EA34 /* Products */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
0BF5DA421AC370840045EA34 /* Grid */,
|
||||||
|
);
|
||||||
|
name = Products;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
0BF5DA441AC370840045EA34 /* Grid */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
0BF5DA451AC370840045EA34 /* main.cpp */,
|
||||||
|
);
|
||||||
|
path = Grid;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
|
/* End PBXGroup section */
|
||||||
|
|
||||||
|
/* Begin PBXNativeTarget section */
|
||||||
|
0BF5DA411AC370840045EA34 /* Grid */ = {
|
||||||
|
isa = PBXNativeTarget;
|
||||||
|
buildConfigurationList = 0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */;
|
||||||
|
buildPhases = (
|
||||||
|
0BF5DA3E1AC370840045EA34 /* Sources */,
|
||||||
|
0BF5DA3F1AC370840045EA34 /* Frameworks */,
|
||||||
|
0BF5DA401AC370840045EA34 /* CopyFiles */,
|
||||||
|
);
|
||||||
|
buildRules = (
|
||||||
|
);
|
||||||
|
dependencies = (
|
||||||
|
);
|
||||||
|
name = Grid;
|
||||||
|
productName = Grid;
|
||||||
|
productReference = 0BF5DA421AC370840045EA34 /* Grid */;
|
||||||
|
productType = "com.apple.product-type.tool";
|
||||||
|
};
|
||||||
|
/* End PBXNativeTarget section */
|
||||||
|
|
||||||
|
/* Begin PBXProject section */
|
||||||
|
0BF5DA3A1AC370840045EA34 /* Project object */ = {
|
||||||
|
isa = PBXProject;
|
||||||
|
attributes = {
|
||||||
|
LastUpgradeCheck = 0610;
|
||||||
|
ORGANIZATIONNAME = "University of Edinburgh";
|
||||||
|
TargetAttributes = {
|
||||||
|
0BF5DA411AC370840045EA34 = {
|
||||||
|
CreatedOnToolsVersion = 6.1.1;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
buildConfigurationList = 0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */;
|
||||||
|
compatibilityVersion = "Xcode 3.2";
|
||||||
|
developmentRegion = English;
|
||||||
|
hasScannedForEncodings = 0;
|
||||||
|
knownRegions = (
|
||||||
|
en,
|
||||||
|
);
|
||||||
|
mainGroup = 0BF5DA391AC370840045EA34;
|
||||||
|
productRefGroup = 0BF5DA431AC370840045EA34 /* Products */;
|
||||||
|
projectDirPath = "";
|
||||||
|
projectRoot = "";
|
||||||
|
targets = (
|
||||||
|
0BF5DA411AC370840045EA34 /* Grid */,
|
||||||
|
);
|
||||||
|
};
|
||||||
|
/* End PBXProject section */
|
||||||
|
|
||||||
|
/* Begin PBXSourcesBuildPhase section */
|
||||||
|
0BF5DA3E1AC370840045EA34 /* Sources */ = {
|
||||||
|
isa = PBXSourcesBuildPhase;
|
||||||
|
buildActionMask = 2147483647;
|
||||||
|
files = (
|
||||||
|
0BF5DA461AC370840045EA34 /* main.cpp in Sources */,
|
||||||
|
);
|
||||||
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
|
};
|
||||||
|
/* End PBXSourcesBuildPhase section */
|
||||||
|
|
||||||
|
/* Begin XCBuildConfiguration section */
|
||||||
|
0BF5DA471AC370840045EA34 /* Debug */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
|
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
||||||
|
CLANG_CXX_LIBRARY = "libc++";
|
||||||
|
CLANG_ENABLE_MODULES = YES;
|
||||||
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
|
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||||
|
CLANG_WARN_EMPTY_BODY = YES;
|
||||||
|
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||||
|
CLANG_WARN_INT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||||
|
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||||
|
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||||
|
COPY_PHASE_STRIP = NO;
|
||||||
|
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||||
|
GCC_C_LANGUAGE_STANDARD = gnu99;
|
||||||
|
GCC_DYNAMIC_NO_PIC = NO;
|
||||||
|
GCC_OPTIMIZATION_LEVEL = 0;
|
||||||
|
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||||
|
"DEBUG=1",
|
||||||
|
"$(inherited)",
|
||||||
|
);
|
||||||
|
GCC_SYMBOLS_PRIVATE_EXTERN = NO;
|
||||||
|
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||||
|
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||||
|
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||||
|
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||||
|
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||||
|
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||||
|
MACOSX_DEPLOYMENT_TARGET = 10.10;
|
||||||
|
MTL_ENABLE_DEBUG_INFO = YES;
|
||||||
|
ONLY_ACTIVE_ARCH = YES;
|
||||||
|
SDKROOT = macosx;
|
||||||
|
};
|
||||||
|
name = Debug;
|
||||||
|
};
|
||||||
|
0BF5DA481AC370840045EA34 /* Release */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||||
|
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
||||||
|
CLANG_CXX_LIBRARY = "libc++";
|
||||||
|
CLANG_ENABLE_MODULES = YES;
|
||||||
|
CLANG_ENABLE_OBJC_ARC = YES;
|
||||||
|
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||||
|
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||||
|
CLANG_WARN_EMPTY_BODY = YES;
|
||||||
|
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||||
|
CLANG_WARN_INT_CONVERSION = YES;
|
||||||
|
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||||
|
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||||
|
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||||
|
COPY_PHASE_STRIP = YES;
|
||||||
|
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||||
|
ENABLE_NS_ASSERTIONS = NO;
|
||||||
|
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||||
|
GCC_C_LANGUAGE_STANDARD = gnu99;
|
||||||
|
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||||
|
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||||
|
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||||
|
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||||
|
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||||
|
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||||
|
MACOSX_DEPLOYMENT_TARGET = 10.10;
|
||||||
|
MTL_ENABLE_DEBUG_INFO = NO;
|
||||||
|
SDKROOT = macosx;
|
||||||
|
};
|
||||||
|
name = Release;
|
||||||
|
};
|
||||||
|
0BF5DA4A1AC370840045EA34 /* Debug */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||||
|
};
|
||||||
|
name = Debug;
|
||||||
|
};
|
||||||
|
0BF5DA4B1AC370840045EA34 /* Release */ = {
|
||||||
|
isa = XCBuildConfiguration;
|
||||||
|
buildSettings = {
|
||||||
|
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||||
|
};
|
||||||
|
name = Release;
|
||||||
|
};
|
||||||
|
/* End XCBuildConfiguration section */
|
||||||
|
|
||||||
|
/* Begin XCConfigurationList section */
|
||||||
|
0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */ = {
|
||||||
|
isa = XCConfigurationList;
|
||||||
|
buildConfigurations = (
|
||||||
|
0BF5DA471AC370840045EA34 /* Debug */,
|
||||||
|
0BF5DA481AC370840045EA34 /* Release */,
|
||||||
|
);
|
||||||
|
defaultConfigurationIsVisible = 0;
|
||||||
|
defaultConfigurationName = Release;
|
||||||
|
};
|
||||||
|
0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */ = {
|
||||||
|
isa = XCConfigurationList;
|
||||||
|
buildConfigurations = (
|
||||||
|
0BF5DA4A1AC370840045EA34 /* Debug */,
|
||||||
|
0BF5DA4B1AC370840045EA34 /* Release */,
|
||||||
|
);
|
||||||
|
defaultConfigurationIsVisible = 0;
|
||||||
|
};
|
||||||
|
/* End XCConfigurationList section */
|
||||||
|
};
|
||||||
|
rootObject = 0BF5DA3A1AC370840045EA34 /* Project object */;
|
||||||
|
}
|
7
Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
7
Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Workspace
|
||||||
|
version = "1.0">
|
||||||
|
<FileRef
|
||||||
|
location = "self:Grid.xcodeproj">
|
||||||
|
</FileRef>
|
||||||
|
</Workspace>
|
BIN
Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate
generated
Normal file
BIN
Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate
generated
Normal file
Binary file not shown.
@ -0,0 +1,43 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<Scheme
|
||||||
|
LastUpgradeVersion = "0610"
|
||||||
|
version = "1.3">
|
||||||
|
<BuildAction
|
||||||
|
parallelizeBuildables = "YES"
|
||||||
|
buildImplicitDependencies = "YES">
|
||||||
|
</BuildAction>
|
||||||
|
<TestAction
|
||||||
|
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
|
||||||
|
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
|
||||||
|
shouldUseLaunchSchemeArgsEnv = "YES"
|
||||||
|
buildConfiguration = "Debug">
|
||||||
|
<Testables>
|
||||||
|
</Testables>
|
||||||
|
</TestAction>
|
||||||
|
<LaunchAction
|
||||||
|
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
|
||||||
|
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
|
||||||
|
launchStyle = "0"
|
||||||
|
useCustomWorkingDirectory = "NO"
|
||||||
|
buildConfiguration = "Debug"
|
||||||
|
ignoresPersistentStateOnLaunch = "NO"
|
||||||
|
debugDocumentVersioning = "YES"
|
||||||
|
allowLocationSimulation = "YES">
|
||||||
|
<AdditionalOptions>
|
||||||
|
</AdditionalOptions>
|
||||||
|
</LaunchAction>
|
||||||
|
<ProfileAction
|
||||||
|
shouldUseLaunchSchemeArgsEnv = "YES"
|
||||||
|
savedToolIdentifier = ""
|
||||||
|
useCustomWorkingDirectory = "NO"
|
||||||
|
buildConfiguration = "Release"
|
||||||
|
debugDocumentVersioning = "YES">
|
||||||
|
</ProfileAction>
|
||||||
|
<AnalyzeAction
|
||||||
|
buildConfiguration = "Debug">
|
||||||
|
</AnalyzeAction>
|
||||||
|
<ArchiveAction
|
||||||
|
buildConfiguration = "Release"
|
||||||
|
revealArchiveInOrganizer = "YES">
|
||||||
|
</ArchiveAction>
|
||||||
|
</Scheme>
|
@ -0,0 +1,22 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||||
|
<plist version="1.0">
|
||||||
|
<dict>
|
||||||
|
<key>SchemeUserState</key>
|
||||||
|
<dict>
|
||||||
|
<key>Grid.xcscheme</key>
|
||||||
|
<dict>
|
||||||
|
<key>orderHint</key>
|
||||||
|
<integer>0</integer>
|
||||||
|
</dict>
|
||||||
|
</dict>
|
||||||
|
<key>SuppressBuildableAutocreation</key>
|
||||||
|
<dict>
|
||||||
|
<key>0BF5DA411AC370840045EA34</key>
|
||||||
|
<dict>
|
||||||
|
<key>primary</key>
|
||||||
|
<true/>
|
||||||
|
</dict>
|
||||||
|
</dict>
|
||||||
|
</dict>
|
||||||
|
</plist>
|
184
Grid_Cartesian.h
184
Grid_Cartesian.h
@ -1,10 +1,9 @@
|
|||||||
#ifndef GRID_CARTESIAN_H
|
#ifndef GRID_CARTESIAN_H
|
||||||
#define GRID_CARTESIAN_H
|
#define GRID_CARTESIAN_H
|
||||||
|
|
||||||
#include "Grid.h"
|
#include <Grid.h>
|
||||||
|
#include <Grid_Communicator.h>
|
||||||
namespace dpo{
|
namespace dpo{
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Grid Support. Following will go into Grid.h.
|
// Grid Support. Following will go into Grid.h.
|
||||||
@ -13,28 +12,47 @@ namespace dpo{
|
|||||||
// dpo::Grid
|
// dpo::Grid
|
||||||
// dpo::GridCartesian
|
// dpo::GridCartesian
|
||||||
// dpo::GridCartesianRedBlack
|
// dpo::GridCartesianRedBlack
|
||||||
|
|
||||||
class Grid {
|
|
||||||
|
class SimdGrid : public CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
|
SimdGrid(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
||||||
|
|
||||||
// Give Lattice access
|
// Give Lattice access
|
||||||
template<class object> friend class Lattice;
|
template<class object> friend class Lattice;
|
||||||
|
|
||||||
|
|
||||||
//protected:
|
//protected:
|
||||||
|
|
||||||
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
||||||
// and one generator per site.
|
// and one generator per site.
|
||||||
//std::default_random_engine generator;
|
//std::default_random_engine generator;
|
||||||
// static std::mt19937 generator( 9 );
|
// static std::mt19937 generator( 9 );
|
||||||
|
|
||||||
|
|
||||||
// Grid information.
|
// Grid information.
|
||||||
unsigned long _ndimension;
|
|
||||||
std::vector<int> _layout; // Which dimensions get relayed out over simd lanes.
|
// Commicator provides
|
||||||
std::vector<int> _dimensions; // Dimensions of array
|
// unsigned long _ndimension;
|
||||||
std::vector<int> _rdimensions;// Reduced dimensions with simd lane images removed
|
// std::vector<int> _processors; // processor grid
|
||||||
|
// int _processor; // linear processor rank
|
||||||
|
// std::vector<int> _processor_coor; // linear processor rank
|
||||||
|
|
||||||
|
std::vector<int> _simd_layout; // Which dimensions get relayed out over simd lanes.
|
||||||
|
|
||||||
|
|
||||||
|
std::vector<int> _fdimensions;// Global dimensions of array with cb removed
|
||||||
|
std::vector<int> _gdimensions;// Global dimensions of array
|
||||||
|
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
|
||||||
|
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
|
||||||
|
|
||||||
|
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
|
||||||
|
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> _ostride; // Outer stride for each dimension
|
std::vector<int> _ostride; // Outer stride for each dimension
|
||||||
std::vector<int> _istride; // Inner stride i.e. within simd lane
|
std::vector<int> _istride; // Inner stride i.e. within simd lane
|
||||||
|
|
||||||
int _osites; // _isites*_osites = product(dimensions).
|
int _osites; // _isites*_osites = product(dimensions).
|
||||||
int _isites;
|
int _isites;
|
||||||
|
|
||||||
@ -70,15 +88,33 @@ public:
|
|||||||
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
|
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int oSites(void) { return _osites; };
|
inline int oSites(void) { return _osites; };
|
||||||
inline int iSites(void) { return _isites; };
|
inline int iSites(void) { return _isites; };
|
||||||
|
|
||||||
|
inline int CheckerboardFromOsite (int Osite){
|
||||||
|
std::vector<int> ocoor;
|
||||||
|
CoordFromOsite(ocoor,Osite);
|
||||||
|
int ss=0;
|
||||||
|
for(int d=0;d<_ndimension;d++){
|
||||||
|
ss=ss+ocoor[d];
|
||||||
|
}
|
||||||
|
return ss&0x1;
|
||||||
|
}
|
||||||
|
inline void CoordFromOsite (std::vector<int>& coor,int Osite){
|
||||||
|
coor.resize(_ndimension);
|
||||||
|
for(int d=0;d<_ndimension;d++){
|
||||||
|
coor[d] = Osite % _rdimensions[d];
|
||||||
|
Osite = Osite / _rdimensions[d];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
virtual int CheckerBoard(std::vector<int> site)=0;
|
virtual int CheckerBoard(std::vector<int> site)=0;
|
||||||
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
|
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0;
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
||||||
};
|
};
|
||||||
|
|
||||||
class GridCartesian: public Grid {
|
class GridCartesian: public SimdGrid {
|
||||||
public:
|
public:
|
||||||
virtual int CheckerBoard(std::vector<int> site){
|
virtual int CheckerBoard(std::vector<int> site){
|
||||||
return 0;
|
return 0;
|
||||||
@ -86,19 +122,24 @@ public:
|
|||||||
virtual int CheckerBoardDestination(int cb,int shift){
|
virtual int CheckerBoardDestination(int cb,int shift){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift){
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
|
||||||
return shift;
|
return shift;
|
||||||
}
|
}
|
||||||
GridCartesian(std::vector<int> &dimensions,std::vector<int> layout)
|
GridCartesian(std::vector<int> &dimensions,
|
||||||
|
std::vector<int> &simd_layout,
|
||||||
|
std::vector<int> &processor_grid
|
||||||
|
) : SimdGrid(processor_grid)
|
||||||
{
|
{
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Grid information
|
// Grid information
|
||||||
///////////////////////
|
///////////////////////
|
||||||
_ndimension = dimensions.size();
|
_ndimension = dimensions.size();
|
||||||
|
|
||||||
_dimensions.resize(_ndimension);
|
_fdimensions.resize(_ndimension);
|
||||||
|
_gdimensions.resize(_ndimension);
|
||||||
|
_ldimensions.resize(_ndimension);
|
||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
|
|
||||||
_ostride.resize(_ndimension);
|
_ostride.resize(_ndimension);
|
||||||
_istride.resize(_ndimension);
|
_istride.resize(_ndimension);
|
||||||
@ -106,21 +147,26 @@ public:
|
|||||||
_osites = 1;
|
_osites = 1;
|
||||||
_isites = 1;
|
_isites = 1;
|
||||||
for(int d=0;d<_ndimension;d++){
|
for(int d=0;d<_ndimension;d++){
|
||||||
_dimensions[d] = dimensions[d];
|
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||||
_layout[d] = layout[d];
|
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||||
// Use a reduced simd grid
|
_simd_layout[d] = simd_layout[d];
|
||||||
_rdimensions[d]= _dimensions[d]/_layout[d]; //<-- _layout[d] is zero
|
|
||||||
_osites *= _rdimensions[d];
|
//FIXME check for exact division
|
||||||
_isites *= _layout[d];
|
|
||||||
|
// Use a reduced simd grid
|
||||||
|
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
|
||||||
|
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
|
||||||
|
_osites *= _rdimensions[d];
|
||||||
|
_isites *= _simd_layout[d];
|
||||||
|
|
||||||
// Addressing support
|
// Addressing support
|
||||||
if ( d==0 ) {
|
if ( d==0 ) {
|
||||||
_ostride[d] = 1;
|
_ostride[d] = 1;
|
||||||
_istride[d] = 1;
|
_istride[d] = 1;
|
||||||
} else {
|
} else {
|
||||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||||
_istride[d] = _istride[d-1]*_layout[d-1];
|
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
@ -148,46 +194,57 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// Specialise this for red black grids storing half the data like a chess board.
|
// Specialise this for red black grids storing half the data like a chess board.
|
||||||
class GridRedBlackCartesian : public Grid
|
class GridRedBlackCartesian : public SimdGrid
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
virtual int CheckerBoard(std::vector<int> site){
|
virtual int CheckerBoard(std::vector<int> site){
|
||||||
return site[0]&0x1;
|
return (site[0]+site[1]+site[2]+site[3])&0x1;
|
||||||
}
|
}
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift){
|
|
||||||
if ( dim == 0 ){
|
// Depending on the cb of site, we toggle source cb.
|
||||||
int fulldim =2*_dimensions[0];
|
// for block #b, element #e = (b, e)
|
||||||
shift = (shift+fulldim)%fulldim;
|
// we need
|
||||||
if ( source_cb ) {
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
|
||||||
// Shift 0,1 -> 0
|
|
||||||
return (shift+1)/2;
|
if(dim != 0) return shift;
|
||||||
} else {
|
|
||||||
// Shift 0->0, 1 -> 1, 2->1
|
int fulldim =_fdimensions[0];
|
||||||
return (shift)/2;
|
shift = (shift+fulldim)%fulldim;
|
||||||
}
|
|
||||||
} else {
|
// Probably faster with table lookup;
|
||||||
return shift;
|
// or by looping over x,y,z and multiply rather than computing checkerboard.
|
||||||
}
|
int ocb=CheckerboardFromOsite(osite);
|
||||||
|
|
||||||
|
if ( (source_cb+ocb)&1 ) {
|
||||||
|
return (shift)/2;
|
||||||
|
} else {
|
||||||
|
return (shift+1)/2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual int CheckerBoardDestination(int source_cb,int shift){
|
virtual int CheckerBoardDestination(int source_cb,int shift){
|
||||||
if ((shift+2*_dimensions[0])&0x1) {
|
if ((shift+_fdimensions[0])&0x1) {
|
||||||
return 1-source_cb;
|
return 1-source_cb;
|
||||||
} else {
|
} else {
|
||||||
return source_cb;
|
return source_cb;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
GridRedBlackCartesian(std::vector<int> &dimensions,std::vector<int> layout)
|
GridRedBlackCartesian(std::vector<int> &dimensions,
|
||||||
|
std::vector<int> &simd_layout,
|
||||||
|
std::vector<int> &processor_grid) : SimdGrid(processor_grid)
|
||||||
{
|
{
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Grid information
|
// Grid information
|
||||||
///////////////////////
|
///////////////////////
|
||||||
_ndimension = dimensions.size();
|
_ndimension = dimensions.size();
|
||||||
|
|
||||||
_dimensions.resize(_ndimension);
|
_fdimensions.resize(_ndimension);
|
||||||
|
_gdimensions.resize(_ndimension);
|
||||||
|
_ldimensions.resize(_ndimension);
|
||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
|
|
||||||
_ostride.resize(_ndimension);
|
_ostride.resize(_ndimension);
|
||||||
_istride.resize(_ndimension);
|
_istride.resize(_ndimension);
|
||||||
@ -195,14 +252,17 @@ public:
|
|||||||
_osites = 1;
|
_osites = 1;
|
||||||
_isites = 1;
|
_isites = 1;
|
||||||
for(int d=0;d<_ndimension;d++){
|
for(int d=0;d<_ndimension;d++){
|
||||||
_dimensions[d] = dimensions[d];
|
_fdimensions[d] = dimensions[d];
|
||||||
_dimensions[0] = dimensions[0]/2;
|
_gdimensions[d] = _fdimensions[d];
|
||||||
_layout[d] = layout[d];
|
if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
|
||||||
|
_ldimensions[d] = _gdimensions[d]/_processors[d];
|
||||||
|
|
||||||
// Use a reduced simd grid
|
// Use a reduced simd grid
|
||||||
_rdimensions[d]= _dimensions[d]/_layout[d];
|
_simd_layout[d] = simd_layout[d];
|
||||||
|
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
|
||||||
|
|
||||||
_osites *= _rdimensions[d];
|
_osites *= _rdimensions[d];
|
||||||
_isites *= _layout[d];
|
_isites *= _simd_layout[d];
|
||||||
|
|
||||||
// Addressing support
|
// Addressing support
|
||||||
if ( d==0 ) {
|
if ( d==0 ) {
|
||||||
@ -210,7 +270,7 @@ public:
|
|||||||
_istride[d] = 1;
|
_istride[d] = 1;
|
||||||
} else {
|
} else {
|
||||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||||
_istride[d] = _istride[d-1]*_layout[d-1];
|
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
39
Grid_Communicator.h
Normal file
39
Grid_Communicator.h
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#ifndef GRID_COMMUNICATOR_H
|
||||||
|
#define GRID_COMMUNICATOR_H
|
||||||
|
///////////////////////////////////
|
||||||
|
// Processor layout information
|
||||||
|
///////////////////////////////////
|
||||||
|
namespace dpo {
|
||||||
|
class CartesianCommunicator {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Communicator should know nothing of the physics grid, only processor grid.
|
||||||
|
|
||||||
|
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
|
||||||
|
int _processor; // linear processor rank
|
||||||
|
std::vector<int> _processor_coor; // linear processor coordinate
|
||||||
|
unsigned long _ndimension;
|
||||||
|
|
||||||
|
#ifdef GRID_COMMS_MPI
|
||||||
|
MPI_Comm communicator;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
CartesianCommunicator(std::vector<int> &pdimensions_in);
|
||||||
|
|
||||||
|
int Rank(std::vector<int> coor);
|
||||||
|
void GlobalSumF(float &);
|
||||||
|
void GlobalSumFVector(float *,int N);
|
||||||
|
|
||||||
|
void GlobalSumF(double &);
|
||||||
|
void GlobalSumFVector(double *,int N);
|
||||||
|
|
||||||
|
void SendToRecvFrom(void *xmit,
|
||||||
|
std::vector<int> to_coordinate,
|
||||||
|
void *recv,
|
||||||
|
std::vector<int> from_coordinate,
|
||||||
|
int bytes);
|
||||||
|
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
299
Grid_Lattice.h
299
Grid_Lattice.h
@ -3,19 +3,31 @@
|
|||||||
|
|
||||||
#include "Grid.h"
|
#include "Grid.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace dpo {
|
namespace dpo {
|
||||||
|
|
||||||
|
// Permute the pointers 32bitx16 = 512
|
||||||
|
static int permute_map[4][16] = {
|
||||||
|
{ 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
|
||||||
|
{ 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
|
||||||
|
{ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
|
||||||
|
{ 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
class Lattice
|
class Lattice
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Grid *_grid;
|
SimdGrid *_grid;
|
||||||
int checkerboard;
|
int checkerboard;
|
||||||
std::vector<vobj,alignedAllocator<vobj> > _odata;
|
std::vector<vobj,alignedAllocator<vobj> > _odata;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
Lattice(Grid *grid) : _grid(grid) {
|
|
||||||
|
Lattice(SimdGrid *grid) : _grid(grid) {
|
||||||
_odata.reserve(_grid->oSites());
|
_odata.reserve(_grid->oSites());
|
||||||
if ( ((uint64_t)&_odata[0])&0xF) {
|
if ( ((uint64_t)&_odata[0])&0xF) {
|
||||||
exit(-1);
|
exit(-1);
|
||||||
@ -23,6 +35,19 @@ public:
|
|||||||
checkerboard=0;
|
checkerboard=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef GRID_COMMS_NONE
|
||||||
|
#include <Grid_none_cshift.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_COMMS_FAKE
|
||||||
|
#include <Grid_fake_cshift.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_COMMS_MPI
|
||||||
|
#include <Grid_mpi_cshift.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// overloading dpo::conformable but no conformable in dpo ...?:w
|
// overloading dpo::conformable but no conformable in dpo ...?:w
|
||||||
template<class obj1,class obj2>
|
template<class obj1,class obj2>
|
||||||
@ -69,212 +94,6 @@ public:
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
#if 0
|
|
||||||
// Collapse doesn't appear to work the way I think it should in icpc
|
|
||||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
|
||||||
{
|
|
||||||
Lattice<vobj> ret(rhs._grid);
|
|
||||||
|
|
||||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
|
||||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
|
||||||
int sx,so,o;
|
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
|
||||||
int ld = rhs._grid->_dimensions[dimension];
|
|
||||||
|
|
||||||
// Map to always positive shift.
|
|
||||||
shift = (shift+ld)%ld;
|
|
||||||
|
|
||||||
// Work out whether to permute and the permute type
|
|
||||||
// ABCDEFGH -> AE BF CG DH permute
|
|
||||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
|
||||||
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
|
||||||
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
|
||||||
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
|
||||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
|
||||||
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
|
||||||
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
|
||||||
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
|
||||||
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
|
||||||
int permute_type=0;
|
|
||||||
for(int d=0;d<dimension;d++)
|
|
||||||
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
|
||||||
|
|
||||||
|
|
||||||
// loop over perp slices.
|
|
||||||
// Threading considerations:
|
|
||||||
// Need to map thread_num to
|
|
||||||
//
|
|
||||||
// x_min,x_max for Loop-A
|
|
||||||
// n_min,n_max for Loop-B
|
|
||||||
// b_min,b_max for Loop-C
|
|
||||||
// In a way that maximally load balances.
|
|
||||||
//
|
|
||||||
// Optimal:
|
|
||||||
// There are rd*n_block*block items of work.
|
|
||||||
// These serialise as item "w"
|
|
||||||
// b=w%block; w=w/block
|
|
||||||
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
|
|
||||||
//
|
|
||||||
// Logic:
|
|
||||||
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
|
|
||||||
//
|
|
||||||
// rd=5 , threads = 8;
|
|
||||||
// 0 1 2 3 4 5 6 7
|
|
||||||
// 0 0 0 1 1 1 1 1
|
|
||||||
for(int x=0;x<rd;x++){ // Loop A
|
|
||||||
sx = (x-shift+ld)%rd;
|
|
||||||
o = x*rhs._grid->_ostride[dimension];
|
|
||||||
so =sx*rhs._grid->_ostride[dimension];
|
|
||||||
|
|
||||||
int permute_slice=0;
|
|
||||||
if ( permute_dim ) {
|
|
||||||
permute_slice = shift/rd;
|
|
||||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
if ( permute_slice ) {
|
|
||||||
|
|
||||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
|
||||||
int num =rhs._grid->_slice_block[dimension]*internal;
|
|
||||||
|
|
||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
|
||||||
vComplex *optr = (vComplex *)&ret._odata[o];
|
|
||||||
vComplex *iptr = (vComplex *)&rhs._odata[so];
|
|
||||||
for(int b=0;b<num;b++){
|
|
||||||
permute(optr[b],iptr[b],permute_type);
|
|
||||||
}
|
|
||||||
o+=rhs._grid->_slice_stride[dimension];
|
|
||||||
so+=rhs._grid->_slice_stride[dimension];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
|
||||||
ret._odata[o+i]=rhs._odata[so+i];
|
|
||||||
}
|
|
||||||
o+=rhs._grid->_slice_stride[dimension];
|
|
||||||
so+=rhs._grid->_slice_stride[dimension];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
|
|
||||||
if ( permute_slice ) {
|
|
||||||
|
|
||||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
|
||||||
int num =rhs._grid->_slice_block[dimension]*internal;
|
|
||||||
|
|
||||||
|
|
||||||
#pragma omp parallel for collapse(2)
|
|
||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int b=0;b<num;b++){
|
|
||||||
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
|
|
||||||
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
|
|
||||||
permute(optr[b],iptr[b],permute_type);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
|
|
||||||
#pragma omp parallel for collapse(2)
|
|
||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
|
||||||
int oo = o+ n*rhs._grid->_slice_stride[dimension];
|
|
||||||
int soo=so+ n*rhs._grid->_slice_stride[dimension];
|
|
||||||
ret._odata[oo+i]=rhs._odata[soo+i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
|
||||||
{
|
|
||||||
Lattice<vobj> ret(rhs._grid);
|
|
||||||
|
|
||||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
|
||||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
|
||||||
int ld = rhs._grid->_dimensions[dimension];
|
|
||||||
|
|
||||||
// Map to always positive shift.
|
|
||||||
shift = (shift+ld)%ld;
|
|
||||||
|
|
||||||
// Work out whether to permute and the permute type
|
|
||||||
// ABCDEFGH -> AE BF CG DH permute
|
|
||||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
|
||||||
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
|
||||||
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
|
||||||
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
|
||||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
|
||||||
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
|
||||||
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
|
||||||
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
|
||||||
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
|
||||||
int permute_type=0;
|
|
||||||
for(int d=0;d<dimension;d++)
|
|
||||||
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
|
||||||
|
|
||||||
|
|
||||||
// loop over all work
|
|
||||||
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for(int ww=0;ww<work;ww++){
|
|
||||||
|
|
||||||
|
|
||||||
// can optimise this if know w moves congtiguously for a given thread.
|
|
||||||
// b=(b+1);
|
|
||||||
// if (b==_slice_block) {b=0; n=n+1;}
|
|
||||||
// if (n==_slice_nblock) { n=0; x=x+1}
|
|
||||||
//
|
|
||||||
// Perhaps a five cycle iterator, or so.
|
|
||||||
int w=ww;
|
|
||||||
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
|
|
||||||
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
|
|
||||||
int x = w;
|
|
||||||
|
|
||||||
int sx,so,o;
|
|
||||||
sx = (x-shift+ld)%rd;
|
|
||||||
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
|
|
||||||
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
|
|
||||||
|
|
||||||
int permute_slice=0;
|
|
||||||
if ( permute_dim ) {
|
|
||||||
permute_slice = shift/rd;
|
|
||||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( permute_slice ) {
|
|
||||||
|
|
||||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
|
||||||
vComplex *optr = (vComplex *)&ret._odata[o+b];
|
|
||||||
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
|
|
||||||
const char *pf = (const char *)iptr;
|
|
||||||
for(int i=0;i<sizeof(vobj);i+=64){
|
|
||||||
_mm_prefetch(pf+i,_MM_HINT_T0);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int i=0;i<internal;i++){
|
|
||||||
permute(optr[i],iptr[i],permute_type);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
const char *pf = (const char *) &rhs._odata[so+b];
|
|
||||||
for(int i=0;i<sizeof(vobj);i+=64){
|
|
||||||
_mm_prefetch(pf+i,_MM_HINT_T0);
|
|
||||||
}
|
|
||||||
ret._odata[o+b]=rhs._odata[so+b];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<class sobj>
|
template<class sobj>
|
||||||
inline Lattice<vobj> & operator = (const sobj & r){
|
inline Lattice<vobj> & operator = (const sobj & r){
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
@ -288,14 +107,15 @@ public:
|
|||||||
template<class sobj>
|
template<class sobj>
|
||||||
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
||||||
|
|
||||||
if ( l._grid.checkerboard != l._grid->Checkerboard(site)){
|
if ( l.checkerboard != l._grid->CheckerBoard(site)){
|
||||||
printf("Poking wrong checkerboard\n");
|
printf("Poking wrong checkerboard\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
int o_index = l._grid->oIndex(site);
|
int o_index = l._grid->oIndex(site);
|
||||||
int i_index = l._grid->iIndex(site);
|
int i_index = l._grid->iIndex(site);
|
||||||
|
|
||||||
|
// BUGGY. This assumes complex real
|
||||||
Real *v_ptr = (Real *)&l._odata[o_index];
|
Real *v_ptr = (Real *)&l._odata[o_index];
|
||||||
Real *s_ptr = (Real *)&s;
|
Real *s_ptr = (Real *)&s;
|
||||||
v_ptr = v_ptr + 2*i_index;
|
v_ptr = v_ptr + 2*i_index;
|
||||||
@ -348,6 +168,22 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
friend void lex_sites(Lattice<vobj> &l){
|
||||||
|
|
||||||
|
Real *v_ptr = (Real *)&l._odata[0];
|
||||||
|
size_t o_len = l._grid->oSites();
|
||||||
|
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
||||||
|
size_t vec_len = vRealF::Nsimd();
|
||||||
|
|
||||||
|
for(int i=0;i<o_len;i++){
|
||||||
|
for(int j=0;j<v_len;j++){
|
||||||
|
for(int vv=0;vv<vec_len;vv+=2){
|
||||||
|
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
|
||||||
|
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
};
|
||||||
|
|
||||||
friend void gaussian(Lattice<vobj> &l){
|
friend void gaussian(Lattice<vobj> &l){
|
||||||
// Zero mean, unit variance.
|
// Zero mean, unit variance.
|
||||||
std::normal_distribution<double> distribution(0.0,1.0);
|
std::normal_distribution<double> distribution(0.0,1.0);
|
||||||
@ -427,22 +263,41 @@ public:
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// remove and insert a half checkerboard
|
// remove and insert a half checkerboard
|
||||||
friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
||||||
|
half.checkerboard = cb;
|
||||||
|
int ssh=0;
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(int ss=0;ss<half._grid->oSites();ss++){
|
for(int ss=0;ss<full._grid->oSites();ss++){
|
||||||
half._odata[ss] = full._odata[ss*2+cb];
|
std::vector<int> coor;
|
||||||
}
|
int cbos;
|
||||||
half.checkerboard = cb;
|
|
||||||
|
full._grid->CoordFromOsite(coor,ss);
|
||||||
|
cbos=half._grid->CheckerBoard(coor);
|
||||||
|
|
||||||
|
if (cbos==cb) {
|
||||||
|
|
||||||
|
half._odata[ssh] = full._odata[ss];
|
||||||
|
ssh++;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||||
int cb = half.checkerboard;
|
int cb = half.checkerboard;
|
||||||
|
int ssh=0;
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(int ss=0;ss<half._grid->oSites();ss++){
|
for(int ss=0;ss<full._grid->oSites();ss++){
|
||||||
full._odata[ss*2+cb]=half._odata[ss];
|
std::vector<int> coor;
|
||||||
}
|
int cbos;
|
||||||
|
|
||||||
|
full._grid->CoordFromOsite(coor,ss);
|
||||||
|
cbos=half._grid->CheckerBoard(coor);
|
||||||
|
|
||||||
|
if (cbos==cb) {
|
||||||
|
full._odata[ss]=half._odata[ssh];
|
||||||
|
ssh++;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}; // class Lattice
|
}; // class Lattice
|
||||||
|
|
||||||
|
@ -5,7 +5,6 @@ namespace QCD {
|
|||||||
|
|
||||||
static const int Nc=3;
|
static const int Nc=3;
|
||||||
static const int Ns=4;
|
static const int Ns=4;
|
||||||
|
|
||||||
static const int CbRed =0;
|
static const int CbRed =0;
|
||||||
static const int CbBlack=1;
|
static const int CbBlack=1;
|
||||||
|
|
||||||
|
@ -10,6 +10,15 @@
|
|||||||
/* AVX512 */
|
/* AVX512 */
|
||||||
/* #undef AVX512 */
|
/* #undef AVX512 */
|
||||||
|
|
||||||
|
/* GRID_COMMS_FAKE */
|
||||||
|
#define GRID_COMMS_FAKE 1
|
||||||
|
|
||||||
|
/* GRID_COMMS_MPI */
|
||||||
|
/* #undef GRID_COMMS_MPI */
|
||||||
|
|
||||||
|
/* GRID_COMMS_NONE */
|
||||||
|
/* #undef GRID_COMMS_NONE */
|
||||||
|
|
||||||
/* Define to 1 if you have the `gettimeofday' function. */
|
/* Define to 1 if you have the `gettimeofday' function. */
|
||||||
#define HAVE_GETTIMEOFDAY 1
|
#define HAVE_GETTIMEOFDAY 1
|
||||||
|
|
||||||
@ -61,6 +70,9 @@
|
|||||||
/* Define to the one symbol short name of this package. */
|
/* Define to the one symbol short name of this package. */
|
||||||
#define PACKAGE_TARNAME "grid"
|
#define PACKAGE_TARNAME "grid"
|
||||||
|
|
||||||
|
/* Define to the home page for this package. */
|
||||||
|
#define PACKAGE_URL ""
|
||||||
|
|
||||||
/* Define to the version of this package. */
|
/* Define to the version of this package. */
|
||||||
#define PACKAGE_VERSION "1.0"
|
#define PACKAGE_VERSION "1.0"
|
||||||
|
|
||||||
|
34
Grid_fake.cc
Normal file
34
Grid_fake.cc
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
#include "Grid.h"
|
||||||
|
namespace dpo {
|
||||||
|
|
||||||
|
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
||||||
|
{
|
||||||
|
_ndimension = _processors.size();
|
||||||
|
_processor_coor.resize(_ndimension);
|
||||||
|
_processors = processors;
|
||||||
|
|
||||||
|
// Require 1^N processor grid for fake
|
||||||
|
for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);
|
||||||
|
|
||||||
|
_processor = 0;// I am the one. The only one..
|
||||||
|
for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::GlobalSumF(float &){}
|
||||||
|
void CartesianCommunicator::GlobalSumFVector(float *,int N){}
|
||||||
|
void CartesianCommunicator::GlobalSumF(double &){}
|
||||||
|
void CartesianCommunicator::GlobalSumFVector(double *,int N){}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
std::vector<int> to_coordinate,
|
||||||
|
void *recv,
|
||||||
|
std::vector<int> from_coordinate,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
exit(-1);
|
||||||
|
bcopy(xmit,recv,bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
246
Grid_fake_cshift.h
Normal file
246
Grid_fake_cshift.h
Normal file
@ -0,0 +1,246 @@
|
|||||||
|
#ifndef _GRID_FAKE_H_
|
||||||
|
#define _GRID_FAKE_H_
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
const int Nsimd = vector_type::Nsimd();
|
||||||
|
|
||||||
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
int fd = rhs._grid->_fdimensions[dimension];
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
//int ld = rhs._grid->_ldimensions[dimension];
|
||||||
|
//int gd = rhs._grid->_gdimensions[dimension];
|
||||||
|
|
||||||
|
|
||||||
|
// Map to always positive shift modulo global full dimension.
|
||||||
|
shift = (shift+fd)%fd;
|
||||||
|
|
||||||
|
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
|
||||||
|
// the permute type
|
||||||
|
|
||||||
|
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=0;d<dimension;d++){
|
||||||
|
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// Move via a fake comms buffer
|
||||||
|
// Simd direction uses an extract/merge pair
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||||
|
int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||||
|
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||||
|
std::vector<scalar_type *> pointers(Nsimd);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||||
|
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||||
|
|
||||||
|
if ( permute_dim ) {
|
||||||
|
|
||||||
|
int o = 0; // relative offset to base
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
// base offset for source
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
extract(rhs._odata[so+o+b],pointers);
|
||||||
|
} else {
|
||||||
|
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||||
|
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
o = 0; // relative offset to base
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
// base offset for source
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
merge(ret._odata[ro+o+b],pointers);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int co; // comm offset
|
||||||
|
int o;
|
||||||
|
|
||||||
|
co=0; o=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
// This call in inner loop is annoying but necessary for dimension=0
|
||||||
|
// in the case of RedBlack grids. Could optimise away with
|
||||||
|
// alternate code paths for all other cases.
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
comm_buf[co++]=rhs._odata[so+o+b];
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step through a copy into a comms buffer and pull back in.
|
||||||
|
// Genuine fake implementation could calculate if loops back
|
||||||
|
co=0; o=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
ret._odata[ro+o+b]=comm_buf[co++];
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
|
{
|
||||||
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
int ld = rhs._grid->_ldimensions[dimension];
|
||||||
|
int gd = rhs._grid->_gdimensions[dimension];
|
||||||
|
|
||||||
|
// Map to always positive shift.
|
||||||
|
shift = (shift+gd)%gd;
|
||||||
|
|
||||||
|
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||||
|
|
||||||
|
// Work out whether to permute and the permute type
|
||||||
|
// ABCDEFGH -> AE BF CG DH permute
|
||||||
|
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||||
|
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||||
|
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||||
|
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||||
|
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||||
|
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||||
|
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||||
|
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||||
|
|
||||||
|
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=0;d<dimension;d++)
|
||||||
|
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||||
|
|
||||||
|
|
||||||
|
// loop over all work
|
||||||
|
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
// Packed gather sequence is clean
|
||||||
|
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_type scalar_t;
|
||||||
|
typedef typename vobj::vector_type vector_t;
|
||||||
|
const int ns=sizeof(vobj)/sizeof(scalar_t);
|
||||||
|
const int nv=sizeof(vobj)/sizeof(vector_t);
|
||||||
|
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int sx = (x+shift)%rd;
|
||||||
|
int o = x*rhs._grid->_ostride[dimension];
|
||||||
|
int so =sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
if ( permute_dim ) {
|
||||||
|
permute_slice = shift/rd;
|
||||||
|
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
exit(0);
|
||||||
|
// For fake communication ALWAYS extract and either merge one way or other
|
||||||
|
scalar_t * bptr = (scalar_t *) &comm_buf[0];
|
||||||
|
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
|
||||||
|
vector_t *optr = (vector_t *)&ret._odata[o];
|
||||||
|
vector_t *iptr = (vector_t *)&rhs._odata[so];
|
||||||
|
int skew = buffer_size*ns/2;
|
||||||
|
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
for(int n=0;n<nv;n++){// number of simd vecsscalars in a vector
|
||||||
|
extract(iptr[b*nv+n],&bptr[n],skew,permute_type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o+=rhs._grid->_slice_stride[dimension];
|
||||||
|
// bo+=rhs._grid->_slice_stride[dimension]*ns/2;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
comm_buf[bo++] =rhs._odata[so+i];
|
||||||
|
}
|
||||||
|
so+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
bo=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
ret._odata[o+i]=comm_buf[bo++];
|
||||||
|
}
|
||||||
|
o+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif
|
@ -28,13 +28,13 @@ double usecond(void) {
|
|||||||
|
|
||||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
{
|
{
|
||||||
ucontext_t * uc= (ucontext_t *)ptr;
|
|
||||||
|
|
||||||
printf("Caught signal %d\n",si->si_signo);
|
printf("Caught signal %d\n",si->si_signo);
|
||||||
printf(" mem address %llx\n",(uint64_t)si->si_addr);
|
printf(" mem address %llx\n",(uint64_t)si->si_addr);
|
||||||
printf(" code %d\n",si->si_code);
|
printf(" code %d\n",si->si_code);
|
||||||
|
|
||||||
#ifdef __X86_64
|
#ifdef __X86_64
|
||||||
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
|
|
||||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
printf(" instruction %llx\n",(uint64_t)sc->rip);
|
printf(" instruction %llx\n",(uint64_t)sc->rip);
|
||||||
|
|
||||||
|
124
Grid_main.cc
124
Grid_main.cc
@ -14,7 +14,7 @@ using namespace dpo::QCD;
|
|||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
|
|
||||||
#ifdef KNL
|
#if 0
|
||||||
struct sigaction sa,osa;
|
struct sigaction sa,osa;
|
||||||
sigemptyset (&sa.sa_mask);
|
sigemptyset (&sa.sa_mask);
|
||||||
sa.sa_sigaction= sa_action;
|
sa.sa_sigaction= sa_action;
|
||||||
@ -27,6 +27,9 @@ int main (int argc, char ** argv)
|
|||||||
std::vector<int> latt_size(4);
|
std::vector<int> latt_size(4);
|
||||||
std::vector<int> simd_layout(4);
|
std::vector<int> simd_layout(4);
|
||||||
|
|
||||||
|
std::vector<int> mpi_layout(4);
|
||||||
|
for(int d=0;d<4;d++) mpi_layout[d]=1;
|
||||||
|
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
for(int omp=128;omp<236;omp+=16){
|
for(int omp=128;omp<236;omp+=16){
|
||||||
#else
|
#else
|
||||||
@ -37,7 +40,7 @@ int main (int argc, char ** argv)
|
|||||||
omp_set_num_threads(omp);
|
omp_set_num_threads(omp);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for(int lat=16;lat<=20;lat+=4){
|
for(int lat=4;lat<=16;lat+=40){
|
||||||
latt_size[0] = lat;
|
latt_size[0] = lat;
|
||||||
latt_size[1] = lat;
|
latt_size[1] = lat;
|
||||||
latt_size[2] = lat;
|
latt_size[2] = lat;
|
||||||
@ -62,11 +65,9 @@ int main (int argc, char ** argv)
|
|||||||
simd_layout[2] = 1;
|
simd_layout[2] = 1;
|
||||||
simd_layout[3] = 2;
|
simd_layout[3] = 2;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
GridCartesian Fine(latt_size,simd_layout);
|
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||||
GridRedBlackCartesian rbFine(latt_size,simd_layout);
|
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
|
|
||||||
LatticeColourMatrix Foo(&Fine);
|
LatticeColourMatrix Foo(&Fine);
|
||||||
LatticeColourMatrix Bar(&Fine);
|
LatticeColourMatrix Bar(&Fine);
|
||||||
@ -152,19 +153,23 @@ int main (int argc, char ** argv)
|
|||||||
trscMat = trace(scMat); // Trace
|
trscMat = trace(scMat); // Trace
|
||||||
|
|
||||||
FooBar = Bar;
|
FooBar = Bar;
|
||||||
int shift=1;
|
|
||||||
int dir=0;
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
{
|
||||||
|
std::vector<int> coor(4);
|
||||||
|
for(int d=0;d<4;d++) coor[d] = 0;
|
||||||
|
peekSite(cmat,Foo,coor);
|
||||||
|
Foo = zero;
|
||||||
|
pokeSite(cmat,Foo,coor);
|
||||||
|
}
|
||||||
random(Foo);
|
random(Foo);
|
||||||
pickCheckerboard(CbRed,rFoo,Foo); // Pick out red or black checkerboards
|
*/
|
||||||
pickCheckerboard(CbBlack,bFoo,Foo);
|
lex_sites(Foo);
|
||||||
|
|
||||||
|
|
||||||
|
//setCheckerboard(ShiftedCheck,rFoo);
|
||||||
|
//setCheckerboard(ShiftedCheck,bFoo);
|
||||||
|
|
||||||
Shifted = Cshift(Foo,dir,shift); // Shift everything
|
|
||||||
bShifted = Cshift(rFoo,dir,shift); // Shift red
|
|
||||||
rShifted = Cshift(bFoo,dir,shift); // Shift black
|
|
||||||
|
|
||||||
setCheckerboard(ShiftedCheck,bShifted); // Put them all together
|
|
||||||
setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
|
|
||||||
|
|
||||||
// Lattice SU(3) x SU(3)
|
// Lattice SU(3) x SU(3)
|
||||||
FooBar = Foo * Bar;
|
FooBar = Foo * Bar;
|
||||||
@ -211,25 +216,69 @@ int main (int argc, char ** argv)
|
|||||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
||||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
||||||
|
|
||||||
pickCheckerboard(0,rFoo,FooBar);
|
// pickCheckerboard(0,rFoo,FooBar);
|
||||||
pickCheckerboard(1,bFoo,FooBar);
|
// pickCheckerboard(1,bFoo,FooBar);
|
||||||
|
// setCheckerboard(FooBar,rFoo);
|
||||||
setCheckerboard(FooBar,rFoo);
|
// setCheckerboard(FooBar,bFoo);
|
||||||
setCheckerboard(FooBar,bFoo);
|
|
||||||
|
|
||||||
double nrm=0;
|
double nrm=0;
|
||||||
|
|
||||||
|
for(int dir=0;dir<4;dir++){
|
||||||
|
for(int shift=0;shift<latt_size[dir];shift++){
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pickCheckerboard(0,rFoo,Foo); // Pick out red or black checkerboards
|
||||||
|
pickCheckerboard(1,bFoo,Foo);
|
||||||
|
|
||||||
std::vector<int> coor(4);
|
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
|
||||||
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
|
Shifted = Cshift(Foo,dir,shift); // Shift everything
|
||||||
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
|
|
||||||
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
|
std::cout << "Shifting even parities"<<std::endl;
|
||||||
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
|
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
|
||||||
|
std::cout << "Shifting odd parities"<<std::endl;
|
||||||
|
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
|
||||||
|
|
||||||
|
ShiftedCheck=zero;
|
||||||
|
setCheckerboard(ShiftedCheck,bShifted); // Put them all together
|
||||||
|
setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
|
||||||
|
|
||||||
|
// Check results
|
||||||
|
std::vector<int> coor(4);
|
||||||
|
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
|
||||||
|
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
|
||||||
|
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
|
||||||
|
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
|
||||||
|
|
||||||
std::complex<dpo::Real> diff;
|
std::complex<dpo::Real> diff;
|
||||||
|
|
||||||
std::vector<int> shiftcoor = coor;
|
std::vector<int> shiftcoor = coor;
|
||||||
shiftcoor[dir]=(shiftcoor[dir]-shift+latt_size[dir])%latt_size[dir];
|
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
|
||||||
|
|
||||||
|
std::vector<int> rl(4);
|
||||||
|
for(int dd=0;dd<4;dd++){
|
||||||
|
rl[dd] = latt_size[dd]/simd_layout[dd];
|
||||||
|
}
|
||||||
|
int lex = coor[0]%rl[0]
|
||||||
|
+ (coor[1]%rl[1])*rl[0]
|
||||||
|
+ (coor[2]%rl[2])*rl[0]*rl[1]
|
||||||
|
+ (coor[3]%rl[3])*rl[0]*rl[1]*rl[2];
|
||||||
|
lex +=
|
||||||
|
+1000*(coor[0]/rl[0])
|
||||||
|
+1000*(coor[1]/rl[1])*simd_layout[0]
|
||||||
|
+1000*(coor[2]/rl[2])*simd_layout[0]*simd_layout[1]
|
||||||
|
+1000*(coor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
|
||||||
|
|
||||||
|
int lex_coor = shiftcoor[0]%rl[0]
|
||||||
|
+ (shiftcoor[1]%rl[1])*rl[0]
|
||||||
|
+ (shiftcoor[2]%rl[2])*rl[0]*rl[1]
|
||||||
|
+ (shiftcoor[3]%rl[3])*rl[0]*rl[1]*rl[2];
|
||||||
|
lex_coor +=
|
||||||
|
+1000*(shiftcoor[0]/rl[0])
|
||||||
|
+1000*(shiftcoor[1]/rl[1])*simd_layout[0]
|
||||||
|
+1000*(shiftcoor[2]/rl[2])*simd_layout[0]*simd_layout[1]
|
||||||
|
+1000*(shiftcoor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
|
||||||
|
|
||||||
ColourMatrix foo;
|
ColourMatrix foo;
|
||||||
ColourMatrix bar;
|
ColourMatrix bar;
|
||||||
ColourMatrix shifted1;
|
ColourMatrix shifted1;
|
||||||
@ -242,7 +291,6 @@ int main (int argc, char ** argv)
|
|||||||
peekSite(shifted1,Shifted,coor);
|
peekSite(shifted1,Shifted,coor);
|
||||||
peekSite(shifted2,Foo,shiftcoor);
|
peekSite(shifted2,Foo,shiftcoor);
|
||||||
peekSite(shifted3,ShiftedCheck,coor);
|
peekSite(shifted3,ShiftedCheck,coor);
|
||||||
|
|
||||||
peekSite(foo,Foo,coor);
|
peekSite(foo,Foo,coor);
|
||||||
|
|
||||||
mdiff = shifted1-shifted2;
|
mdiff = shifted1-shifted2;
|
||||||
@ -258,13 +306,13 @@ int main (int argc, char ** argv)
|
|||||||
diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c];
|
diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c];
|
||||||
nn=real(conj(diff)*diff);
|
nn=real(conj(diff)*diff);
|
||||||
if ( nn > 0 )
|
if ( nn > 0 )
|
||||||
cout<<"Shift fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||||
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||||
else if(0)
|
else if(0)
|
||||||
cout<<"Shift pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
cout<<"Shift pass 1vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||||
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
for(int r=0;r<3;r++){
|
for(int r=0;r<3;r++){
|
||||||
@ -272,13 +320,13 @@ int main (int argc, char ** argv)
|
|||||||
diff =shifted3._internal._internal[r][c]-shifted2._internal._internal[r][c];
|
diff =shifted3._internal._internal[r][c]-shifted2._internal._internal[r][c];
|
||||||
nn=real(conj(diff)*diff);
|
nn=real(conj(diff)*diff);
|
||||||
if ( nn > 0 )
|
if ( nn > 0 )
|
||||||
cout<<"Shift rb fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||||
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||||
else if(0)
|
else if(0)
|
||||||
cout<<"Shift rb pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
cout<<"Shift rb pass 3vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||||
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||||
}}
|
}}
|
||||||
peekSite(bar,Bar,coor);
|
peekSite(bar,Bar,coor);
|
||||||
|
|
||||||
@ -290,7 +338,7 @@ int main (int argc, char ** argv)
|
|||||||
nrm = nrm + real(conj(diff)*diff);
|
nrm = nrm + real(conj(diff)*diff);
|
||||||
}}
|
}}
|
||||||
}}}}
|
}}}}
|
||||||
|
}}
|
||||||
std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
|
std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
|
||||||
|
|
||||||
} // loop for lat
|
} // loop for lat
|
||||||
|
@ -1,6 +1,63 @@
|
|||||||
#ifndef GRID_MATH_TYPES_H
|
#ifndef GRID_MATH_TYPES_H
|
||||||
#define GRID_MATH_TYPES_H
|
#define GRID_MATH_TYPES_H
|
||||||
namespace dpo {
|
namespace dpo {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <class T> class GridTypeMapper {
|
||||||
|
public:
|
||||||
|
typedef typename T::scalar_type scalar_type;
|
||||||
|
typedef typename T::vector_type vector_type;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> class GridTypeMapper<RealF> {
|
||||||
|
public:
|
||||||
|
typedef RealF scalar_type;
|
||||||
|
typedef RealF vector_type;
|
||||||
|
};
|
||||||
|
template<> class GridTypeMapper<RealD> {
|
||||||
|
public:
|
||||||
|
typedef RealD scalar_type;
|
||||||
|
typedef RealD vector_type;
|
||||||
|
};
|
||||||
|
template<> class GridTypeMapper<ComplexF> {
|
||||||
|
public:
|
||||||
|
typedef ComplexF scalar_type;
|
||||||
|
typedef ComplexF vector_type;
|
||||||
|
};
|
||||||
|
template<> class GridTypeMapper<ComplexD> {
|
||||||
|
public:
|
||||||
|
typedef ComplexD scalar_type;
|
||||||
|
typedef ComplexD vector_type;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> class GridTypeMapper<vRealF> {
|
||||||
|
public:
|
||||||
|
typedef RealF scalar_type;
|
||||||
|
typedef vRealF vector_type;
|
||||||
|
};
|
||||||
|
template<> class GridTypeMapper<vRealD> {
|
||||||
|
public:
|
||||||
|
typedef RealD scalar_type;
|
||||||
|
typedef vRealD vector_type;
|
||||||
|
};
|
||||||
|
template<> class GridTypeMapper<vComplexF> {
|
||||||
|
public:
|
||||||
|
typedef ComplexF scalar_type;
|
||||||
|
typedef vComplexF vector_type;
|
||||||
|
};
|
||||||
|
template<> class GridTypeMapper<vComplexD> {
|
||||||
|
public:
|
||||||
|
typedef ComplexD scalar_type;
|
||||||
|
typedef vComplexD vector_type;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// Scalar, Vector, Matrix objects.
|
// Scalar, Vector, Matrix objects.
|
||||||
// These can be composed to form tensor products of internal indices.
|
// These can be composed to form tensor products of internal indices.
|
||||||
@ -10,6 +67,10 @@ template<class vtype> class iScalar
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SIMDalign vtype _internal;
|
SIMDalign vtype _internal;
|
||||||
|
|
||||||
|
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||||
|
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||||
|
|
||||||
iScalar(){};
|
iScalar(){};
|
||||||
iScalar(Zero &z){ *this = zero; };
|
iScalar(Zero &z){ *this = zero; };
|
||||||
iScalar<vtype> & operator= (const Zero &hero){
|
iScalar<vtype> & operator= (const Zero &hero){
|
||||||
@ -19,6 +80,15 @@ public:
|
|||||||
friend void zeroit(iScalar<vtype> &that){
|
friend void zeroit(iScalar<vtype> &that){
|
||||||
zeroit(that._internal);
|
zeroit(that._internal);
|
||||||
}
|
}
|
||||||
|
friend void permute(iScalar<vtype> &out,iScalar<vtype> &in,int permutetype){
|
||||||
|
permute(out._internal,in._internal,permutetype);
|
||||||
|
}
|
||||||
|
friend void extract(iScalar<vtype> &in,std::vector<scalar_type *> &out){
|
||||||
|
extract(in._internal,out); // extract advances the pointers in out
|
||||||
|
}
|
||||||
|
friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
|
||||||
|
merge(in._internal,out); // extract advances the pointers in out
|
||||||
|
}
|
||||||
// Unary negation
|
// Unary negation
|
||||||
friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
|
friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
|
||||||
iScalar<vtype> ret;
|
iScalar<vtype> ret;
|
||||||
@ -38,14 +108,16 @@ public:
|
|||||||
*this = (*this)+r;
|
*this = (*this)+r;
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class vtype,int N> class iVector
|
template<class vtype,int N> class iVector
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SIMDalign vtype _internal[N];
|
SIMDalign vtype _internal[N];
|
||||||
|
|
||||||
|
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||||
|
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||||
|
|
||||||
iVector(Zero &z){ *this = zero; };
|
iVector(Zero &z){ *this = zero; };
|
||||||
iVector() {};
|
iVector() {};
|
||||||
iVector<vtype,N> & operator= (Zero &hero){
|
iVector<vtype,N> & operator= (Zero &hero){
|
||||||
@ -57,6 +129,21 @@ public:
|
|||||||
zeroit(that._internal[i]);
|
zeroit(that._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
friend void permute(iVector<vtype,N> &out,iVector<vtype,N> &in,int permutetype){
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
permute(out._internal[i],in._internal[i],permutetype);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
friend void extract(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
extract(in._internal[i],out);// extract advances pointers in out
|
||||||
|
}
|
||||||
|
}
|
||||||
|
friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
merge(in._internal[i],out);// extract advances pointers in out
|
||||||
|
}
|
||||||
|
}
|
||||||
// Unary negation
|
// Unary negation
|
||||||
friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
|
friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
|
||||||
iVector<vtype,N> ret;
|
iVector<vtype,N> ret;
|
||||||
@ -84,6 +171,10 @@ template<class vtype,int N> class iMatrix
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SIMDalign vtype _internal[N][N];
|
SIMDalign vtype _internal[N][N];
|
||||||
|
|
||||||
|
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||||
|
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||||
|
|
||||||
iMatrix(Zero &z){ *this = zero; };
|
iMatrix(Zero &z){ *this = zero; };
|
||||||
iMatrix() {};
|
iMatrix() {};
|
||||||
iMatrix<vtype,N> & operator= (Zero &hero){
|
iMatrix<vtype,N> & operator= (Zero &hero){
|
||||||
@ -96,6 +187,24 @@ public:
|
|||||||
zeroit(that._internal[i][j]);
|
zeroit(that._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
friend void permute(iMatrix<vtype,N> &out,iMatrix<vtype,N> &in,int permutetype){
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
for(int j=0;j<N;j++){
|
||||||
|
permute(out._internal[i][j],in._internal[i][j],permutetype);
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
friend void extract(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
for(int j=0;j<N;j++){
|
||||||
|
extract(in._internal[i][j],out);// extract advances pointers in out
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
for(int j=0;j<N;j++){
|
||||||
|
merge(in._internal[i][j],out);// extract advances pointers in out
|
||||||
|
}}
|
||||||
|
}
|
||||||
// Unary negation
|
// Unary negation
|
||||||
friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
|
friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
|
||||||
iMatrix<vtype,N> ret;
|
iMatrix<vtype,N> ret;
|
||||||
|
85
Grid_mpi.cc
Normal file
85
Grid_mpi.cc
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
#include "Grid.h"
|
||||||
|
#include <mpi.h>
|
||||||
|
|
||||||
|
namespace dpo {
|
||||||
|
|
||||||
|
// Should error check all MPI calls.
|
||||||
|
|
||||||
|
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
||||||
|
{
|
||||||
|
_ndimension = _processors.size();
|
||||||
|
std::vector<int> periodic(_ndimension,1);
|
||||||
|
|
||||||
|
_processors = processors;
|
||||||
|
_processor_coords.resize(_ndimension);
|
||||||
|
|
||||||
|
MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||||
|
MPI_Comm_rank(communicator,&_processor);
|
||||||
|
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::GlobalSumF(float &f){
|
||||||
|
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumFVector(float *f,int N)
|
||||||
|
{
|
||||||
|
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumF(double &d)
|
||||||
|
{
|
||||||
|
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumFVector(double *d,int N)
|
||||||
|
{
|
||||||
|
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
}
|
||||||
|
|
||||||
|
int CartesianCommunicator::ShiftedRank(int dim,int shift)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
int CartesianCommunicator::Rank(std::vector<int> coor)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
std::vector<int> &to_coordinate,
|
||||||
|
void *recv,
|
||||||
|
std::vector<int> &from_coordinate,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
int dest = Rank(to_coordinate);
|
||||||
|
int from = Rank(from_coordinate);
|
||||||
|
|
||||||
|
MPI_Request x_req;
|
||||||
|
MPI_Request r_req;
|
||||||
|
MPI_Status OkeyDokey;
|
||||||
|
|
||||||
|
MPI_Isend(xmit, bytes, MPI_CHAR,dest,dest,communicator,&x_req);
|
||||||
|
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
|
||||||
|
MPI_Wait(&x_req,&OkeyDokey);
|
||||||
|
MPI_Wait(&r_req,&OkeyDokey);
|
||||||
|
MPI_Barrier();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
// Could possibly do a direct block strided send?
|
||||||
|
int MPI_Type_vector(
|
||||||
|
int count,
|
||||||
|
int blocklength,
|
||||||
|
int stride,
|
||||||
|
MPI_Datatype old_type,
|
||||||
|
MPI_Datatype *newtype_p
|
||||||
|
);
|
||||||
|
|
||||||
|
#endif
|
384
Grid_mpi_cshift.h
Normal file
384
Grid_mpi_cshift.h
Normal file
@ -0,0 +1,384 @@
|
|||||||
|
#ifndef _GRID_MPI_CSHIFT_H_
|
||||||
|
#define _GRID_MPI_CSHIFT_H_
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Q. Split this into seperate sub functions?
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
|
||||||
|
// CshiftCB_comms_splice
|
||||||
|
// CshiftCB_comms
|
||||||
|
// CshiftCB_local
|
||||||
|
// CshiftCB_local_permute
|
||||||
|
|
||||||
|
// Cshift_comms_splice
|
||||||
|
// Cshift_comms
|
||||||
|
// Cshift_local
|
||||||
|
// Cshift_local_permute
|
||||||
|
|
||||||
|
// Broadly I remain annoyed that the iteration is so painful
|
||||||
|
// for red black data layout, when simple block strided descriptors suffice for non-cb.
|
||||||
|
//
|
||||||
|
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
|
||||||
|
//
|
||||||
|
// Must not lose sight that goal is to be able to construct really efficient
|
||||||
|
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||||
|
// additional stencil support.
|
||||||
|
//
|
||||||
|
// Could still do a templated syntax tree and make CSHIFT return lattice vector.
|
||||||
|
//
|
||||||
|
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||||
|
//
|
||||||
|
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||||
|
//
|
||||||
|
// Grid could create a neighbour index table for a given stencil.
|
||||||
|
// Could also implement CovariantCshift.
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////
|
||||||
|
//Non checkerboarded support functions
|
||||||
|
//////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)
|
||||||
|
{
|
||||||
|
const int Nsimd = vector_type::Nsimd();
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
|
||||||
|
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
|
int o = 0; // relative offset to base within plane
|
||||||
|
#pragma omp parallel for collapse(2)
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
|
||||||
|
} else {
|
||||||
|
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||||
|
//
|
||||||
|
//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||||
|
//
|
||||||
|
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
//
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////
|
||||||
|
//Checkerboarded support functions
|
||||||
|
//////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||||
|
//
|
||||||
|
//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||||
|
//
|
||||||
|
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||||
|
|
||||||
|
|
||||||
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
const int Nsimd = vector_type::Nsimd();
|
||||||
|
|
||||||
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
int fd = rhs._grid->_fdimensions[dimension];
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
//int ld = rhs._grid->_ldimensions[dimension];
|
||||||
|
//int gd = rhs._grid->_gdimensions[dimension];
|
||||||
|
|
||||||
|
|
||||||
|
// Map to always positive shift modulo global full dimension.
|
||||||
|
shift = (shift+fd)%fd;
|
||||||
|
|
||||||
|
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
|
||||||
|
// the permute type
|
||||||
|
int simd_layout = rhs._grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = rhs._grid->_processors[dimension] >1 ;
|
||||||
|
int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
|
||||||
|
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
|
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=0;d<dimension;d++){
|
||||||
|
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Logic for non-distributed dimension
|
||||||
|
std::vector<int> comm_offnode(simd_layout);
|
||||||
|
std::vector<int> comm_to (simd_layout);
|
||||||
|
std::vector<int> comm_from (simd_layout);
|
||||||
|
std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane
|
||||||
|
std::vector<int> comm_simd_lane(simd_layout);// simd lane of neigbour plane
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// Move via a fake comms buffer
|
||||||
|
// Simd direction uses an extract/merge pair
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||||
|
int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||||
|
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||||
|
std::vector<scalar_type *> pointers(Nsimd);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if ( permute_dim ) {
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||||
|
int o = 0; // relative offset to base
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
|
||||||
|
} else {
|
||||||
|
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if ( splice_dim ) {
|
||||||
|
|
||||||
|
if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
|
||||||
|
if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
|
||||||
|
|
||||||
|
|
||||||
|
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||||
|
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Extract one orthogonal slice at a time
|
||||||
|
///////////////////////////////////////////
|
||||||
|
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||||
|
|
||||||
|
o = 0; // relative offset to base
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
// base offset for source
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
extract(rhs._odata[so+o+b],pointers);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Work out what to send where
|
||||||
|
///////////////////////////////////////////
|
||||||
|
|
||||||
|
for(int s=0;s<simd_layout;s++) {
|
||||||
|
|
||||||
|
|
||||||
|
// shift to "neighbour" takes us off node
|
||||||
|
// coordinates (rx, simd_lane) of neighbour
|
||||||
|
// how many nodes away is this shift
|
||||||
|
// where we should send to
|
||||||
|
// where we should receive from
|
||||||
|
int shifted_x = x+s*rd+shift;
|
||||||
|
comm_offnode[s] = shifted_x > ld;
|
||||||
|
comm_send_rx[s] = shifted_x%rd; // which slice geton the other node
|
||||||
|
comm_send_simd_lane [s] = shifted_x/rd; // which slice on the other node
|
||||||
|
comm_from[s] = shifted_x/ld;
|
||||||
|
comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
|
||||||
|
comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
// Insert communication phase
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
#if 0
|
||||||
|
} else if (comm_dim ) {
|
||||||
|
|
||||||
|
// Packed gather sequence is clean
|
||||||
|
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];
|
||||||
|
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
||||||
|
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||||
|
|
||||||
|
// off node; communcate slice (ld==rd)
|
||||||
|
if ( x+shift > rd ) {
|
||||||
|
int sb=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
send_buf[sb++]=rhs._odata[so+i];
|
||||||
|
}
|
||||||
|
so+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make a comm_fake them mimics comms in periodic case.
|
||||||
|
|
||||||
|
// scatter face
|
||||||
|
int rb=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
ret._odata[so+i]=recv_buf[rb++];
|
||||||
|
}
|
||||||
|
so+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
ret._odata[o+i]=rhs._odata[so+i];
|
||||||
|
}
|
||||||
|
o+=rhs._grid->_slice_stride[dimension];
|
||||||
|
so+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
// Pull receive buffers and permuted buffers in
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||||
|
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
o = 0; // relative offset to base
|
||||||
|
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
// base offset for source
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
merge(ret._odata[ro+o+b],pointers);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
} else if ( comm_dim ) {
|
||||||
|
|
||||||
|
|
||||||
|
int co; // comm offset
|
||||||
|
int o;
|
||||||
|
|
||||||
|
co=0;
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
o=0;
|
||||||
|
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
// This call in inner loop is annoying but necessary for dimension=0
|
||||||
|
// in the case of RedBlack grids. Could optimise away with
|
||||||
|
// alternate code paths for all other cases.
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
comm_buf[co++]=rhs._odata[so+o+b];
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step through a copy into a comms buffer and pull back in.
|
||||||
|
// Genuine fake implementation could calculate if loops back
|
||||||
|
co=0; o=0;
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
ret._odata[ro+o+b]=comm_buf[co++];
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else { // Local dimension, no permute required
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
int o=0;
|
||||||
|
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
// This call in inner loop is annoying but necessary for dimension=0
|
||||||
|
// in the case of RedBlack grids. Could optimise away with
|
||||||
|
// alternate code paths for all other cases.
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
313
Grid_none_cshift.h
Normal file
313
Grid_none_cshift.h
Normal file
@ -0,0 +1,313 @@
|
|||||||
|
#ifndef _GRID_NONE_CSHIFT_H_
|
||||||
|
#define _GRID_NONE_CSHIFT_H_
|
||||||
|
// Work out whether to permute
|
||||||
|
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||||
|
//
|
||||||
|
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||||
|
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||||
|
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||||
|
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||||
|
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||||
|
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1
|
||||||
|
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||||
|
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||||
|
|
||||||
|
// Suppose 4way simd in one dim.
|
||||||
|
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||||
|
|
||||||
|
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||||
|
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||||
|
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||||
|
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||||
|
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||||
|
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||||
|
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||||
|
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||||
|
|
||||||
|
// Generalisation to 8 way simd, 16 way simd required.
|
||||||
|
//
|
||||||
|
// Need log2 Nway masks. consisting of
|
||||||
|
// 1 bit 256 bit granule
|
||||||
|
// 2 bit 128 bit granule
|
||||||
|
// 4 bits 64 bit granule
|
||||||
|
// 8 bits 32 bit granules
|
||||||
|
//
|
||||||
|
// 15 bits....
|
||||||
|
|
||||||
|
// For optimisation:
|
||||||
|
//
|
||||||
|
// split into Cshift_none_rb_permute
|
||||||
|
// split into Cshift_none_rb_simple
|
||||||
|
//
|
||||||
|
// split into Cshift_none_permute
|
||||||
|
// split into Cshift_none_simple
|
||||||
|
|
||||||
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
|
{
|
||||||
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
int fd = rhs._grid->_fdimensions[dimension];
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
int ld = rhs._grid->_ldimensions[dimension];
|
||||||
|
int gd = rhs._grid->_gdimensions[dimension];
|
||||||
|
|
||||||
|
|
||||||
|
// Map to always positive shift modulo global full dimension.
|
||||||
|
shift = (shift+fd)%fd;
|
||||||
|
|
||||||
|
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
|
||||||
|
// the permute type
|
||||||
|
|
||||||
|
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=0;d<dimension;d++){
|
||||||
|
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int bo = x*rhs._grid->_ostride[dimension];
|
||||||
|
int o = 0;
|
||||||
|
|
||||||
|
if ( permute_dim ) {
|
||||||
|
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
|
||||||
|
} else {
|
||||||
|
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
// This call in inner loop is annoying but necessary for dimension=0
|
||||||
|
// in the case of RedBlack grids. Could optimise away with
|
||||||
|
// alternate code paths for all other cases.
|
||||||
|
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int so = sx*rhs._grid->_ostride[dimension];
|
||||||
|
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// Collapse doesn't appear to work the way I think it should in icpc
|
||||||
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
|
{
|
||||||
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||||
|
int sx,so,o;
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
int ld = rhs._grid->_dimensions[dimension];
|
||||||
|
// Map to always positive shift.
|
||||||
|
shift = (shift+ld)%ld;
|
||||||
|
// Work out whether to permute and the permute type
|
||||||
|
// ABCDEFGH -> AE BF CG DH permute
|
||||||
|
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||||
|
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||||
|
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||||
|
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||||
|
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||||
|
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||||
|
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||||
|
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||||
|
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=0;d<dimension;d++)
|
||||||
|
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
||||||
|
|
||||||
|
// loop over perp slices.
|
||||||
|
// Threading considerations:
|
||||||
|
// Need to map thread_num to
|
||||||
|
//
|
||||||
|
// x_min,x_max for Loop-A
|
||||||
|
// n_min,n_max for Loop-B
|
||||||
|
// b_min,b_max for Loop-C
|
||||||
|
// In a way that maximally load balances.
|
||||||
|
//
|
||||||
|
// Optimal:
|
||||||
|
// There are rd*n_block*block items of work.
|
||||||
|
// These serialise as item "w"
|
||||||
|
// b=w%block; w=w/block
|
||||||
|
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
|
||||||
|
//
|
||||||
|
// Logic:
|
||||||
|
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
|
||||||
|
//
|
||||||
|
// rd=5 , threads = 8;
|
||||||
|
// 0 1 2 3 4 5 6 7
|
||||||
|
// 0 0 0 1 1 1 1 1
|
||||||
|
for(int x=0;x<rd;x++){ // Loop A
|
||||||
|
sx = (x-shift+ld)%rd;
|
||||||
|
o = x*rhs._grid->_ostride[dimension];
|
||||||
|
so =sx*rhs._grid->_ostride[dimension];
|
||||||
|
int permute_slice=0;
|
||||||
|
if ( permute_dim ) {
|
||||||
|
permute_slice = shift/rd;
|
||||||
|
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||||
|
}
|
||||||
|
#if 0
|
||||||
|
if ( permute_slice ) {
|
||||||
|
|
||||||
|
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||||
|
int num =rhs._grid->_slice_block[dimension]*internal;
|
||||||
|
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
vComplex *optr = (vComplex *)&ret._odata[o];
|
||||||
|
vComplex *iptr = (vComplex *)&rhs._odata[so];
|
||||||
|
for(int b=0;b<num;b++){
|
||||||
|
permute(optr[b],iptr[b],permute_type);
|
||||||
|
}
|
||||||
|
o+=rhs._grid->_slice_stride[dimension];
|
||||||
|
so+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
ret._odata[o+i]=rhs._odata[so+i];
|
||||||
|
}
|
||||||
|
o+=rhs._grid->_slice_stride[dimension];
|
||||||
|
so+=rhs._grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if ( permute_slice ) {
|
||||||
|
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||||
|
int num =rhs._grid->_slice_block[dimension]*internal;
|
||||||
|
#pragma omp parallel for collapse(2)
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<num;b++){
|
||||||
|
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
|
||||||
|
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
|
||||||
|
permute(optr[b],iptr[b],permute_type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#pragma omp parallel for collapse(2)
|
||||||
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||||
|
int oo = o+ n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int soo=so+ n*rhs._grid->_slice_stride[dimension];
|
||||||
|
ret._odata[oo+i]=rhs._odata[soo+i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
|
{
|
||||||
|
Lattice<vobj> ret(rhs._grid);
|
||||||
|
|
||||||
|
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||||
|
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||||
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
int ld = rhs._grid->_dimensions[dimension];
|
||||||
|
|
||||||
|
// Map to always positive shift.
|
||||||
|
shift = (shift+ld)%ld;
|
||||||
|
|
||||||
|
// Work out whether to permute and the permute type
|
||||||
|
// ABCDEFGH -> AE BF CG DH permute
|
||||||
|
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||||
|
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||||
|
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||||
|
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||||
|
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||||
|
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||||
|
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||||
|
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||||
|
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
||||||
|
int permute_type=0;
|
||||||
|
for(int d=0;d<dimension;d++)
|
||||||
|
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
||||||
|
|
||||||
|
|
||||||
|
// loop over all work
|
||||||
|
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(int ww=0;ww<work;ww++){
|
||||||
|
|
||||||
|
|
||||||
|
// can optimise this if know w moves congtiguously for a given thread.
|
||||||
|
// b=(b+1);
|
||||||
|
// if (b==_slice_block) {b=0; n=n+1;}
|
||||||
|
// if (n==_slice_nblock) { n=0; x=x+1}
|
||||||
|
//
|
||||||
|
// Perhaps a five cycle iterator, or so.
|
||||||
|
int w=ww;
|
||||||
|
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
|
||||||
|
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
|
||||||
|
int x = w;
|
||||||
|
|
||||||
|
int sx,so,o;
|
||||||
|
sx = (x-shift+ld)%rd;
|
||||||
|
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
|
||||||
|
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
if ( permute_dim ) {
|
||||||
|
permute_slice = shift/rd;
|
||||||
|
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( permute_slice ) {
|
||||||
|
|
||||||
|
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||||
|
vComplex *optr = (vComplex *)&ret._odata[o+b];
|
||||||
|
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
|
||||||
|
const char *pf = (const char *)iptr;
|
||||||
|
for(int i=0;i<sizeof(vobj);i+=64){
|
||||||
|
_mm_prefetch(pf+i,_MM_HINT_T0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<internal;i++){
|
||||||
|
permute(optr[i],iptr[i],permute_type);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const char *pf = (const char *) &rhs._odata[so+b];
|
||||||
|
for(int i=0;i<sizeof(vobj);i+=64){
|
||||||
|
_mm_prefetch(pf+i,_MM_HINT_T0);
|
||||||
|
}
|
||||||
|
ret._odata[o+b]=rhs._odata[so+b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
@ -47,7 +47,7 @@ namespace dpo {
|
|||||||
typedef std::complex<RealF> ComplexF;
|
typedef std::complex<RealF> ComplexF;
|
||||||
typedef std::complex<RealD> ComplexD;
|
typedef std::complex<RealD> ComplexD;
|
||||||
typedef std::complex<Real> Complex;
|
typedef std::complex<Real> Complex;
|
||||||
|
|
||||||
|
|
||||||
inline RealF adj(const RealF & r){ return r; }
|
inline RealF adj(const RealF & r){ return r; }
|
||||||
inline RealF conj(const RealF & r){ return r; }
|
inline RealF conj(const RealF & r){ return r; }
|
||||||
@ -130,7 +130,7 @@ namespace dpo {
|
|||||||
#if defined (AVX1) || defined (AVX2)
|
#if defined (AVX1) || defined (AVX2)
|
||||||
typedef __m256 fvec;
|
typedef __m256 fvec;
|
||||||
typedef __m256d dvec;
|
typedef __m256d dvec;
|
||||||
typedef __m256 cvec;
|
typedef __m256 cvec;
|
||||||
typedef __m256d zvec;
|
typedef __m256d zvec;
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX512)
|
#if defined (AVX512)
|
||||||
|
@ -8,6 +8,9 @@ namespace dpo {
|
|||||||
protected:
|
protected:
|
||||||
zvec v;
|
zvec v;
|
||||||
public:
|
public:
|
||||||
|
typedef zvec vector_type;
|
||||||
|
typedef ComplexD scalar_type;
|
||||||
|
|
||||||
vComplexD & operator = ( Zero & z){
|
vComplexD & operator = ( Zero & z){
|
||||||
vzero(*this);
|
vzero(*this);
|
||||||
return (*this);
|
return (*this);
|
||||||
@ -145,14 +148,43 @@ namespace dpo {
|
|||||||
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
|
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
|
||||||
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
|
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
|
||||||
/* Imag OK */
|
/* Imag OK */
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
ret.v = vec_mul(a.v,b.v);
|
ret.v = vec_mul(a.v,b.v);
|
||||||
#endif
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Extract
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vComplexD::Nsimd();
|
||||||
|
std::vector<ComplexD> buf(Nsimd);
|
||||||
|
|
||||||
|
vstore(y,&buf[0]);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
*extracted[i] = buf[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vComplexD::Nsimd();
|
||||||
|
std::vector<ComplexD> buf(Nsimd);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
buf[i]=*extracted[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
vset(y,&buf[0]);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// Permute
|
// Permute
|
||||||
@ -165,7 +197,7 @@ namespace dpo {
|
|||||||
// AB => BA i.e. ab cd =>cd ab
|
// AB => BA i.e. ab cd =>cd ab
|
||||||
#endif
|
#endif
|
||||||
#ifdef SSE2
|
#ifdef SSE2
|
||||||
//No cases here
|
break;
|
||||||
#endif
|
#endif
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
// 4 complex=>2 permute
|
// 4 complex=>2 permute
|
||||||
@ -205,7 +237,7 @@ namespace dpo {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
friend inline void vset(vComplexD &ret, std::complex<double> *a){
|
friend inline void vset(vComplexD &ret,ComplexD *a){
|
||||||
#if defined (AVX1)|| defined (AVX2)
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||||
#endif
|
#endif
|
||||||
@ -221,7 +253,7 @@ namespace dpo {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
friend inline void vstore(vComplexD &ret, std::complex<double> *a){
|
friend inline void vstore(vComplexD &ret, ComplexD *a){
|
||||||
#if defined (AVX1)|| defined (AVX2)
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
_mm256_store_pd((double *)a,ret.v);
|
_mm256_store_pd((double *)a,ret.v);
|
||||||
#endif
|
#endif
|
||||||
@ -273,10 +305,10 @@ friend inline void vstore(vComplexD &ret, std::complex<double> *a){
|
|||||||
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
|
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
|
||||||
__attribute__ ((aligned(32))) double c_[4];
|
__attribute__ ((aligned(32))) double c_[4];
|
||||||
_mm256_store_pd(c_,in.v);
|
_mm256_store_pd(c_,in.v);
|
||||||
return std::complex<double>(c_[0]+c_[2],c_[1]+c_[3]);
|
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
|
||||||
#endif
|
#endif
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
return std::complex<double>(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
|
return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
#endif
|
#endif
|
||||||
|
@ -8,6 +8,11 @@ namespace dpo {
|
|||||||
cvec v;
|
cvec v;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
|
||||||
|
public:
|
||||||
|
typedef cvec vector_type;
|
||||||
|
typedef ComplexF scalar_type;
|
||||||
|
|
||||||
vComplexF & operator = ( Zero & z){
|
vComplexF & operator = ( Zero & z){
|
||||||
vzero(*this);
|
vzero(*this);
|
||||||
return (*this);
|
return (*this);
|
||||||
@ -125,6 +130,39 @@ namespace dpo {
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Extract
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
|
||||||
|
// Bounce off heap is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
vComplexF vbuf;
|
||||||
|
ComplexF *buf = (ComplexF *)&vbuf;
|
||||||
|
|
||||||
|
vstore(y,&buf[0]);
|
||||||
|
for(int i=0;i<vComplexF::Nsimd();i++){
|
||||||
|
*extracted[i] = buf[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vComplexF::Nsimd();
|
||||||
|
vComplexF vbuf;
|
||||||
|
ComplexF *buf = (ComplexF *)&vbuf;
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
buf[i]=*extracted[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
vset(y,&buf[0]);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// Permute
|
// Permute
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
@ -185,7 +223,7 @@ namespace dpo {
|
|||||||
vsplat(ret,a,b);
|
vsplat(ret,a,b);
|
||||||
}
|
}
|
||||||
|
|
||||||
friend inline void vstore(vComplexF &ret, std::complex<float> *a){
|
friend inline void vstore(vComplexF &ret, ComplexF *a){
|
||||||
#if defined (AVX1)|| defined (AVX2)
|
#if defined (AVX1)|| defined (AVX2)
|
||||||
_mm256_store_ps((float *)a,ret.v);
|
_mm256_store_ps((float *)a,ret.v);
|
||||||
#endif
|
#endif
|
||||||
@ -225,11 +263,11 @@ exit(-1);
|
|||||||
#if defined (AVX1) || defined(AVX2)
|
#if defined (AVX1) || defined(AVX2)
|
||||||
__attribute__ ((aligned(32))) float c_[8];
|
__attribute__ ((aligned(32))) float c_[8];
|
||||||
_mm256_store_ps(c_,in.v);
|
_mm256_store_ps(c_,in.v);
|
||||||
return std::complex<float>(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
|
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
return std::complex<float>(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
|
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
#endif
|
#endif
|
||||||
@ -321,9 +359,6 @@ exit(-1);
|
|||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public:
|
|
||||||
static int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
||||||
|
@ -7,7 +7,11 @@ namespace dpo{
|
|||||||
class vRealD {
|
class vRealD {
|
||||||
protected:
|
protected:
|
||||||
dvec v; // dvec is double precision vector
|
dvec v; // dvec is double precision vector
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
typedef dvec vector_type;
|
||||||
|
typedef RealD scalar_type;
|
||||||
|
|
||||||
vRealD(){};
|
vRealD(){};
|
||||||
|
|
||||||
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
|
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
|
||||||
@ -94,6 +98,37 @@ namespace dpo{
|
|||||||
#endif
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Extract
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vRealD::Nsimd();
|
||||||
|
RealD buf[Nsimd];
|
||||||
|
|
||||||
|
vstore(y,buf);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
*extracted[i] = buf[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vRealD::Nsimd();
|
||||||
|
RealD buf[Nsimd];
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
buf[i]=*extracted[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
vset(y,buf);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
// Permute plans
|
// Permute plans
|
||||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||||
|
@ -7,7 +7,12 @@ namespace dpo {
|
|||||||
class vRealF {
|
class vRealF {
|
||||||
protected:
|
protected:
|
||||||
fvec v;
|
fvec v;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
typedef fvec vector_type;
|
||||||
|
typedef RealF scalar_type;
|
||||||
|
|
||||||
vRealF(){};
|
vRealF(){};
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
// Arithmetic operator overloads +,-,*
|
// Arithmetic operator overloads +,-,*
|
||||||
@ -113,6 +118,37 @@ namespace dpo {
|
|||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
|
friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
|
||||||
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
|
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Extract
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vRealF::Nsimd();
|
||||||
|
RealF buf[Nsimd];
|
||||||
|
|
||||||
|
vstore(y,buf);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
*extracted[i] = buf[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
|
||||||
|
// Bounce off stack is painful
|
||||||
|
// temporary hack while I figure out the right interface
|
||||||
|
const int Nsimd = vRealF::Nsimd();
|
||||||
|
RealF buf[Nsimd];
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
buf[i]=*extracted[i];
|
||||||
|
extracted[i]++;
|
||||||
|
}
|
||||||
|
vset(y,buf);
|
||||||
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
// Permute
|
// Permute
|
||||||
@ -256,7 +292,7 @@ friend inline void vstore(vRealF &ret, float *a){
|
|||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
static int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
||||||
};
|
};
|
||||||
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
|
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
|
||||||
inline void zeroit(vRealF &z){ vzero(z);}
|
inline void zeroit(vRealF &z){ vzero(z);}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user