mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Fixing the Checkerboarding cshift.
Implemented "fake" communications in preparation for the leap to MPI.
This commit is contained in:
parent
613706dca2
commit
98f14f1030
239
Grid/Grid.xcodeproj/project.pbxproj
Normal file
239
Grid/Grid.xcodeproj/project.pbxproj
Normal file
@ -0,0 +1,239 @@
|
||||
// !$*UTF8*$!
|
||||
{
|
||||
archiveVersion = 1;
|
||||
classes = {
|
||||
};
|
||||
objectVersion = 46;
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
0BF5DA461AC370840045EA34 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0BF5DA451AC370840045EA34 /* main.cpp */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXCopyFilesBuildPhase section */
|
||||
0BF5DA401AC370840045EA34 /* CopyFiles */ = {
|
||||
isa = PBXCopyFilesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
dstPath = /usr/share/man/man1/;
|
||||
dstSubfolderSpec = 0;
|
||||
files = (
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 1;
|
||||
};
|
||||
/* End PBXCopyFilesBuildPhase section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
0BF5DA421AC370840045EA34 /* Grid */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Grid; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
0BF5DA451AC370840045EA34 /* main.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
0BF5DA3F1AC370840045EA34 /* Frameworks */ = {
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXFrameworksBuildPhase section */
|
||||
|
||||
/* Begin PBXGroup section */
|
||||
0BF5DA391AC370840045EA34 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
0BF5DA441AC370840045EA34 /* Grid */,
|
||||
0BF5DA431AC370840045EA34 /* Products */,
|
||||
);
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
0BF5DA431AC370840045EA34 /* Products */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
0BF5DA421AC370840045EA34 /* Grid */,
|
||||
);
|
||||
name = Products;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
0BF5DA441AC370840045EA34 /* Grid */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
0BF5DA451AC370840045EA34 /* main.cpp */,
|
||||
);
|
||||
path = Grid;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
/* End PBXGroup section */
|
||||
|
||||
/* Begin PBXNativeTarget section */
|
||||
0BF5DA411AC370840045EA34 /* Grid */ = {
|
||||
isa = PBXNativeTarget;
|
||||
buildConfigurationList = 0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */;
|
||||
buildPhases = (
|
||||
0BF5DA3E1AC370840045EA34 /* Sources */,
|
||||
0BF5DA3F1AC370840045EA34 /* Frameworks */,
|
||||
0BF5DA401AC370840045EA34 /* CopyFiles */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
dependencies = (
|
||||
);
|
||||
name = Grid;
|
||||
productName = Grid;
|
||||
productReference = 0BF5DA421AC370840045EA34 /* Grid */;
|
||||
productType = "com.apple.product-type.tool";
|
||||
};
|
||||
/* End PBXNativeTarget section */
|
||||
|
||||
/* Begin PBXProject section */
|
||||
0BF5DA3A1AC370840045EA34 /* Project object */ = {
|
||||
isa = PBXProject;
|
||||
attributes = {
|
||||
LastUpgradeCheck = 0610;
|
||||
ORGANIZATIONNAME = "University of Edinburgh";
|
||||
TargetAttributes = {
|
||||
0BF5DA411AC370840045EA34 = {
|
||||
CreatedOnToolsVersion = 6.1.1;
|
||||
};
|
||||
};
|
||||
};
|
||||
buildConfigurationList = 0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */;
|
||||
compatibilityVersion = "Xcode 3.2";
|
||||
developmentRegion = English;
|
||||
hasScannedForEncodings = 0;
|
||||
knownRegions = (
|
||||
en,
|
||||
);
|
||||
mainGroup = 0BF5DA391AC370840045EA34;
|
||||
productRefGroup = 0BF5DA431AC370840045EA34 /* Products */;
|
||||
projectDirPath = "";
|
||||
projectRoot = "";
|
||||
targets = (
|
||||
0BF5DA411AC370840045EA34 /* Grid */,
|
||||
);
|
||||
};
|
||||
/* End PBXProject section */
|
||||
|
||||
/* Begin PBXSourcesBuildPhase section */
|
||||
0BF5DA3E1AC370840045EA34 /* Sources */ = {
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
0BF5DA461AC370840045EA34 /* main.cpp in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXSourcesBuildPhase section */
|
||||
|
||||
/* Begin XCBuildConfiguration section */
|
||||
0BF5DA471AC370840045EA34 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
||||
CLANG_CXX_LIBRARY = "libc++";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu99;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_SYMBOLS_PRIVATE_EXTERN = NO;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.10;
|
||||
MTL_ENABLE_DEBUG_INFO = YES;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
SDKROOT = macosx;
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
0BF5DA481AC370840045EA34 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
|
||||
CLANG_CXX_LIBRARY = "libc++";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = YES;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu99;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.10;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
SDKROOT = macosx;
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
0BF5DA4A1AC370840045EA34 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
0BF5DA4B1AC370840045EA34 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
/* End XCBuildConfiguration section */
|
||||
|
||||
/* Begin XCConfigurationList section */
|
||||
0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
0BF5DA471AC370840045EA34 /* Debug */,
|
||||
0BF5DA481AC370840045EA34 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
0BF5DA4A1AC370840045EA34 /* Debug */,
|
||||
0BF5DA4B1AC370840045EA34 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
};
|
||||
rootObject = 0BF5DA3A1AC370840045EA34 /* Project object */;
|
||||
}
|
7
Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
7
Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Workspace
|
||||
version = "1.0">
|
||||
<FileRef
|
||||
location = "self:Grid.xcodeproj">
|
||||
</FileRef>
|
||||
</Workspace>
|
BIN
Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate
generated
Normal file
BIN
Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate
generated
Normal file
Binary file not shown.
@ -0,0 +1,43 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Scheme
|
||||
LastUpgradeVersion = "0610"
|
||||
version = "1.3">
|
||||
<BuildAction
|
||||
parallelizeBuildables = "YES"
|
||||
buildImplicitDependencies = "YES">
|
||||
</BuildAction>
|
||||
<TestAction
|
||||
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
|
||||
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
|
||||
shouldUseLaunchSchemeArgsEnv = "YES"
|
||||
buildConfiguration = "Debug">
|
||||
<Testables>
|
||||
</Testables>
|
||||
</TestAction>
|
||||
<LaunchAction
|
||||
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
|
||||
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
|
||||
launchStyle = "0"
|
||||
useCustomWorkingDirectory = "NO"
|
||||
buildConfiguration = "Debug"
|
||||
ignoresPersistentStateOnLaunch = "NO"
|
||||
debugDocumentVersioning = "YES"
|
||||
allowLocationSimulation = "YES">
|
||||
<AdditionalOptions>
|
||||
</AdditionalOptions>
|
||||
</LaunchAction>
|
||||
<ProfileAction
|
||||
shouldUseLaunchSchemeArgsEnv = "YES"
|
||||
savedToolIdentifier = ""
|
||||
useCustomWorkingDirectory = "NO"
|
||||
buildConfiguration = "Release"
|
||||
debugDocumentVersioning = "YES">
|
||||
</ProfileAction>
|
||||
<AnalyzeAction
|
||||
buildConfiguration = "Debug">
|
||||
</AnalyzeAction>
|
||||
<ArchiveAction
|
||||
buildConfiguration = "Release"
|
||||
revealArchiveInOrganizer = "YES">
|
||||
</ArchiveAction>
|
||||
</Scheme>
|
@ -0,0 +1,22 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>SchemeUserState</key>
|
||||
<dict>
|
||||
<key>Grid.xcscheme</key>
|
||||
<dict>
|
||||
<key>orderHint</key>
|
||||
<integer>0</integer>
|
||||
</dict>
|
||||
</dict>
|
||||
<key>SuppressBuildableAutocreation</key>
|
||||
<dict>
|
||||
<key>0BF5DA411AC370840045EA34</key>
|
||||
<dict>
|
||||
<key>primary</key>
|
||||
<true/>
|
||||
</dict>
|
||||
</dict>
|
||||
</dict>
|
||||
</plist>
|
184
Grid_Cartesian.h
184
Grid_Cartesian.h
@ -1,10 +1,9 @@
|
||||
#ifndef GRID_CARTESIAN_H
|
||||
#define GRID_CARTESIAN_H
|
||||
|
||||
#include "Grid.h"
|
||||
|
||||
#include <Grid.h>
|
||||
#include <Grid_Communicator.h>
|
||||
namespace dpo{
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Grid Support. Following will go into Grid.h.
|
||||
@ -13,28 +12,47 @@ namespace dpo{
|
||||
// dpo::Grid
|
||||
// dpo::GridCartesian
|
||||
// dpo::GridCartesianRedBlack
|
||||
|
||||
class Grid {
|
||||
|
||||
|
||||
class SimdGrid : public CartesianCommunicator {
|
||||
public:
|
||||
|
||||
|
||||
SimdGrid(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
||||
|
||||
// Give Lattice access
|
||||
template<class object> friend class Lattice;
|
||||
|
||||
|
||||
//protected:
|
||||
|
||||
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
||||
// and one generator per site.
|
||||
//std::default_random_engine generator;
|
||||
// Lattice wide random support. not yet fully implemented. Need seed strategy
|
||||
// and one generator per site.
|
||||
//std::default_random_engine generator;
|
||||
// static std::mt19937 generator( 9 );
|
||||
|
||||
|
||||
// Grid information.
|
||||
unsigned long _ndimension;
|
||||
std::vector<int> _layout; // Which dimensions get relayed out over simd lanes.
|
||||
std::vector<int> _dimensions; // Dimensions of array
|
||||
std::vector<int> _rdimensions;// Reduced dimensions with simd lane images removed
|
||||
|
||||
// Commicator provides
|
||||
// unsigned long _ndimension;
|
||||
// std::vector<int> _processors; // processor grid
|
||||
// int _processor; // linear processor rank
|
||||
// std::vector<int> _processor_coor; // linear processor rank
|
||||
|
||||
std::vector<int> _simd_layout; // Which dimensions get relayed out over simd lanes.
|
||||
|
||||
|
||||
std::vector<int> _fdimensions;// Global dimensions of array with cb removed
|
||||
std::vector<int> _gdimensions;// Global dimensions of array
|
||||
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
|
||||
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
|
||||
|
||||
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
|
||||
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
||||
|
||||
|
||||
std::vector<int> _ostride; // Outer stride for each dimension
|
||||
std::vector<int> _istride; // Inner stride i.e. within simd lane
|
||||
|
||||
int _osites; // _isites*_osites = product(dimensions).
|
||||
int _isites;
|
||||
|
||||
@ -70,15 +88,33 @@ public:
|
||||
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
inline int oSites(void) { return _osites; };
|
||||
inline int iSites(void) { return _isites; };
|
||||
|
||||
inline int CheckerboardFromOsite (int Osite){
|
||||
std::vector<int> ocoor;
|
||||
CoordFromOsite(ocoor,Osite);
|
||||
int ss=0;
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
ss=ss+ocoor[d];
|
||||
}
|
||||
return ss&0x1;
|
||||
}
|
||||
inline void CoordFromOsite (std::vector<int>& coor,int Osite){
|
||||
coor.resize(_ndimension);
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
coor[d] = Osite % _rdimensions[d];
|
||||
Osite = Osite / _rdimensions[d];
|
||||
}
|
||||
}
|
||||
|
||||
virtual int CheckerBoard(std::vector<int> site)=0;
|
||||
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
|
||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0;
|
||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
||||
};
|
||||
|
||||
class GridCartesian: public Grid {
|
||||
class GridCartesian: public SimdGrid {
|
||||
public:
|
||||
virtual int CheckerBoard(std::vector<int> site){
|
||||
return 0;
|
||||
@ -86,19 +122,24 @@ public:
|
||||
virtual int CheckerBoardDestination(int cb,int shift){
|
||||
return 0;
|
||||
}
|
||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift){
|
||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
|
||||
return shift;
|
||||
}
|
||||
GridCartesian(std::vector<int> &dimensions,std::vector<int> layout)
|
||||
GridCartesian(std::vector<int> &dimensions,
|
||||
std::vector<int> &simd_layout,
|
||||
std::vector<int> &processor_grid
|
||||
) : SimdGrid(processor_grid)
|
||||
{
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
_ndimension = dimensions.size();
|
||||
|
||||
_dimensions.resize(_ndimension);
|
||||
_fdimensions.resize(_ndimension);
|
||||
_gdimensions.resize(_ndimension);
|
||||
_ldimensions.resize(_ndimension);
|
||||
_rdimensions.resize(_ndimension);
|
||||
_layout.resize(_ndimension);
|
||||
_simd_layout.resize(_ndimension);
|
||||
|
||||
_ostride.resize(_ndimension);
|
||||
_istride.resize(_ndimension);
|
||||
@ -106,21 +147,26 @@ public:
|
||||
_osites = 1;
|
||||
_isites = 1;
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
_dimensions[d] = dimensions[d];
|
||||
_layout[d] = layout[d];
|
||||
// Use a reduced simd grid
|
||||
_rdimensions[d]= _dimensions[d]/_layout[d]; //<-- _layout[d] is zero
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _layout[d];
|
||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
|
||||
//FIXME check for exact division
|
||||
|
||||
// Use a reduced simd grid
|
||||
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
|
||||
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _simd_layout[d];
|
||||
|
||||
// Addressing support
|
||||
if ( d==0 ) {
|
||||
_ostride[d] = 1;
|
||||
_istride[d] = 1;
|
||||
} else {
|
||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||
_istride[d] = _istride[d-1]*_layout[d-1];
|
||||
}
|
||||
// Addressing support
|
||||
if ( d==0 ) {
|
||||
_ostride[d] = 1;
|
||||
_istride[d] = 1;
|
||||
} else {
|
||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
@ -148,46 +194,57 @@ public:
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
// Specialise this for red black grids storing half the data like a chess board.
|
||||
class GridRedBlackCartesian : public Grid
|
||||
class GridRedBlackCartesian : public SimdGrid
|
||||
{
|
||||
public:
|
||||
virtual int CheckerBoard(std::vector<int> site){
|
||||
return site[0]&0x1;
|
||||
return (site[0]+site[1]+site[2]+site[3])&0x1;
|
||||
}
|
||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift){
|
||||
if ( dim == 0 ){
|
||||
int fulldim =2*_dimensions[0];
|
||||
shift = (shift+fulldim)%fulldim;
|
||||
if ( source_cb ) {
|
||||
// Shift 0,1 -> 0
|
||||
return (shift+1)/2;
|
||||
} else {
|
||||
// Shift 0->0, 1 -> 1, 2->1
|
||||
return (shift)/2;
|
||||
}
|
||||
} else {
|
||||
return shift;
|
||||
}
|
||||
|
||||
// Depending on the cb of site, we toggle source cb.
|
||||
// for block #b, element #e = (b, e)
|
||||
// we need
|
||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
|
||||
|
||||
if(dim != 0) return shift;
|
||||
|
||||
int fulldim =_fdimensions[0];
|
||||
shift = (shift+fulldim)%fulldim;
|
||||
|
||||
// Probably faster with table lookup;
|
||||
// or by looping over x,y,z and multiply rather than computing checkerboard.
|
||||
int ocb=CheckerboardFromOsite(osite);
|
||||
|
||||
if ( (source_cb+ocb)&1 ) {
|
||||
return (shift)/2;
|
||||
} else {
|
||||
return (shift+1)/2;
|
||||
}
|
||||
}
|
||||
|
||||
virtual int CheckerBoardDestination(int source_cb,int shift){
|
||||
if ((shift+2*_dimensions[0])&0x1) {
|
||||
if ((shift+_fdimensions[0])&0x1) {
|
||||
return 1-source_cb;
|
||||
} else {
|
||||
return source_cb;
|
||||
}
|
||||
};
|
||||
GridRedBlackCartesian(std::vector<int> &dimensions,std::vector<int> layout)
|
||||
GridRedBlackCartesian(std::vector<int> &dimensions,
|
||||
std::vector<int> &simd_layout,
|
||||
std::vector<int> &processor_grid) : SimdGrid(processor_grid)
|
||||
{
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
_ndimension = dimensions.size();
|
||||
|
||||
_dimensions.resize(_ndimension);
|
||||
_fdimensions.resize(_ndimension);
|
||||
_gdimensions.resize(_ndimension);
|
||||
_ldimensions.resize(_ndimension);
|
||||
_rdimensions.resize(_ndimension);
|
||||
_layout.resize(_ndimension);
|
||||
_simd_layout.resize(_ndimension);
|
||||
|
||||
_ostride.resize(_ndimension);
|
||||
_istride.resize(_ndimension);
|
||||
@ -195,14 +252,17 @@ public:
|
||||
_osites = 1;
|
||||
_isites = 1;
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
_dimensions[d] = dimensions[d];
|
||||
_dimensions[0] = dimensions[0]/2;
|
||||
_layout[d] = layout[d];
|
||||
_fdimensions[d] = dimensions[d];
|
||||
_gdimensions[d] = _fdimensions[d];
|
||||
if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
|
||||
_ldimensions[d] = _gdimensions[d]/_processors[d];
|
||||
|
||||
// Use a reduced simd grid
|
||||
_rdimensions[d]= _dimensions[d]/_layout[d];
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
|
||||
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _layout[d];
|
||||
_isites *= _simd_layout[d];
|
||||
|
||||
// Addressing support
|
||||
if ( d==0 ) {
|
||||
@ -210,7 +270,7 @@ public:
|
||||
_istride[d] = 1;
|
||||
} else {
|
||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||
_istride[d] = _istride[d-1]*_layout[d-1];
|
||||
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
||||
}
|
||||
}
|
||||
|
||||
|
39
Grid_Communicator.h
Normal file
39
Grid_Communicator.h
Normal file
@ -0,0 +1,39 @@
|
||||
#ifndef GRID_COMMUNICATOR_H
|
||||
#define GRID_COMMUNICATOR_H
|
||||
///////////////////////////////////
|
||||
// Processor layout information
|
||||
///////////////////////////////////
|
||||
namespace dpo {
|
||||
class CartesianCommunicator {
|
||||
public:
|
||||
|
||||
// Communicator should know nothing of the physics grid, only processor grid.
|
||||
|
||||
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
|
||||
int _processor; // linear processor rank
|
||||
std::vector<int> _processor_coor; // linear processor coordinate
|
||||
unsigned long _ndimension;
|
||||
|
||||
#ifdef GRID_COMMS_MPI
|
||||
MPI_Comm communicator;
|
||||
#endif
|
||||
|
||||
CartesianCommunicator(std::vector<int> &pdimensions_in);
|
||||
|
||||
int Rank(std::vector<int> coor);
|
||||
void GlobalSumF(float &);
|
||||
void GlobalSumFVector(float *,int N);
|
||||
|
||||
void GlobalSumF(double &);
|
||||
void GlobalSumFVector(double *,int N);
|
||||
|
||||
void SendToRecvFrom(void *xmit,
|
||||
std::vector<int> to_coordinate,
|
||||
void *recv,
|
||||
std::vector<int> from_coordinate,
|
||||
int bytes);
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
299
Grid_Lattice.h
299
Grid_Lattice.h
@ -3,19 +3,31 @@
|
||||
|
||||
#include "Grid.h"
|
||||
|
||||
|
||||
|
||||
namespace dpo {
|
||||
|
||||
// Permute the pointers 32bitx16 = 512
|
||||
static int permute_map[4][16] = {
|
||||
{ 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
|
||||
{ 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
|
||||
{ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
|
||||
{ 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
|
||||
};
|
||||
|
||||
|
||||
template<class vobj>
|
||||
class Lattice
|
||||
{
|
||||
public:
|
||||
Grid *_grid;
|
||||
SimdGrid *_grid;
|
||||
int checkerboard;
|
||||
std::vector<vobj,alignedAllocator<vobj> > _odata;
|
||||
|
||||
public:
|
||||
|
||||
Lattice(Grid *grid) : _grid(grid) {
|
||||
|
||||
|
||||
Lattice(SimdGrid *grid) : _grid(grid) {
|
||||
_odata.reserve(_grid->oSites());
|
||||
if ( ((uint64_t)&_odata[0])&0xF) {
|
||||
exit(-1);
|
||||
@ -23,6 +35,19 @@ public:
|
||||
checkerboard=0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef GRID_COMMS_NONE
|
||||
#include <Grid_none_cshift.h>
|
||||
#endif
|
||||
|
||||
#ifdef GRID_COMMS_FAKE
|
||||
#include <Grid_fake_cshift.h>
|
||||
#endif
|
||||
|
||||
#ifdef GRID_COMMS_MPI
|
||||
#include <Grid_mpi_cshift.h>
|
||||
#endif
|
||||
|
||||
|
||||
// overloading dpo::conformable but no conformable in dpo ...?:w
|
||||
template<class obj1,class obj2>
|
||||
@ -69,212 +94,6 @@ public:
|
||||
return ret;
|
||||
};
|
||||
|
||||
#if 0
|
||||
// Collapse doesn't appear to work the way I think it should in icpc
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||
int sx,so,o;
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_dimensions[dimension];
|
||||
|
||||
// Map to always positive shift.
|
||||
shift = (shift+ld)%ld;
|
||||
|
||||
// Work out whether to permute and the permute type
|
||||
// ABCDEFGH -> AE BF CG DH permute
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++)
|
||||
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
||||
|
||||
|
||||
// loop over perp slices.
|
||||
// Threading considerations:
|
||||
// Need to map thread_num to
|
||||
//
|
||||
// x_min,x_max for Loop-A
|
||||
// n_min,n_max for Loop-B
|
||||
// b_min,b_max for Loop-C
|
||||
// In a way that maximally load balances.
|
||||
//
|
||||
// Optimal:
|
||||
// There are rd*n_block*block items of work.
|
||||
// These serialise as item "w"
|
||||
// b=w%block; w=w/block
|
||||
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
|
||||
//
|
||||
// Logic:
|
||||
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
|
||||
//
|
||||
// rd=5 , threads = 8;
|
||||
// 0 1 2 3 4 5 6 7
|
||||
// 0 0 0 1 1 1 1 1
|
||||
for(int x=0;x<rd;x++){ // Loop A
|
||||
sx = (x-shift+ld)%rd;
|
||||
o = x*rhs._grid->_ostride[dimension];
|
||||
so =sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
if ( permute_dim ) {
|
||||
permute_slice = shift/rd;
|
||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
if ( permute_slice ) {
|
||||
|
||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||
int num =rhs._grid->_slice_block[dimension]*internal;
|
||||
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
vComplex *optr = (vComplex *)&ret._odata[o];
|
||||
vComplex *iptr = (vComplex *)&rhs._odata[so];
|
||||
for(int b=0;b<num;b++){
|
||||
permute(optr[b],iptr[b],permute_type);
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
} else {
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[o+i]=rhs._odata[so+i];
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
|
||||
if ( permute_slice ) {
|
||||
|
||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||
int num =rhs._grid->_slice_block[dimension]*internal;
|
||||
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<num;b++){
|
||||
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
|
||||
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
|
||||
permute(optr[b],iptr[b],permute_type);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
int oo = o+ n*rhs._grid->_slice_stride[dimension];
|
||||
int soo=so+ n*rhs._grid->_slice_stride[dimension];
|
||||
ret._odata[oo+i]=rhs._odata[soo+i];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_dimensions[dimension];
|
||||
|
||||
// Map to always positive shift.
|
||||
shift = (shift+ld)%ld;
|
||||
|
||||
// Work out whether to permute and the permute type
|
||||
// ABCDEFGH -> AE BF CG DH permute
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++)
|
||||
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
||||
|
||||
|
||||
// loop over all work
|
||||
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int ww=0;ww<work;ww++){
|
||||
|
||||
|
||||
// can optimise this if know w moves congtiguously for a given thread.
|
||||
// b=(b+1);
|
||||
// if (b==_slice_block) {b=0; n=n+1;}
|
||||
// if (n==_slice_nblock) { n=0; x=x+1}
|
||||
//
|
||||
// Perhaps a five cycle iterator, or so.
|
||||
int w=ww;
|
||||
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
|
||||
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
|
||||
int x = w;
|
||||
|
||||
int sx,so,o;
|
||||
sx = (x-shift+ld)%rd;
|
||||
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
|
||||
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
if ( permute_dim ) {
|
||||
permute_slice = shift/rd;
|
||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||
}
|
||||
|
||||
if ( permute_slice ) {
|
||||
|
||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||
vComplex *optr = (vComplex *)&ret._odata[o+b];
|
||||
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
|
||||
const char *pf = (const char *)iptr;
|
||||
for(int i=0;i<sizeof(vobj);i+=64){
|
||||
_mm_prefetch(pf+i,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
for(int i=0;i<internal;i++){
|
||||
permute(optr[i],iptr[i],permute_type);
|
||||
}
|
||||
} else {
|
||||
const char *pf = (const char *) &rhs._odata[so+b];
|
||||
for(int i=0;i<sizeof(vobj);i+=64){
|
||||
_mm_prefetch(pf+i,_MM_HINT_T0);
|
||||
}
|
||||
ret._odata[o+b]=rhs._odata[so+b];
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class sobj>
|
||||
inline Lattice<vobj> & operator = (const sobj & r){
|
||||
#pragma omp parallel for
|
||||
@ -288,14 +107,15 @@ public:
|
||||
template<class sobj>
|
||||
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
||||
|
||||
if ( l._grid.checkerboard != l._grid->Checkerboard(site)){
|
||||
printf("Poking wrong checkerboard\n");
|
||||
exit(EXIT_FAILURE);
|
||||
if ( l.checkerboard != l._grid->CheckerBoard(site)){
|
||||
printf("Poking wrong checkerboard\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
int o_index = l._grid->oIndex(site);
|
||||
int i_index = l._grid->iIndex(site);
|
||||
|
||||
// BUGGY. This assumes complex real
|
||||
Real *v_ptr = (Real *)&l._odata[o_index];
|
||||
Real *s_ptr = (Real *)&s;
|
||||
v_ptr = v_ptr + 2*i_index;
|
||||
@ -348,6 +168,22 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
friend void lex_sites(Lattice<vobj> &l){
|
||||
|
||||
Real *v_ptr = (Real *)&l._odata[0];
|
||||
size_t o_len = l._grid->oSites();
|
||||
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
||||
size_t vec_len = vRealF::Nsimd();
|
||||
|
||||
for(int i=0;i<o_len;i++){
|
||||
for(int j=0;j<v_len;j++){
|
||||
for(int vv=0;vv<vec_len;vv+=2){
|
||||
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
|
||||
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
|
||||
}
|
||||
}}
|
||||
};
|
||||
|
||||
friend void gaussian(Lattice<vobj> &l){
|
||||
// Zero mean, unit variance.
|
||||
std::normal_distribution<double> distribution(0.0,1.0);
|
||||
@ -427,22 +263,41 @@ public:
|
||||
return ret;
|
||||
};
|
||||
|
||||
|
||||
|
||||
// remove and insert a half checkerboard
|
||||
friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
||||
half.checkerboard = cb;
|
||||
int ssh=0;
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<half._grid->oSites();ss++){
|
||||
half._odata[ss] = full._odata[ss*2+cb];
|
||||
}
|
||||
half.checkerboard = cb;
|
||||
for(int ss=0;ss<full._grid->oSites();ss++){
|
||||
std::vector<int> coor;
|
||||
int cbos;
|
||||
|
||||
full._grid->CoordFromOsite(coor,ss);
|
||||
cbos=half._grid->CheckerBoard(coor);
|
||||
|
||||
if (cbos==cb) {
|
||||
|
||||
half._odata[ssh] = full._odata[ss];
|
||||
ssh++;
|
||||
}
|
||||
}
|
||||
}
|
||||
friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||
int cb = half.checkerboard;
|
||||
int cb = half.checkerboard;
|
||||
int ssh=0;
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<half._grid->oSites();ss++){
|
||||
full._odata[ss*2+cb]=half._odata[ss];
|
||||
}
|
||||
for(int ss=0;ss<full._grid->oSites();ss++){
|
||||
std::vector<int> coor;
|
||||
int cbos;
|
||||
|
||||
full._grid->CoordFromOsite(coor,ss);
|
||||
cbos=half._grid->CheckerBoard(coor);
|
||||
|
||||
if (cbos==cb) {
|
||||
full._odata[ss]=half._odata[ssh];
|
||||
ssh++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}; // class Lattice
|
||||
|
||||
|
@ -5,7 +5,6 @@ namespace QCD {
|
||||
|
||||
static const int Nc=3;
|
||||
static const int Ns=4;
|
||||
|
||||
static const int CbRed =0;
|
||||
static const int CbBlack=1;
|
||||
|
||||
|
@ -10,6 +10,15 @@
|
||||
/* AVX512 */
|
||||
/* #undef AVX512 */
|
||||
|
||||
/* GRID_COMMS_FAKE */
|
||||
#define GRID_COMMS_FAKE 1
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
/* #undef GRID_COMMS_MPI */
|
||||
|
||||
/* GRID_COMMS_NONE */
|
||||
/* #undef GRID_COMMS_NONE */
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#define HAVE_GETTIMEOFDAY 1
|
||||
|
||||
@ -61,6 +70,9 @@
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "grid"
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "1.0"
|
||||
|
||||
|
34
Grid_fake.cc
Normal file
34
Grid_fake.cc
Normal file
@ -0,0 +1,34 @@
|
||||
#include "Grid.h"
|
||||
namespace dpo {
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
||||
{
|
||||
_ndimension = _processors.size();
|
||||
_processor_coor.resize(_ndimension);
|
||||
_processors = processors;
|
||||
|
||||
// Require 1^N processor grid for fake
|
||||
for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);
|
||||
|
||||
_processor = 0;// I am the one. The only one..
|
||||
for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
|
||||
}
|
||||
|
||||
void CartesianCommunicator::GlobalSumF(float &){}
|
||||
void CartesianCommunicator::GlobalSumFVector(float *,int N){}
|
||||
void CartesianCommunicator::GlobalSumF(double &){}
|
||||
void CartesianCommunicator::GlobalSumFVector(double *,int N){}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
std::vector<int> to_coordinate,
|
||||
void *recv,
|
||||
std::vector<int> from_coordinate,
|
||||
int bytes)
|
||||
{
|
||||
exit(-1);
|
||||
bcopy(xmit,recv,bytes);
|
||||
}
|
||||
|
||||
}
|
||||
|
246
Grid_fake_cshift.h
Normal file
246
Grid_fake_cshift.h
Normal file
@ -0,0 +1,246 @@
|
||||
#ifndef _GRID_FAKE_H_
|
||||
#define _GRID_FAKE_H_
|
||||
|
||||
|
||||
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
const int Nsimd = vector_type::Nsimd();
|
||||
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
//int ld = rhs._grid->_ldimensions[dimension];
|
||||
//int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
|
||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Move via a fake comms buffer
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<scalar_type *> pointers(Nsimd);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
||||
}
|
||||
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
|
||||
if ( permute_dim ) {
|
||||
|
||||
int o = 0; // relative offset to base
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// base offset for source
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
} else {
|
||||
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
||||
}
|
||||
|
||||
o = 0; // relative offset to base
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// base offset for source
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
merge(ret._odata[ro+o+b],pointers);
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int co; // comm offset
|
||||
int o;
|
||||
|
||||
co=0; o=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
comm_buf[co++]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
// Step through a copy into a comms buffer and pull back in.
|
||||
// Genuine fake implementation could calculate if loops back
|
||||
co=0; o=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
ret._odata[ro+o+b]=comm_buf[co++];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_ldimensions[dimension];
|
||||
int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift.
|
||||
shift = (shift+gd)%gd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||
|
||||
// Work out whether to permute and the permute type
|
||||
// ABCDEFGH -> AE BF CG DH permute
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||
|
||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++)
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
|
||||
|
||||
// loop over all work
|
||||
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
|
||||
// Packed gather sequence is clean
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
|
||||
typedef typename vobj::scalar_type scalar_t;
|
||||
typedef typename vobj::vector_type vector_t;
|
||||
const int ns=sizeof(vobj)/sizeof(scalar_t);
|
||||
const int nv=sizeof(vobj)/sizeof(vector_t);
|
||||
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+shift)%rd;
|
||||
int o = x*rhs._grid->_ostride[dimension];
|
||||
int so =sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
|
||||
int permute_slice=0;
|
||||
if ( permute_dim ) {
|
||||
permute_slice = shift/rd;
|
||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||
}
|
||||
|
||||
if ( permute_slice ) {
|
||||
exit(0);
|
||||
// For fake communication ALWAYS extract and either merge one way or other
|
||||
scalar_t * bptr = (scalar_t *) &comm_buf[0];
|
||||
|
||||
int bo=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
|
||||
vector_t *optr = (vector_t *)&ret._odata[o];
|
||||
vector_t *iptr = (vector_t *)&rhs._odata[so];
|
||||
int skew = buffer_size*ns/2;
|
||||
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
for(int n=0;n<nv;n++){// number of simd vecsscalars in a vector
|
||||
extract(iptr[b*nv+n],&bptr[n],skew,permute_type);
|
||||
}
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
// bo+=rhs._grid->_slice_stride[dimension]*ns/2;
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
int bo=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
comm_buf[bo++] =rhs._odata[so+i];
|
||||
}
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
bo=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[o+i]=comm_buf[bo++];
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
*/
|
||||
|
||||
#endif
|
@ -28,13 +28,13 @@ double usecond(void) {
|
||||
|
||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
|
||||
printf("Caught signal %d\n",si->si_signo);
|
||||
printf(" mem address %llx\n",(uint64_t)si->si_addr);
|
||||
printf(" code %d\n",si->si_code);
|
||||
|
||||
#ifdef __X86_64
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
|
||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||
printf(" instruction %llx\n",(uint64_t)sc->rip);
|
||||
|
||||
|
124
Grid_main.cc
124
Grid_main.cc
@ -14,7 +14,7 @@ using namespace dpo::QCD;
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
|
||||
#ifdef KNL
|
||||
#if 0
|
||||
struct sigaction sa,osa;
|
||||
sigemptyset (&sa.sa_mask);
|
||||
sa.sa_sigaction= sa_action;
|
||||
@ -27,6 +27,9 @@ int main (int argc, char ** argv)
|
||||
std::vector<int> latt_size(4);
|
||||
std::vector<int> simd_layout(4);
|
||||
|
||||
std::vector<int> mpi_layout(4);
|
||||
for(int d=0;d<4;d++) mpi_layout[d]=1;
|
||||
|
||||
#ifdef AVX512
|
||||
for(int omp=128;omp<236;omp+=16){
|
||||
#else
|
||||
@ -37,7 +40,7 @@ int main (int argc, char ** argv)
|
||||
omp_set_num_threads(omp);
|
||||
#endif
|
||||
|
||||
for(int lat=16;lat<=20;lat+=4){
|
||||
for(int lat=4;lat<=16;lat+=40){
|
||||
latt_size[0] = lat;
|
||||
latt_size[1] = lat;
|
||||
latt_size[2] = lat;
|
||||
@ -62,11 +65,9 @@ int main (int argc, char ** argv)
|
||||
simd_layout[2] = 1;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
|
||||
|
||||
GridCartesian Fine(latt_size,simd_layout);
|
||||
GridRedBlackCartesian rbFine(latt_size,simd_layout);
|
||||
|
||||
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
|
||||
|
||||
LatticeColourMatrix Foo(&Fine);
|
||||
LatticeColourMatrix Bar(&Fine);
|
||||
@ -152,19 +153,23 @@ int main (int argc, char ** argv)
|
||||
trscMat = trace(scMat); // Trace
|
||||
|
||||
FooBar = Bar;
|
||||
int shift=1;
|
||||
int dir=0;
|
||||
|
||||
/*
|
||||
{
|
||||
std::vector<int> coor(4);
|
||||
for(int d=0;d<4;d++) coor[d] = 0;
|
||||
peekSite(cmat,Foo,coor);
|
||||
Foo = zero;
|
||||
pokeSite(cmat,Foo,coor);
|
||||
}
|
||||
random(Foo);
|
||||
pickCheckerboard(CbRed,rFoo,Foo); // Pick out red or black checkerboards
|
||||
pickCheckerboard(CbBlack,bFoo,Foo);
|
||||
*/
|
||||
lex_sites(Foo);
|
||||
|
||||
|
||||
//setCheckerboard(ShiftedCheck,rFoo);
|
||||
//setCheckerboard(ShiftedCheck,bFoo);
|
||||
|
||||
Shifted = Cshift(Foo,dir,shift); // Shift everything
|
||||
bShifted = Cshift(rFoo,dir,shift); // Shift red
|
||||
rShifted = Cshift(bFoo,dir,shift); // Shift black
|
||||
|
||||
setCheckerboard(ShiftedCheck,bShifted); // Put them all together
|
||||
setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
|
||||
|
||||
// Lattice SU(3) x SU(3)
|
||||
FooBar = Foo * Bar;
|
||||
@ -211,25 +216,69 @@ int main (int argc, char ** argv)
|
||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
||||
printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
||||
|
||||
pickCheckerboard(0,rFoo,FooBar);
|
||||
pickCheckerboard(1,bFoo,FooBar);
|
||||
|
||||
setCheckerboard(FooBar,rFoo);
|
||||
setCheckerboard(FooBar,bFoo);
|
||||
// pickCheckerboard(0,rFoo,FooBar);
|
||||
// pickCheckerboard(1,bFoo,FooBar);
|
||||
// setCheckerboard(FooBar,rFoo);
|
||||
// setCheckerboard(FooBar,bFoo);
|
||||
|
||||
double nrm=0;
|
||||
|
||||
for(int dir=0;dir<4;dir++){
|
||||
for(int shift=0;shift<latt_size[dir];shift++){
|
||||
|
||||
|
||||
|
||||
pickCheckerboard(0,rFoo,Foo); // Pick out red or black checkerboards
|
||||
pickCheckerboard(1,bFoo,Foo);
|
||||
|
||||
std::vector<int> coor(4);
|
||||
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
|
||||
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
|
||||
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
|
||||
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
|
||||
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
|
||||
Shifted = Cshift(Foo,dir,shift); // Shift everything
|
||||
|
||||
std::cout << "Shifting even parities"<<std::endl;
|
||||
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
|
||||
std::cout << "Shifting odd parities"<<std::endl;
|
||||
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
|
||||
|
||||
ShiftedCheck=zero;
|
||||
setCheckerboard(ShiftedCheck,bShifted); // Put them all together
|
||||
setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
|
||||
|
||||
// Check results
|
||||
std::vector<int> coor(4);
|
||||
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
|
||||
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
|
||||
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
|
||||
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
|
||||
|
||||
std::complex<dpo::Real> diff;
|
||||
|
||||
std::vector<int> shiftcoor = coor;
|
||||
shiftcoor[dir]=(shiftcoor[dir]-shift+latt_size[dir])%latt_size[dir];
|
||||
|
||||
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
|
||||
|
||||
std::vector<int> rl(4);
|
||||
for(int dd=0;dd<4;dd++){
|
||||
rl[dd] = latt_size[dd]/simd_layout[dd];
|
||||
}
|
||||
int lex = coor[0]%rl[0]
|
||||
+ (coor[1]%rl[1])*rl[0]
|
||||
+ (coor[2]%rl[2])*rl[0]*rl[1]
|
||||
+ (coor[3]%rl[3])*rl[0]*rl[1]*rl[2];
|
||||
lex +=
|
||||
+1000*(coor[0]/rl[0])
|
||||
+1000*(coor[1]/rl[1])*simd_layout[0]
|
||||
+1000*(coor[2]/rl[2])*simd_layout[0]*simd_layout[1]
|
||||
+1000*(coor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
|
||||
|
||||
int lex_coor = shiftcoor[0]%rl[0]
|
||||
+ (shiftcoor[1]%rl[1])*rl[0]
|
||||
+ (shiftcoor[2]%rl[2])*rl[0]*rl[1]
|
||||
+ (shiftcoor[3]%rl[3])*rl[0]*rl[1]*rl[2];
|
||||
lex_coor +=
|
||||
+1000*(shiftcoor[0]/rl[0])
|
||||
+1000*(shiftcoor[1]/rl[1])*simd_layout[0]
|
||||
+1000*(shiftcoor[2]/rl[2])*simd_layout[0]*simd_layout[1]
|
||||
+1000*(shiftcoor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
|
||||
|
||||
ColourMatrix foo;
|
||||
ColourMatrix bar;
|
||||
ColourMatrix shifted1;
|
||||
@ -242,7 +291,6 @@ int main (int argc, char ** argv)
|
||||
peekSite(shifted1,Shifted,coor);
|
||||
peekSite(shifted2,Foo,shiftcoor);
|
||||
peekSite(shifted3,ShiftedCheck,coor);
|
||||
|
||||
peekSite(foo,Foo,coor);
|
||||
|
||||
mdiff = shifted1-shifted2;
|
||||
@ -258,13 +306,13 @@ int main (int argc, char ** argv)
|
||||
diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c];
|
||||
nn=real(conj(diff)*diff);
|
||||
if ( nn > 0 )
|
||||
cout<<"Shift fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
||||
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||
else if(0)
|
||||
cout<<"Shift pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
cout<<"Shift pass 1vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
<<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
||||
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||
}}
|
||||
|
||||
for(int r=0;r<3;r++){
|
||||
@ -272,13 +320,13 @@ int main (int argc, char ** argv)
|
||||
diff =shifted3._internal._internal[r][c]-shifted2._internal._internal[r][c];
|
||||
nn=real(conj(diff)*diff);
|
||||
if ( nn > 0 )
|
||||
cout<<"Shift rb fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
||||
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||
else if(0)
|
||||
cout<<"Shift rb pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
cout<<"Shift rb pass 3vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
|
||||
<<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
|
||||
<< " "<< foo._internal._internal[r][c]<<endl;
|
||||
<< " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
|
||||
}}
|
||||
peekSite(bar,Bar,coor);
|
||||
|
||||
@ -290,7 +338,7 @@ int main (int argc, char ** argv)
|
||||
nrm = nrm + real(conj(diff)*diff);
|
||||
}}
|
||||
}}}}
|
||||
|
||||
}}
|
||||
std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
|
||||
|
||||
} // loop for lat
|
||||
|
@ -1,6 +1,63 @@
|
||||
#ifndef GRID_MATH_TYPES_H
|
||||
#define GRID_MATH_TYPES_H
|
||||
namespace dpo {
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class T> class GridTypeMapper {
|
||||
public:
|
||||
typedef typename T::scalar_type scalar_type;
|
||||
typedef typename T::vector_type vector_type;
|
||||
};
|
||||
|
||||
template<> class GridTypeMapper<RealF> {
|
||||
public:
|
||||
typedef RealF scalar_type;
|
||||
typedef RealF vector_type;
|
||||
};
|
||||
template<> class GridTypeMapper<RealD> {
|
||||
public:
|
||||
typedef RealD scalar_type;
|
||||
typedef RealD vector_type;
|
||||
};
|
||||
template<> class GridTypeMapper<ComplexF> {
|
||||
public:
|
||||
typedef ComplexF scalar_type;
|
||||
typedef ComplexF vector_type;
|
||||
};
|
||||
template<> class GridTypeMapper<ComplexD> {
|
||||
public:
|
||||
typedef ComplexD scalar_type;
|
||||
typedef ComplexD vector_type;
|
||||
};
|
||||
|
||||
template<> class GridTypeMapper<vRealF> {
|
||||
public:
|
||||
typedef RealF scalar_type;
|
||||
typedef vRealF vector_type;
|
||||
};
|
||||
template<> class GridTypeMapper<vRealD> {
|
||||
public:
|
||||
typedef RealD scalar_type;
|
||||
typedef vRealD vector_type;
|
||||
};
|
||||
template<> class GridTypeMapper<vComplexF> {
|
||||
public:
|
||||
typedef ComplexF scalar_type;
|
||||
typedef vComplexF vector_type;
|
||||
};
|
||||
template<> class GridTypeMapper<vComplexD> {
|
||||
public:
|
||||
typedef ComplexD scalar_type;
|
||||
typedef vComplexD vector_type;
|
||||
};
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Scalar, Vector, Matrix objects.
|
||||
// These can be composed to form tensor products of internal indices.
|
||||
@ -10,6 +67,10 @@ template<class vtype> class iScalar
|
||||
{
|
||||
public:
|
||||
SIMDalign vtype _internal;
|
||||
|
||||
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||
|
||||
iScalar(){};
|
||||
iScalar(Zero &z){ *this = zero; };
|
||||
iScalar<vtype> & operator= (const Zero &hero){
|
||||
@ -19,6 +80,15 @@ public:
|
||||
friend void zeroit(iScalar<vtype> &that){
|
||||
zeroit(that._internal);
|
||||
}
|
||||
friend void permute(iScalar<vtype> &out,iScalar<vtype> &in,int permutetype){
|
||||
permute(out._internal,in._internal,permutetype);
|
||||
}
|
||||
friend void extract(iScalar<vtype> &in,std::vector<scalar_type *> &out){
|
||||
extract(in._internal,out); // extract advances the pointers in out
|
||||
}
|
||||
friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
|
||||
merge(in._internal,out); // extract advances the pointers in out
|
||||
}
|
||||
// Unary negation
|
||||
friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
|
||||
iScalar<vtype> ret;
|
||||
@ -38,14 +108,16 @@ public:
|
||||
*this = (*this)+r;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
template<class vtype,int N> class iVector
|
||||
{
|
||||
public:
|
||||
SIMDalign vtype _internal[N];
|
||||
|
||||
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||
|
||||
iVector(Zero &z){ *this = zero; };
|
||||
iVector() {};
|
||||
iVector<vtype,N> & operator= (Zero &hero){
|
||||
@ -57,6 +129,21 @@ public:
|
||||
zeroit(that._internal[i]);
|
||||
}
|
||||
}
|
||||
friend void permute(iVector<vtype,N> &out,iVector<vtype,N> &in,int permutetype){
|
||||
for(int i=0;i<N;i++){
|
||||
permute(out._internal[i],in._internal[i],permutetype);
|
||||
}
|
||||
}
|
||||
friend void extract(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||
for(int i=0;i<N;i++){
|
||||
extract(in._internal[i],out);// extract advances pointers in out
|
||||
}
|
||||
}
|
||||
friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||
for(int i=0;i<N;i++){
|
||||
merge(in._internal[i],out);// extract advances pointers in out
|
||||
}
|
||||
}
|
||||
// Unary negation
|
||||
friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
|
||||
iVector<vtype,N> ret;
|
||||
@ -84,6 +171,10 @@ template<class vtype,int N> class iMatrix
|
||||
{
|
||||
public:
|
||||
SIMDalign vtype _internal[N][N];
|
||||
|
||||
typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
|
||||
typedef typename GridTypeMapper<vtype>::vector_type vector_type;
|
||||
|
||||
iMatrix(Zero &z){ *this = zero; };
|
||||
iMatrix() {};
|
||||
iMatrix<vtype,N> & operator= (Zero &hero){
|
||||
@ -96,6 +187,24 @@ public:
|
||||
zeroit(that._internal[i][j]);
|
||||
}}
|
||||
}
|
||||
friend void permute(iMatrix<vtype,N> &out,iMatrix<vtype,N> &in,int permutetype){
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
permute(out._internal[i][j],in._internal[i][j],permutetype);
|
||||
}}
|
||||
}
|
||||
friend void extract(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
extract(in._internal[i][j],out);// extract advances pointers in out
|
||||
}}
|
||||
}
|
||||
friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
merge(in._internal[i][j],out);// extract advances pointers in out
|
||||
}}
|
||||
}
|
||||
// Unary negation
|
||||
friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
|
||||
iMatrix<vtype,N> ret;
|
||||
|
85
Grid_mpi.cc
Normal file
85
Grid_mpi.cc
Normal file
@ -0,0 +1,85 @@
|
||||
#include "Grid.h"
|
||||
#include <mpi.h>
|
||||
|
||||
namespace dpo {
|
||||
|
||||
// Should error check all MPI calls.
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
||||
{
|
||||
_ndimension = _processors.size();
|
||||
std::vector<int> periodic(_ndimension,1);
|
||||
|
||||
_processors = processors;
|
||||
_processor_coords.resize(_ndimension);
|
||||
|
||||
MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||
MPI_Comm_rank(communicator,&_processor);
|
||||
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);
|
||||
|
||||
}
|
||||
|
||||
void CartesianCommunicator::GlobalSumF(float &f){
|
||||
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumFVector(float *f,int N)
|
||||
{
|
||||
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumF(double &d)
|
||||
{
|
||||
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumFVector(double *d,int N)
|
||||
{
|
||||
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
}
|
||||
|
||||
int CartesianCommunicator::ShiftedRank(int dim,int shift)
|
||||
{
|
||||
int rank;
|
||||
MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
|
||||
return rank;
|
||||
}
|
||||
int CartesianCommunicator::Rank(std::vector<int> coor)
|
||||
{
|
||||
int rank;
|
||||
MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||
return rank;
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
std::vector<int> &to_coordinate,
|
||||
void *recv,
|
||||
std::vector<int> &from_coordinate,
|
||||
int bytes)
|
||||
{
|
||||
int dest = Rank(to_coordinate);
|
||||
int from = Rank(from_coordinate);
|
||||
|
||||
MPI_Request x_req;
|
||||
MPI_Request r_req;
|
||||
MPI_Status OkeyDokey;
|
||||
|
||||
MPI_Isend(xmit, bytes, MPI_CHAR,dest,dest,communicator,&x_req);
|
||||
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
|
||||
MPI_Wait(&x_req,&OkeyDokey);
|
||||
MPI_Wait(&r_req,&OkeyDokey);
|
||||
MPI_Barrier();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
// Could possibly do a direct block strided send?
|
||||
int MPI_Type_vector(
|
||||
int count,
|
||||
int blocklength,
|
||||
int stride,
|
||||
MPI_Datatype old_type,
|
||||
MPI_Datatype *newtype_p
|
||||
);
|
||||
|
||||
#endif
|
384
Grid_mpi_cshift.h
Normal file
384
Grid_mpi_cshift.h
Normal file
@ -0,0 +1,384 @@
|
||||
#ifndef _GRID_MPI_CSHIFT_H_
|
||||
#define _GRID_MPI_CSHIFT_H_
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Q. Split this into seperate sub functions?
|
||||
//////////////////////////////////////////////
|
||||
|
||||
// CshiftCB_comms_splice
|
||||
// CshiftCB_comms
|
||||
// CshiftCB_local
|
||||
// CshiftCB_local_permute
|
||||
|
||||
// Cshift_comms_splice
|
||||
// Cshift_comms
|
||||
// Cshift_local
|
||||
// Cshift_local_permute
|
||||
|
||||
// Broadly I remain annoyed that the iteration is so painful
|
||||
// for red black data layout, when simple block strided descriptors suffice for non-cb.
|
||||
//
|
||||
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
|
||||
//
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||
// additional stencil support.
|
||||
//
|
||||
// Could still do a templated syntax tree and make CSHIFT return lattice vector.
|
||||
//
|
||||
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||
//
|
||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||
//
|
||||
// Grid could create a neighbour index table for a given stencil.
|
||||
// Could also implement CovariantCshift.
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
//Non checkerboarded support functions
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)
|
||||
{
|
||||
const int Nsimd = vector_type::Nsimd();
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
|
||||
} else {
|
||||
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
//Checkerboarded support functions
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
|
||||
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
const int Nsimd = vector_type::Nsimd();
|
||||
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
//int ld = rhs._grid->_ldimensions[dimension];
|
||||
//int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
int simd_layout = rhs._grid->_simd_layout[dimension];
|
||||
int comm_dim = rhs._grid->_processors[dimension] >1 ;
|
||||
int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
|
||||
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
// Logic for non-distributed dimension
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_to (simd_layout);
|
||||
std::vector<int> comm_from (simd_layout);
|
||||
std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane
|
||||
std::vector<int> comm_simd_lane(simd_layout);// simd lane of neigbour plane
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Move via a fake comms buffer
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<scalar_type *> pointers(Nsimd);
|
||||
|
||||
|
||||
|
||||
if ( permute_dim ) {
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
int o = 0; // relative offset to base
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
|
||||
} else {
|
||||
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
} else if ( splice_dim ) {
|
||||
|
||||
if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
|
||||
if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
|
||||
|
||||
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
||||
}
|
||||
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Extract one orthogonal slice at a time
|
||||
///////////////////////////////////////////
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
|
||||
o = 0; // relative offset to base
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// base offset for source
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
|
||||
|
||||
// shift to "neighbour" takes us off node
|
||||
// coordinates (rx, simd_lane) of neighbour
|
||||
// how many nodes away is this shift
|
||||
// where we should send to
|
||||
// where we should receive from
|
||||
int shifted_x = x+s*rd+shift;
|
||||
comm_offnode[s] = shifted_x > ld;
|
||||
comm_send_rx[s] = shifted_x%rd; // which slice geton the other node
|
||||
comm_send_simd_lane [s] = shifted_x/rd; // which slice on the other node
|
||||
comm_from[s] = shifted_x/ld;
|
||||
comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
|
||||
comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];
|
||||
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Insert communication phase
|
||||
////////////////////////////////////////////////
|
||||
#if 0
|
||||
} else if (comm_dim ) {
|
||||
|
||||
// Packed gather sequence is clean
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];
|
||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||
|
||||
// off node; communcate slice (ld==rd)
|
||||
if ( x+shift > rd ) {
|
||||
int sb=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
send_buf[sb++]=rhs._odata[so+i];
|
||||
}
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
// Make a comm_fake them mimics comms in periodic case.
|
||||
|
||||
// scatter face
|
||||
int rb=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[so+i]=recv_buf[rb++];
|
||||
}
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[o+i]=rhs._odata[so+i];
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Pull receive buffers and permuted buffers in
|
||||
////////////////////////////////////////////////
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
||||
}
|
||||
|
||||
o = 0; // relative offset to base
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// base offset for source
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
merge(ret._odata[ro+o+b],pointers);
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else if ( comm_dim ) {
|
||||
|
||||
|
||||
int co; // comm offset
|
||||
int o;
|
||||
|
||||
co=0;
|
||||
for(int x=0;x<rd;x++){
|
||||
o=0;
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
comm_buf[co++]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
// Step through a copy into a comms buffer and pull back in.
|
||||
// Genuine fake implementation could calculate if loops back
|
||||
co=0; o=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
ret._odata[ro+o+b]=comm_buf[co++];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
} else { // Local dimension, no permute required
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
int o=0;
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
313
Grid_none_cshift.h
Normal file
313
Grid_none_cshift.h
Normal file
@ -0,0 +1,313 @@
|
||||
#ifndef _GRID_NONE_CSHIFT_H_
|
||||
#define _GRID_NONE_CSHIFT_H_
|
||||
// Work out whether to permute
|
||||
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||
//
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1
|
||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||
|
||||
// Suppose 4way simd in one dim.
|
||||
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||
|
||||
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||
|
||||
// Generalisation to 8 way simd, 16 way simd required.
|
||||
//
|
||||
// Need log2 Nway masks. consisting of
|
||||
// 1 bit 256 bit granule
|
||||
// 2 bit 128 bit granule
|
||||
// 4 bits 64 bit granule
|
||||
// 8 bits 32 bit granules
|
||||
//
|
||||
// 15 bits....
|
||||
|
||||
// For optimisation:
|
||||
//
|
||||
// split into Cshift_none_rb_permute
|
||||
// split into Cshift_none_rb_simple
|
||||
//
|
||||
// split into Cshift_none_permute
|
||||
// split into Cshift_none_simple
|
||||
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_ldimensions[dimension];
|
||||
int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
|
||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int bo = x*rhs._grid->_ostride[dimension];
|
||||
int o = 0;
|
||||
|
||||
if ( permute_dim ) {
|
||||
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
|
||||
} else {
|
||||
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
|
||||
}
|
||||
} else {
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Collapse doesn't appear to work the way I think it should in icpc
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||
int sx,so,o;
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_dimensions[dimension];
|
||||
// Map to always positive shift.
|
||||
shift = (shift+ld)%ld;
|
||||
// Work out whether to permute and the permute type
|
||||
// ABCDEFGH -> AE BF CG DH permute
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++)
|
||||
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
||||
|
||||
// loop over perp slices.
|
||||
// Threading considerations:
|
||||
// Need to map thread_num to
|
||||
//
|
||||
// x_min,x_max for Loop-A
|
||||
// n_min,n_max for Loop-B
|
||||
// b_min,b_max for Loop-C
|
||||
// In a way that maximally load balances.
|
||||
//
|
||||
// Optimal:
|
||||
// There are rd*n_block*block items of work.
|
||||
// These serialise as item "w"
|
||||
// b=w%block; w=w/block
|
||||
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
|
||||
//
|
||||
// Logic:
|
||||
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
|
||||
//
|
||||
// rd=5 , threads = 8;
|
||||
// 0 1 2 3 4 5 6 7
|
||||
// 0 0 0 1 1 1 1 1
|
||||
for(int x=0;x<rd;x++){ // Loop A
|
||||
sx = (x-shift+ld)%rd;
|
||||
o = x*rhs._grid->_ostride[dimension];
|
||||
so =sx*rhs._grid->_ostride[dimension];
|
||||
int permute_slice=0;
|
||||
if ( permute_dim ) {
|
||||
permute_slice = shift/rd;
|
||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||
}
|
||||
#if 0
|
||||
if ( permute_slice ) {
|
||||
|
||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||
int num =rhs._grid->_slice_block[dimension]*internal;
|
||||
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
vComplex *optr = (vComplex *)&ret._odata[o];
|
||||
vComplex *iptr = (vComplex *)&rhs._odata[so];
|
||||
for(int b=0;b<num;b++){
|
||||
permute(optr[b],iptr[b],permute_type);
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
} else {
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[o+i]=rhs._odata[so+i];
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
if ( permute_slice ) {
|
||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||
int num =rhs._grid->_slice_block[dimension]*internal;
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<num;b++){
|
||||
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
|
||||
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
|
||||
permute(optr[b],iptr[b],permute_type);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
int oo = o+ n*rhs._grid->_slice_stride[dimension];
|
||||
int soo=so+ n*rhs._grid->_slice_stride[dimension];
|
||||
ret._odata[oo+i]=rhs._odata[soo+i];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_dimensions[dimension];
|
||||
|
||||
// Map to always positive shift.
|
||||
shift = (shift+ld)%ld;
|
||||
|
||||
// Work out whether to permute and the permute type
|
||||
// ABCDEFGH -> AE BF CG DH permute
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
|
||||
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
|
||||
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
|
||||
// Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
|
||||
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
|
||||
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
|
||||
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
|
||||
int permute_dim =rhs._grid->_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++)
|
||||
if (rhs._grid->_layout[d]>1 ) permute_type++;
|
||||
|
||||
|
||||
// loop over all work
|
||||
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int ww=0;ww<work;ww++){
|
||||
|
||||
|
||||
// can optimise this if know w moves congtiguously for a given thread.
|
||||
// b=(b+1);
|
||||
// if (b==_slice_block) {b=0; n=n+1;}
|
||||
// if (n==_slice_nblock) { n=0; x=x+1}
|
||||
//
|
||||
// Perhaps a five cycle iterator, or so.
|
||||
int w=ww;
|
||||
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
|
||||
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
|
||||
int x = w;
|
||||
|
||||
int sx,so,o;
|
||||
sx = (x-shift+ld)%rd;
|
||||
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
|
||||
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
if ( permute_dim ) {
|
||||
permute_slice = shift/rd;
|
||||
if ( x<shift%rd ) permute_slice = 1-permute_slice;
|
||||
}
|
||||
|
||||
if ( permute_slice ) {
|
||||
|
||||
int internal=sizeof(vobj)/sizeof(vComplex);
|
||||
vComplex *optr = (vComplex *)&ret._odata[o+b];
|
||||
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
|
||||
const char *pf = (const char *)iptr;
|
||||
for(int i=0;i<sizeof(vobj);i+=64){
|
||||
_mm_prefetch(pf+i,_MM_HINT_T0);
|
||||
}
|
||||
|
||||
for(int i=0;i<internal;i++){
|
||||
permute(optr[i],iptr[i],permute_type);
|
||||
}
|
||||
} else {
|
||||
const char *pf = (const char *) &rhs._odata[so+b];
|
||||
for(int i=0;i<sizeof(vobj);i+=64){
|
||||
_mm_prefetch(pf+i,_MM_HINT_T0);
|
||||
}
|
||||
ret._odata[o+b]=rhs._odata[so+b];
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
@ -47,7 +47,7 @@ namespace dpo {
|
||||
typedef std::complex<RealF> ComplexF;
|
||||
typedef std::complex<RealD> ComplexD;
|
||||
typedef std::complex<Real> Complex;
|
||||
|
||||
|
||||
|
||||
inline RealF adj(const RealF & r){ return r; }
|
||||
inline RealF conj(const RealF & r){ return r; }
|
||||
@ -130,7 +130,7 @@ namespace dpo {
|
||||
#if defined (AVX1) || defined (AVX2)
|
||||
typedef __m256 fvec;
|
||||
typedef __m256d dvec;
|
||||
typedef __m256 cvec;
|
||||
typedef __m256 cvec;
|
||||
typedef __m256d zvec;
|
||||
#endif
|
||||
#if defined (AVX512)
|
||||
|
@ -8,6 +8,9 @@ namespace dpo {
|
||||
protected:
|
||||
zvec v;
|
||||
public:
|
||||
typedef zvec vector_type;
|
||||
typedef ComplexD scalar_type;
|
||||
|
||||
vComplexD & operator = ( Zero & z){
|
||||
vzero(*this);
|
||||
return (*this);
|
||||
@ -145,14 +148,43 @@ namespace dpo {
|
||||
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
|
||||
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
|
||||
/* Imag OK */
|
||||
|
||||
|
||||
#endif
|
||||
#ifdef QPX
|
||||
ret.v = vec_mul(a.v,b.v);
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vComplexD::Nsimd();
|
||||
std::vector<ComplexD> buf(Nsimd);
|
||||
|
||||
vstore(y,&buf[0]);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
|
||||
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vComplexD::Nsimd();
|
||||
std::vector<ComplexD> buf(Nsimd);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,&buf[0]);
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
@ -165,7 +197,7 @@ namespace dpo {
|
||||
// AB => BA i.e. ab cd =>cd ab
|
||||
#endif
|
||||
#ifdef SSE2
|
||||
//No cases here
|
||||
break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// 4 complex=>2 permute
|
||||
@ -205,7 +237,7 @@ namespace dpo {
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline void vset(vComplexD &ret, std::complex<double> *a){
|
||||
friend inline void vset(vComplexD &ret,ComplexD *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
#endif
|
||||
@ -221,7 +253,7 @@ namespace dpo {
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline void vstore(vComplexD &ret, std::complex<double> *a){
|
||||
friend inline void vstore(vComplexD &ret, ComplexD *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
_mm256_store_pd((double *)a,ret.v);
|
||||
#endif
|
||||
@ -273,10 +305,10 @@ friend inline void vstore(vComplexD &ret, std::complex<double> *a){
|
||||
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
|
||||
__attribute__ ((aligned(32))) double c_[4];
|
||||
_mm256_store_pd(c_,in.v);
|
||||
return std::complex<double>(c_[0]+c_[2],c_[1]+c_[3]);
|
||||
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return std::complex<double>(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
|
||||
return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
|
@ -8,6 +8,11 @@ namespace dpo {
|
||||
cvec v;
|
||||
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
|
||||
public:
|
||||
typedef cvec vector_type;
|
||||
typedef ComplexF scalar_type;
|
||||
|
||||
vComplexF & operator = ( Zero & z){
|
||||
vzero(*this);
|
||||
return (*this);
|
||||
@ -125,6 +130,39 @@ namespace dpo {
|
||||
return ret;
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
|
||||
// Bounce off heap is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
vComplexF vbuf;
|
||||
ComplexF *buf = (ComplexF *)&vbuf;
|
||||
|
||||
vstore(y,&buf[0]);
|
||||
for(int i=0;i<vComplexF::Nsimd();i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vComplexF::Nsimd();
|
||||
vComplexF vbuf;
|
||||
ComplexF *buf = (ComplexF *)&vbuf;
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,&buf[0]);
|
||||
};
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
/////////////////////////////////////////////////////////////////
|
||||
@ -185,7 +223,7 @@ namespace dpo {
|
||||
vsplat(ret,a,b);
|
||||
}
|
||||
|
||||
friend inline void vstore(vComplexF &ret, std::complex<float> *a){
|
||||
friend inline void vstore(vComplexF &ret, ComplexF *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
_mm256_store_ps((float *)a,ret.v);
|
||||
#endif
|
||||
@ -225,11 +263,11 @@ exit(-1);
|
||||
#if defined (AVX1) || defined(AVX2)
|
||||
__attribute__ ((aligned(32))) float c_[8];
|
||||
_mm256_store_ps(c_,in.v);
|
||||
return std::complex<float>(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
|
||||
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
|
||||
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
return std::complex<float>(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
|
||||
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#endif
|
||||
@ -321,9 +359,6 @@ exit(-1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
static int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
|
||||
};
|
||||
|
||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
||||
|
@ -7,7 +7,11 @@ namespace dpo{
|
||||
class vRealD {
|
||||
protected:
|
||||
dvec v; // dvec is double precision vector
|
||||
|
||||
public:
|
||||
typedef dvec vector_type;
|
||||
typedef RealD scalar_type;
|
||||
|
||||
vRealD(){};
|
||||
|
||||
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
|
||||
@ -94,6 +98,37 @@ namespace dpo{
|
||||
#endif
|
||||
return ret;
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealD::Nsimd();
|
||||
RealD buf[Nsimd];
|
||||
|
||||
vstore(y,buf);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
|
||||
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealD::Nsimd();
|
||||
RealD buf[Nsimd];
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,buf);
|
||||
};
|
||||
|
||||
|
||||
// Permute plans
|
||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||
|
@ -7,7 +7,12 @@ namespace dpo {
|
||||
class vRealF {
|
||||
protected:
|
||||
fvec v;
|
||||
|
||||
public:
|
||||
|
||||
typedef fvec vector_type;
|
||||
typedef RealF scalar_type;
|
||||
|
||||
vRealF(){};
|
||||
////////////////////////////////////
|
||||
// Arithmetic operator overloads +,-,*
|
||||
@ -113,6 +118,37 @@ namespace dpo {
|
||||
//////////////////////////////////
|
||||
friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
|
||||
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Extract
|
||||
/////////////////////////////////////////////////////////////////
|
||||
friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealF::Nsimd();
|
||||
RealF buf[Nsimd];
|
||||
|
||||
vstore(y,buf);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
*extracted[i] = buf[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
};
|
||||
|
||||
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
|
||||
// Bounce off stack is painful
|
||||
// temporary hack while I figure out the right interface
|
||||
const int Nsimd = vRealF::Nsimd();
|
||||
RealF buf[Nsimd];
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
buf[i]=*extracted[i];
|
||||
extracted[i]++;
|
||||
}
|
||||
vset(y,buf);
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
@ -256,7 +292,7 @@ friend inline void vstore(vRealF &ret, float *a){
|
||||
return *this;
|
||||
}
|
||||
public:
|
||||
static int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
||||
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
||||
};
|
||||
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
|
||||
inline void zeroit(vRealF &z){ vzero(z);}
|
||||
|
Loading…
Reference in New Issue
Block a user