From 98f14f10304efefc1e07a07309f5673e23f433c0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 29 Mar 2015 20:35:37 +0100 Subject: [PATCH] Fixing the Checkerboarding cshift. Implemented "fake" communications in preparation for the leap to MPI. --- Grid/Grid.xcodeproj/project.pbxproj | 239 +++++++++++ .../contents.xcworkspacedata | 7 + .../UserInterfaceState.xcuserstate | Bin 0 -> 4124 bytes .../pab.xcuserdatad/xcschemes/Grid.xcscheme | 43 ++ .../xcschemes/xcschememanagement.plist | 22 + Grid_Cartesian.h | 184 ++++++--- Grid_Communicator.h | 39 ++ Grid_Lattice.h | 299 ++++---------- Grid_QCD.h | 1 - Grid_config.h | 12 + Grid_fake.cc | 34 ++ Grid_fake_cshift.h | 246 +++++++++++ Grid_init.cc | 4 +- Grid_main.cc | 124 ++++-- Grid_math_types.h | 113 +++++- Grid_mpi.cc | 85 ++++ Grid_mpi_cshift.h | 384 ++++++++++++++++++ Grid_none_cshift.h | 313 ++++++++++++++ Grid_simd.h | 4 +- Grid_vComplexD.h | 46 ++- Grid_vComplexF.h | 47 ++- Grid_vRealD.h | 35 ++ Grid_vRealF.h | 38 +- 23 files changed, 1976 insertions(+), 343 deletions(-) create mode 100644 Grid/Grid.xcodeproj/project.pbxproj create mode 100644 Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata create mode 100644 Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate create mode 100644 Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/Grid.xcscheme create mode 100644 Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/xcschememanagement.plist create mode 100644 Grid_Communicator.h create mode 100644 Grid_fake.cc create mode 100644 Grid_fake_cshift.h create mode 100644 Grid_mpi.cc create mode 100644 Grid_mpi_cshift.h create mode 100644 Grid_none_cshift.h diff --git a/Grid/Grid.xcodeproj/project.pbxproj b/Grid/Grid.xcodeproj/project.pbxproj new file mode 100644 index 00000000..20cd1d1a --- /dev/null +++ b/Grid/Grid.xcodeproj/project.pbxproj @@ -0,0 +1,239 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 46; + objects = { + +/* Begin PBXBuildFile section */ + 0BF5DA461AC370840045EA34 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0BF5DA451AC370840045EA34 /* main.cpp */; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 0BF5DA401AC370840045EA34 /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = /usr/share/man/man1/; + dstSubfolderSpec = 0; + files = ( + ); + runOnlyForDeploymentPostprocessing = 1; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 0BF5DA421AC370840045EA34 /* Grid */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Grid; sourceTree = BUILT_PRODUCTS_DIR; }; + 0BF5DA451AC370840045EA34 /* main.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 0BF5DA3F1AC370840045EA34 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 0BF5DA391AC370840045EA34 = { + isa = PBXGroup; + children = ( + 0BF5DA441AC370840045EA34 /* Grid */, + 0BF5DA431AC370840045EA34 /* Products */, + ); + sourceTree = ""; + }; + 0BF5DA431AC370840045EA34 /* Products */ = { + isa = PBXGroup; + children = ( + 0BF5DA421AC370840045EA34 /* Grid */, + ); + name = Products; + sourceTree = ""; + }; + 0BF5DA441AC370840045EA34 /* Grid */ = { + isa = PBXGroup; + children = ( + 0BF5DA451AC370840045EA34 /* main.cpp */, + ); + path = Grid; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 0BF5DA411AC370840045EA34 /* Grid */ = { + isa = PBXNativeTarget; + buildConfigurationList = 0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */; + buildPhases = ( + 0BF5DA3E1AC370840045EA34 /* Sources */, + 0BF5DA3F1AC370840045EA34 /* Frameworks */, + 0BF5DA401AC370840045EA34 /* CopyFiles */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = Grid; + productName = Grid; + productReference = 0BF5DA421AC370840045EA34 /* Grid */; + productType = "com.apple.product-type.tool"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 0BF5DA3A1AC370840045EA34 /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 0610; + ORGANIZATIONNAME = "University of Edinburgh"; + TargetAttributes = { + 0BF5DA411AC370840045EA34 = { + CreatedOnToolsVersion = 6.1.1; + }; + }; + }; + buildConfigurationList = 0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */; + compatibilityVersion = "Xcode 3.2"; + developmentRegion = English; + hasScannedForEncodings = 0; + knownRegions = ( + en, + ); + mainGroup = 0BF5DA391AC370840045EA34; + productRefGroup = 0BF5DA431AC370840045EA34 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 0BF5DA411AC370840045EA34 /* Grid */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXSourcesBuildPhase section */ + 0BF5DA3E1AC370840045EA34 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 0BF5DA461AC370840045EA34 /* main.cpp in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 0BF5DA471AC370840045EA34 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_SYMBOLS_PRIVATE_EXTERN = NO; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MACOSX_DEPLOYMENT_TARGET = 10.10; + MTL_ENABLE_DEBUG_INFO = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = macosx; + }; + name = Debug; + }; + 0BF5DA481AC370840045EA34 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = YES; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MACOSX_DEPLOYMENT_TARGET = 10.10; + MTL_ENABLE_DEBUG_INFO = NO; + SDKROOT = macosx; + }; + name = Release; + }; + 0BF5DA4A1AC370840045EA34 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Debug; + }; + 0BF5DA4B1AC370840045EA34 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 0BF5DA471AC370840045EA34 /* Debug */, + 0BF5DA481AC370840045EA34 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 0BF5DA4A1AC370840045EA34 /* Debug */, + 0BF5DA4B1AC370840045EA34 /* Release */, + ); + defaultConfigurationIsVisible = 0; + }; +/* End XCConfigurationList section */ + }; + rootObject = 0BF5DA3A1AC370840045EA34 /* Project object */; +} diff --git a/Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 00000000..ce76f4f5 --- /dev/null +++ b/Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate b/Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate new file mode 100644 index 0000000000000000000000000000000000000000..638ce019b327e05f2249ae344a55de843e067d13 GIT binary patch literal 4124 zcma)933O9s7QQzxdH;KVB2CLy*0!`35n2O;E}*69rUhD{X-g^4@R~l_2Wb*t5=vR* zN6A~Wh@#B@@|r@_Imh;#yyd_D zUB3I>d;ixO4rr06%hd-677)M+HV~2FhNhgAsvglo!KR#0>vGi}jV#Pjbbp7&U)SeE zL*X>Q8@KOqt^fiO42B^v6vo44Faa)yi7*NBARn%PsW2U8z)UEID!3XJ!D3hfE#QNt zpg=45p$$41%1{>gd*a$bl&G2iu8}5O7;Xb$@9)ySBxA1#-6#f8Dz_ajY zcm>{sze69q1Bc)k{1-lh&*2L=4qw7oZ~}gS^9YD&!9kdUPE5r#9EziGG>*Y+ybN79 z8H;f`mS6=|<9w_`FD}BxxCB)UVH8*4+J>B9ED-ny$Uwmk4oHEt1-0J%6>4{+nP2-M zY0U8llt`o(E`dSJ!h~LMLMpSeAMe-L-0Ojnf^;gA-LMm+wgwo-|zZ9eD7 z3U!$h3q&j1v}j28M)})FRY-5Bn-lRlhgYbrvG#T?*uFpuwuQRN)DC5(7Sesrltdur z0+&s%Q~f+JQ%_>GUyUamuHu15s)IFJP#aLnol1Hx4+SHkfGQ`VcEM!G-M*a-ItXr< z1_e;qYyurrI@KogR^_G#*LrJWQKdDYR%kr3P*BmkcX06{C|+Faz3{H7);l-OVJ94n z=krP^>ERg@SgL8E6g({hx;(nBa5be+2E{z*@keu81(a-OsZzqtYhXU)?uKfZ1+!re z)IcrFg?Vf!8^(sSbT)#GWTV*V-8?~FsOQNU#Fv}mDH_AZ^5h77Inzwi2!ktesOv+a zK&zrxM;f(=#v^MPSgF!^^gd@6e=%2YQ+2g%fL--UE5|UZhXMg!v_9trb=AQt&f|z* zSJhxmsEzNRbTKVWn-}&r1hs2pYIPe=sJ2W~dBQWec$_xs&s+03ODe))crWWfm8% zo0Pd`vfJgFl*x-`-8wTLDzx{&T#jOfnZ`2EdX20l60`y$M)d@s6M_(eFkA~d%VOhL zHp^j`vRpQP7q6TQSP5OQieFdr%5kvE*i=@)uQQG68Rm&Zc!N|D1dq&7xC3<a!;Lm6< zoG|rkYGes_!K_3>GJ(S;xDj%D;089K2Y$saH{wdIXsjP_r8=s1ModU=fs$U>3|rVl zHkqX+6#NG6fV5t?4Q^+XSY9vO30qk{yMnova5WdR#mS$I;A)iipcaj_sq?Z+H>QK} zXQrqK9_TgdGog7K^!!Nk4z76$b7hx?GUNLalDptxBZ%G1-2;!XY4ISuYJg8BwQT@J zpEI+`&uN~|ahR<|^9L64y-_{pkH&O0p>Gd7HUP@wtk7sSMyr1kN_N0guos?YMQl2| z@&X9^;DsbtL+}!(QL*7_xHQStSK+lJNgdu`7#{|~ z2J&iT^CxEh2}h0oa~S>wN8sP^F1!ct!+%&Q^RO~j&MH_Xt72F0=A9>l(>x14f{)=7 z_>^}X2Pb+8yM`@hOPFdP=`>hm_GeW$j2T2S-kD_L4ZG1i4fKzTSZgOox!HU&`^sRx zqkpEv6&YH1I{IJDGqy*fysy-$VO@>z8^!3@yji6t><;j;IS?FFSUhU_p*B=V=7Mz*17~dY!qrAfXx)$cGgzq1gBofzN-yK%t9L2%VzMW+x zd>)4BoX9ww)%QRa_s^*A{)t+p1){2M1n$Iy_*l$L@E)_+f_~m(4ljJX6mwYMRCb_Ns=@0N^`m=JerA~0guX9OTwc{ z!=oy^n)z7k1&?N7&Bb=G7U#01Oku8tJh3JpvrDz5NfzsILDHTkT*&;a&8V`JIBSi= zj>-G%(#0>u_x526`fw@OQNdR9BR?{w>7h{6slv2k^E$dwKHJL*z!ZT4BNSZ4%BctH=O0p*I*c z7{J-n)_<1t#E;_!1(R!PQApPk^~Eibi0h3uVK2g!noiisY zcH?TFb3DiB=ZAkX7`?a#ihsFyd?4UDyw2y${sk6#alN7amx%JT)_P~F-D;Z&k*XiS z>v3a4{FKWOBz~A4$G=jh@voDan2uv{B2MLB1C=-n=Ws@Ou>lw0LJZ^GxD)?`PvTyD z2KV9f_#(cH`|%+5;UPSXNAO*IACKZ0%MeSxrPQ*>60>Z!Y_Z&Dx!>}DWt(M}<#EeC z%K^(_%SV}1qT7`%Z z6S{-X04wm~+hEzLH>Hq4f88)+MD8*3}H&9bewZLvLPd)xM&?U?Oz+i}|o z+bP@EA{7UV>EcLnv^Z9rD7wTcqFXE$XNXscGsPNlsi=s{#fZ2{yg|HK+$`QI-XU%k z?-uVBd&NEC=QcB85 z1*sv;#77k3Cmm!t36LP^BAduf@$zm?C)=P6QwI%p~#Oo!6pG>1;06KNiuLfy207EuqarC!=V7tqDD zg)XJ7bOl{SyXk7Wmfl1+(_86n^bWd}ZlgQsLv$B?m_9}K(dX%l^i}#-`a1m^?W6C} zWAr%vik_t3&~NEE`h(qRA7Q`TKGVL?9<{Huud=VUueEQrZ?ivSf6U%zKWhKL{-OP2 z`=<_c2o9TrIAlkfW2j@eV}xUzV~%6KV~t~z;}ORm$2*Qs9G^K(#QPjrEanwAnb*Xx H + + + + + + + + + + + + + + + + + + diff --git a/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/xcschememanagement.plist b/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/xcschememanagement.plist new file mode 100644 index 00000000..1d4ba5a6 --- /dev/null +++ b/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/xcschememanagement.plist @@ -0,0 +1,22 @@ + + + + + SchemeUserState + + Grid.xcscheme + + orderHint + 0 + + + SuppressBuildableAutocreation + + 0BF5DA411AC370840045EA34 + + primary + + + + + diff --git a/Grid_Cartesian.h b/Grid_Cartesian.h index 488d47e9..bfe301a9 100644 --- a/Grid_Cartesian.h +++ b/Grid_Cartesian.h @@ -1,10 +1,9 @@ #ifndef GRID_CARTESIAN_H #define GRID_CARTESIAN_H -#include "Grid.h" - +#include +#include namespace dpo{ - ///////////////////////////////////////////////////////////////////////////////////////// // Grid Support. Following will go into Grid.h. @@ -13,28 +12,47 @@ namespace dpo{ // dpo::Grid // dpo::GridCartesian // dpo::GridCartesianRedBlack - -class Grid { + + +class SimdGrid : public CartesianCommunicator { public: + + + SimdGrid(std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; + // Give Lattice access template friend class Lattice; - //protected: - // Lattice wide random support. not yet fully implemented. Need seed strategy - // and one generator per site. - //std::default_random_engine generator; + // Lattice wide random support. not yet fully implemented. Need seed strategy + // and one generator per site. + //std::default_random_engine generator; // static std::mt19937 generator( 9 ); - // Grid information. - unsigned long _ndimension; - std::vector _layout; // Which dimensions get relayed out over simd lanes. - std::vector _dimensions; // Dimensions of array - std::vector _rdimensions;// Reduced dimensions with simd lane images removed + + // Commicator provides + // unsigned long _ndimension; + // std::vector _processors; // processor grid + // int _processor; // linear processor rank + // std::vector _processor_coor; // linear processor rank + + std::vector _simd_layout; // Which dimensions get relayed out over simd lanes. + + + std::vector _fdimensions;// Global dimensions of array with cb removed + std::vector _gdimensions;// Global dimensions of array + std::vector _ldimensions;// local dimensions of array with processor images removed + std::vector _rdimensions;// Reduced local dimensions with simd lane images and processor images removed + + // std::vector _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d] + // std::vector _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 + + std::vector _ostride; // Outer stride for each dimension std::vector _istride; // Inner stride i.e. within simd lane + int _osites; // _isites*_osites = product(dimensions). int _isites; @@ -70,15 +88,33 @@ public: for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]); return idx; } - + inline int oSites(void) { return _osites; }; inline int iSites(void) { return _isites; }; + + inline int CheckerboardFromOsite (int Osite){ + std::vector ocoor; + CoordFromOsite(ocoor,Osite); + int ss=0; + for(int d=0;d<_ndimension;d++){ + ss=ss+ocoor[d]; + } + return ss&0x1; + } + inline void CoordFromOsite (std::vector& coor,int Osite){ + coor.resize(_ndimension); + for(int d=0;d<_ndimension;d++){ + coor[d] = Osite % _rdimensions[d]; + Osite = Osite / _rdimensions[d]; + } + } + virtual int CheckerBoard(std::vector site)=0; virtual int CheckerBoardDestination(int source_cb,int shift)=0; - virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0; + virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; }; -class GridCartesian: public Grid { +class GridCartesian: public SimdGrid { public: virtual int CheckerBoard(std::vector site){ return 0; @@ -86,19 +122,24 @@ public: virtual int CheckerBoardDestination(int cb,int shift){ return 0; } - virtual int CheckerBoardShift(int source_cb,int dim,int shift){ + virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ return shift; } - GridCartesian(std::vector &dimensions,std::vector layout) + GridCartesian(std::vector &dimensions, + std::vector &simd_layout, + std::vector &processor_grid + ) : SimdGrid(processor_grid) { /////////////////////// // Grid information /////////////////////// _ndimension = dimensions.size(); - _dimensions.resize(_ndimension); + _fdimensions.resize(_ndimension); + _gdimensions.resize(_ndimension); + _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); - _layout.resize(_ndimension); + _simd_layout.resize(_ndimension); _ostride.resize(_ndimension); _istride.resize(_ndimension); @@ -106,21 +147,26 @@ public: _osites = 1; _isites = 1; for(int d=0;d<_ndimension;d++){ - _dimensions[d] = dimensions[d]; - _layout[d] = layout[d]; - // Use a reduced simd grid - _rdimensions[d]= _dimensions[d]/_layout[d]; //<-- _layout[d] is zero - _osites *= _rdimensions[d]; - _isites *= _layout[d]; + _fdimensions[d] = dimensions[d]; // Global dimensions + _gdimensions[d] = _fdimensions[d]; // Global dimensions + _simd_layout[d] = simd_layout[d]; + + //FIXME check for exact division + + // Use a reduced simd grid + _ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions + _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition + _osites *= _rdimensions[d]; + _isites *= _simd_layout[d]; - // Addressing support - if ( d==0 ) { - _ostride[d] = 1; - _istride[d] = 1; - } else { - _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; - _istride[d] = _istride[d-1]*_layout[d-1]; - } + // Addressing support + if ( d==0 ) { + _ostride[d] = 1; + _istride[d] = 1; + } else { + _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; + _istride[d] = _istride[d-1]*_simd_layout[d-1]; + } } /////////////////////// @@ -148,46 +194,57 @@ public: } }; }; - + // Specialise this for red black grids storing half the data like a chess board. -class GridRedBlackCartesian : public Grid +class GridRedBlackCartesian : public SimdGrid { public: virtual int CheckerBoard(std::vector site){ - return site[0]&0x1; + return (site[0]+site[1]+site[2]+site[3])&0x1; } - virtual int CheckerBoardShift(int source_cb,int dim,int shift){ - if ( dim == 0 ){ - int fulldim =2*_dimensions[0]; - shift = (shift+fulldim)%fulldim; - if ( source_cb ) { - // Shift 0,1 -> 0 - return (shift+1)/2; - } else { - // Shift 0->0, 1 -> 1, 2->1 - return (shift)/2; - } - } else { - return shift; - } + + // Depending on the cb of site, we toggle source cb. + // for block #b, element #e = (b, e) + // we need + virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ + + if(dim != 0) return shift; + + int fulldim =_fdimensions[0]; + shift = (shift+fulldim)%fulldim; + + // Probably faster with table lookup; + // or by looping over x,y,z and multiply rather than computing checkerboard. + int ocb=CheckerboardFromOsite(osite); + + if ( (source_cb+ocb)&1 ) { + return (shift)/2; + } else { + return (shift+1)/2; + } } + virtual int CheckerBoardDestination(int source_cb,int shift){ - if ((shift+2*_dimensions[0])&0x1) { + if ((shift+_fdimensions[0])&0x1) { return 1-source_cb; } else { return source_cb; } }; - GridRedBlackCartesian(std::vector &dimensions,std::vector layout) + GridRedBlackCartesian(std::vector &dimensions, + std::vector &simd_layout, + std::vector &processor_grid) : SimdGrid(processor_grid) { /////////////////////// // Grid information /////////////////////// _ndimension = dimensions.size(); - _dimensions.resize(_ndimension); + _fdimensions.resize(_ndimension); + _gdimensions.resize(_ndimension); + _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); - _layout.resize(_ndimension); + _simd_layout.resize(_ndimension); _ostride.resize(_ndimension); _istride.resize(_ndimension); @@ -195,14 +252,17 @@ public: _osites = 1; _isites = 1; for(int d=0;d<_ndimension;d++){ - _dimensions[d] = dimensions[d]; - _dimensions[0] = dimensions[0]/2; - _layout[d] = layout[d]; + _fdimensions[d] = dimensions[d]; + _gdimensions[d] = _fdimensions[d]; + if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard + _ldimensions[d] = _gdimensions[d]/_processors[d]; // Use a reduced simd grid - _rdimensions[d]= _dimensions[d]/_layout[d]; + _simd_layout[d] = simd_layout[d]; + _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; + _osites *= _rdimensions[d]; - _isites *= _layout[d]; + _isites *= _simd_layout[d]; // Addressing support if ( d==0 ) { @@ -210,7 +270,7 @@ public: _istride[d] = 1; } else { _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; - _istride[d] = _istride[d-1]*_layout[d-1]; + _istride[d] = _istride[d-1]*_simd_layout[d-1]; } } diff --git a/Grid_Communicator.h b/Grid_Communicator.h new file mode 100644 index 00000000..5be6f26f --- /dev/null +++ b/Grid_Communicator.h @@ -0,0 +1,39 @@ +#ifndef GRID_COMMUNICATOR_H +#define GRID_COMMUNICATOR_H +/////////////////////////////////// +// Processor layout information +/////////////////////////////////// +namespace dpo { +class CartesianCommunicator { + public: + + // Communicator should know nothing of the physics grid, only processor grid. + + std::vector _processors; // Which dimensions get relayed out over processors lanes. + int _processor; // linear processor rank + std::vector _processor_coor; // linear processor coordinate + unsigned long _ndimension; + +#ifdef GRID_COMMS_MPI + MPI_Comm communicator; +#endif + + CartesianCommunicator(std::vector &pdimensions_in); + + int Rank(std::vector coor); + void GlobalSumF(float &); + void GlobalSumFVector(float *,int N); + + void GlobalSumF(double &); + void GlobalSumFVector(double *,int N); + + void SendToRecvFrom(void *xmit, + std::vector to_coordinate, + void *recv, + std::vector from_coordinate, + int bytes); + +}; +} + +#endif diff --git a/Grid_Lattice.h b/Grid_Lattice.h index 223ad2cd..8c8dc386 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -3,19 +3,31 @@ #include "Grid.h" + + namespace dpo { +// Permute the pointers 32bitx16 = 512 +static int permute_map[4][16] = { + { 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, + { 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}, + { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}, + { 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8} +}; + + template class Lattice { public: - Grid *_grid; + SimdGrid *_grid; int checkerboard; std::vector > _odata; public: - - Lattice(Grid *grid) : _grid(grid) { + + + Lattice(SimdGrid *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); if ( ((uint64_t)&_odata[0])&0xF) { exit(-1); @@ -23,6 +35,19 @@ public: checkerboard=0; } + +#ifdef GRID_COMMS_NONE +#include +#endif + +#ifdef GRID_COMMS_FAKE +#include +#endif + +#ifdef GRID_COMMS_MPI +#include +#endif + // overloading dpo::conformable but no conformable in dpo ...?:w template @@ -69,212 +94,6 @@ public: return ret; }; -#if 0 - // Collapse doesn't appear to work the way I think it should in icpc - friend Lattice Cshift(Lattice &rhs,int dimension,int shift) - { - Lattice ret(rhs._grid); - - ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); - shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift); - int sx,so,o; - int rd = rhs._grid->_rdimensions[dimension]; - int ld = rhs._grid->_dimensions[dimension]; - - // Map to always positive shift. - shift = (shift+ld)%ld; - - // Work out whether to permute and the permute type - // ABCDEFGH -> AE BF CG DH permute - // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH - // Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG - // Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF - // Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE - // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD - // Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC - // Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB - // Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA - int permute_dim =rhs._grid->_layout[dimension]>1 ; - int permute_type=0; - for(int d=0;d_layout[d]>1 ) permute_type++; - - - // loop over perp slices. - // Threading considerations: - // Need to map thread_num to - // - // x_min,x_max for Loop-A - // n_min,n_max for Loop-B - // b_min,b_max for Loop-C - // In a way that maximally load balances. - // - // Optimal: - // There are rd*n_block*block items of work. - // These serialise as item "w" - // b=w%block; w=w/block - // n=w%nblock; x=w/nblock. Perhaps 20 cycles? - // - // Logic: - // x_chunk = (rd+thread)/nthreads simply divide work across nodes. - // - // rd=5 , threads = 8; - // 0 1 2 3 4 5 6 7 - // 0 0 0 1 1 1 1 1 - for(int x=0;x_ostride[dimension]; - so =sx*rhs._grid->_ostride[dimension]; - - int permute_slice=0; - if ( permute_dim ) { - permute_slice = shift/rd; - if ( x_slice_block[dimension]*internal; - - for(int n=0;n_slice_nblock[dimension];n++){ - vComplex *optr = (vComplex *)&ret._odata[o]; - vComplex *iptr = (vComplex *)&rhs._odata[so]; - for(int b=0;b_slice_stride[dimension]; - so+=rhs._grid->_slice_stride[dimension]; - } - } else { - for(int n=0;n_slice_nblock[dimension];n++){ - for(int i=0;i_slice_block[dimension];i++){ - ret._odata[o+i]=rhs._odata[so+i]; - } - o+=rhs._grid->_slice_stride[dimension]; - so+=rhs._grid->_slice_stride[dimension]; - } - - } -#else - - if ( permute_slice ) { - - int internal=sizeof(vobj)/sizeof(vComplex); - int num =rhs._grid->_slice_block[dimension]*internal; - - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_stride[dimension]]; - vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]]; - permute(optr[b],iptr[b],permute_type); - } - } - - } else { - - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int i=0;i_slice_block[dimension];i++){ - int oo = o+ n*rhs._grid->_slice_stride[dimension]; - int soo=so+ n*rhs._grid->_slice_stride[dimension]; - ret._odata[oo+i]=rhs._odata[soo+i]; - } - } - - } -#endif - } - return ret; - } -#else - friend Lattice Cshift(Lattice &rhs,int dimension,int shift) - { - Lattice ret(rhs._grid); - - ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); - shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift); - int rd = rhs._grid->_rdimensions[dimension]; - int ld = rhs._grid->_dimensions[dimension]; - - // Map to always positive shift. - shift = (shift+ld)%ld; - - // Work out whether to permute and the permute type - // ABCDEFGH -> AE BF CG DH permute - // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH - // Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG - // Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF - // Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE - // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD - // Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC - // Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB - // Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA - int permute_dim =rhs._grid->_layout[dimension]>1 ; - int permute_type=0; - for(int d=0;d_layout[d]>1 ) permute_type++; - - - // loop over all work - int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; - -#pragma omp parallel for - for(int ww=0;ww_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension]; - int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension]; - int x = w; - - int sx,so,o; - sx = (x-shift+ld)%rd; - o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert. - so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; - - int permute_slice=0; - if ( permute_dim ) { - permute_slice = shift/rd; - if ( x inline Lattice & operator = (const sobj & r){ #pragma omp parallel for @@ -288,14 +107,15 @@ public: template friend void pokeSite(const sobj &s,Lattice &l,std::vector &site){ - if ( l._grid.checkerboard != l._grid->Checkerboard(site)){ - printf("Poking wrong checkerboard\n"); - exit(EXIT_FAILURE); + if ( l.checkerboard != l._grid->CheckerBoard(site)){ + printf("Poking wrong checkerboard\n"); + exit(EXIT_FAILURE); } int o_index = l._grid->oIndex(site); int i_index = l._grid->iIndex(site); + // BUGGY. This assumes complex real Real *v_ptr = (Real *)&l._odata[o_index]; Real *s_ptr = (Real *)&s; v_ptr = v_ptr + 2*i_index; @@ -348,6 +168,22 @@ public: } }; + friend void lex_sites(Lattice &l){ + + Real *v_ptr = (Real *)&l._odata[0]; + size_t o_len = l._grid->oSites(); + size_t v_len = sizeof(vobj)/sizeof(vRealF); + size_t vec_len = vRealF::Nsimd(); + + for(int i=0;i &l){ // Zero mean, unit variance. std::normal_distribution distribution(0.0,1.0); @@ -427,22 +263,41 @@ public: return ret; }; - - // remove and insert a half checkerboard friend void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ + half.checkerboard = cb; + int ssh=0; #pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - half._odata[ss] = full._odata[ss*2+cb]; - } - half.checkerboard = cb; + for(int ss=0;ssoSites();ss++){ + std::vector coor; + int cbos; + + full._grid->CoordFromOsite(coor,ss); + cbos=half._grid->CheckerBoard(coor); + + if (cbos==cb) { + + half._odata[ssh] = full._odata[ss]; + ssh++; + } + } } friend void setCheckerboard(Lattice &full,const Lattice &half){ - int cb = half.checkerboard; + int cb = half.checkerboard; + int ssh=0; #pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - full._odata[ss*2+cb]=half._odata[ss]; - } + for(int ss=0;ssoSites();ss++){ + std::vector coor; + int cbos; + + full._grid->CoordFromOsite(coor,ss); + cbos=half._grid->CheckerBoard(coor); + + if (cbos==cb) { + full._odata[ss]=half._odata[ssh]; + ssh++; + } + } } }; // class Lattice diff --git a/Grid_QCD.h b/Grid_QCD.h index 92a02506..e51e2641 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -5,7 +5,6 @@ namespace QCD { static const int Nc=3; static const int Ns=4; - static const int CbRed =0; static const int CbBlack=1; diff --git a/Grid_config.h b/Grid_config.h index 64003419..2a442ea5 100644 --- a/Grid_config.h +++ b/Grid_config.h @@ -10,6 +10,15 @@ /* AVX512 */ /* #undef AVX512 */ +/* GRID_COMMS_FAKE */ +#define GRID_COMMS_FAKE 1 + +/* GRID_COMMS_MPI */ +/* #undef GRID_COMMS_MPI */ + +/* GRID_COMMS_NONE */ +/* #undef GRID_COMMS_NONE */ + /* Define to 1 if you have the `gettimeofday' function. */ #define HAVE_GETTIMEOFDAY 1 @@ -61,6 +70,9 @@ /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "grid" +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + /* Define to the version of this package. */ #define PACKAGE_VERSION "1.0" diff --git a/Grid_fake.cc b/Grid_fake.cc new file mode 100644 index 00000000..7e664f33 --- /dev/null +++ b/Grid_fake.cc @@ -0,0 +1,34 @@ +#include "Grid.h" +namespace dpo { + +CartesianCommunicator::CartesianCommunicator(std::vector &processors) +{ + _ndimension = _processors.size(); + _processor_coor.resize(_ndimension); + _processors = processors; + + // Require 1^N processor grid for fake + for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1); + + _processor = 0;// I am the one. The only one.. + for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0; +} + +void CartesianCommunicator::GlobalSumF(float &){} +void CartesianCommunicator::GlobalSumFVector(float *,int N){} +void CartesianCommunicator::GlobalSumF(double &){} +void CartesianCommunicator::GlobalSumFVector(double *,int N){} + +// Basic Halo comms primitive +void CartesianCommunicator::SendToRecvFrom(void *xmit, + std::vector to_coordinate, + void *recv, + std::vector from_coordinate, + int bytes) +{ + exit(-1); + bcopy(xmit,recv,bytes); +} + +} + diff --git a/Grid_fake_cshift.h b/Grid_fake_cshift.h new file mode 100644 index 00000000..0f306b1b --- /dev/null +++ b/Grid_fake_cshift.h @@ -0,0 +1,246 @@ +#ifndef _GRID_FAKE_H_ +#define _GRID_FAKE_H_ + + + +friend Lattice Cshift(Lattice &rhs,int dimension,int shift) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + const int Nsimd = vector_type::Nsimd(); + + Lattice ret(rhs._grid); + + int fd = rhs._grid->_fdimensions[dimension]; + int rd = rhs._grid->_rdimensions[dimension]; + //int ld = rhs._grid->_ldimensions[dimension]; + //int gd = rhs._grid->_gdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + shift = (shift+fd)%fd; + + ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); + + // the permute type + + int permute_dim =rhs._grid->_simd_layout[dimension]>1 ; + int permute_type=0; + for(int d=0;d_simd_layout[d]>1 ) permute_type++; + } + + /////////////////////////////////////////////// + // Move via a fake comms buffer + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + int words = sizeof(vobj)/sizeof(vector_type); + + std::vector > comm_buf(buffer_size); + std::vector > comm_buf_extract(Nsimd,std::vector(buffer_size*words) ); + std::vector pointers(Nsimd); + + for(int x=0;x_ostride[dimension]; // base offset for result + + if ( permute_dim ) { + + int o = 0; // relative offset to base + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + + int sx = (x+sshift)%rd; + + // base offset for source + int so = sx*rhs._grid->_ostride[dimension]; + + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + extract(rhs._odata[so+o+b],pointers); + } else { + ret._odata[ro+o+b]=rhs._odata[so+o+b]; + } + + } + o +=rhs._grid->_slice_stride[dimension]; + } + + for(int i=0;i_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + + // base offset for source + int so = sx*rhs._grid->_ostride[dimension]; + + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + merge(ret._odata[ro+o+b],pointers); + } + } + o +=rhs._grid->_slice_stride[dimension]; + } + + } else { + + int co; // comm offset + int o; + + co=0; o=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + // This call in inner loop is annoying but necessary for dimension=0 + // in the case of RedBlack grids. Could optimise away with + // alternate code paths for all other cases. + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + + comm_buf[co++]=rhs._odata[so+o+b]; + + } + o +=rhs._grid->_slice_stride[dimension]; + } + + // Step through a copy into a comms buffer and pull back in. + // Genuine fake implementation could calculate if loops back + co=0; o=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + ret._odata[ro+o+b]=comm_buf[co++]; + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + } + return ret; +} + +/* + friend Lattice Cshift(Lattice &rhs,int dimension,int shift) + { + Lattice ret(rhs._grid); + + int rd = rhs._grid->_rdimensions[dimension]; + int ld = rhs._grid->_ldimensions[dimension]; + int gd = rhs._grid->_gdimensions[dimension]; + + // Map to always positive shift. + shift = (shift+gd)%gd; + + ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); + shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift); + + // Work out whether to permute and the permute type + // ABCDEFGH -> AE BF CG DH permute + // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH + // Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA + // Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB + // Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC + // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD + // Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE + // Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF + // Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG + + int permute_dim =rhs._grid->_simd_layout[dimension]>1 ; + int permute_type=0; + for(int d=0;d_simd_layout[d]>1 ) permute_type++; + + + // loop over all work + int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + + // Packed gather sequence is clean + int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + + typedef typename vobj::scalar_type scalar_t; + typedef typename vobj::vector_type vector_t; + const int ns=sizeof(vobj)/sizeof(scalar_t); + const int nv=sizeof(vobj)/sizeof(vector_t); + std::vector > comm_buf(buffer_size); + + for(int x=0;x_ostride[dimension]; + int so =sx*rhs._grid->_ostride[dimension]; + + + int permute_slice=0; + if ( permute_dim ) { + permute_slice = shift/rd; + if ( x_slice_nblock[dimension];n++){ + + vector_t *optr = (vector_t *)&ret._odata[o]; + vector_t *iptr = (vector_t *)&rhs._odata[so]; + int skew = buffer_size*ns/2; + + for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_stride[dimension]; + // bo+=rhs._grid->_slice_stride[dimension]*ns/2; + + } + + } else { + int bo=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + comm_buf[bo++] =rhs._odata[so+i]; + } + so+=rhs._grid->_slice_stride[dimension]; + } + bo=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + ret._odata[o+i]=comm_buf[bo++]; + } + o+=rhs._grid->_slice_stride[dimension]; + } + } + } + return ret; + }; +*/ + +#endif diff --git a/Grid_init.cc b/Grid_init.cc index e943c3b0..bb22ce29 100755 --- a/Grid_init.cc +++ b/Grid_init.cc @@ -28,13 +28,13 @@ double usecond(void) { void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { - ucontext_t * uc= (ucontext_t *)ptr; - printf("Caught signal %d\n",si->si_signo); printf(" mem address %llx\n",(uint64_t)si->si_addr); printf(" code %d\n",si->si_code); #ifdef __X86_64 + ucontext_t * uc= (ucontext_t *)ptr; + struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; printf(" instruction %llx\n",(uint64_t)sc->rip); diff --git a/Grid_main.cc b/Grid_main.cc index 0b0c3e99..4079e62a 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -14,7 +14,7 @@ using namespace dpo::QCD; int main (int argc, char ** argv) { -#ifdef KNL +#if 0 struct sigaction sa,osa; sigemptyset (&sa.sa_mask); sa.sa_sigaction= sa_action; @@ -27,6 +27,9 @@ int main (int argc, char ** argv) std::vector latt_size(4); std::vector simd_layout(4); + std::vector mpi_layout(4); + for(int d=0;d<4;d++) mpi_layout[d]=1; + #ifdef AVX512 for(int omp=128;omp<236;omp+=16){ #else @@ -37,7 +40,7 @@ int main (int argc, char ** argv) omp_set_num_threads(omp); #endif - for(int lat=16;lat<=20;lat+=4){ + for(int lat=4;lat<=16;lat+=40){ latt_size[0] = lat; latt_size[1] = lat; latt_size[2] = lat; @@ -62,11 +65,9 @@ int main (int argc, char ** argv) simd_layout[2] = 1; simd_layout[3] = 2; #endif - - GridCartesian Fine(latt_size,simd_layout); - GridRedBlackCartesian rbFine(latt_size,simd_layout); - + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout); LatticeColourMatrix Foo(&Fine); LatticeColourMatrix Bar(&Fine); @@ -152,19 +153,23 @@ int main (int argc, char ** argv) trscMat = trace(scMat); // Trace FooBar = Bar; - int shift=1; - int dir=0; + /* + { + std::vector coor(4); + for(int d=0;d<4;d++) coor[d] = 0; + peekSite(cmat,Foo,coor); + Foo = zero; + pokeSite(cmat,Foo,coor); + } random(Foo); - pickCheckerboard(CbRed,rFoo,Foo); // Pick out red or black checkerboards - pickCheckerboard(CbBlack,bFoo,Foo); + */ + lex_sites(Foo); + + + //setCheckerboard(ShiftedCheck,rFoo); + //setCheckerboard(ShiftedCheck,bFoo); - Shifted = Cshift(Foo,dir,shift); // Shift everything - bShifted = Cshift(rFoo,dir,shift); // Shift red - rShifted = Cshift(bFoo,dir,shift); // Shift black - - setCheckerboard(ShiftedCheck,bShifted); // Put them all together - setCheckerboard(ShiftedCheck,rShifted); // and check the results (later) // Lattice SU(3) x SU(3) FooBar = Foo * Bar; @@ -211,25 +216,69 @@ int main (int argc, char ** argv) printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0)); printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0)); - pickCheckerboard(0,rFoo,FooBar); - pickCheckerboard(1,bFoo,FooBar); - - setCheckerboard(FooBar,rFoo); - setCheckerboard(FooBar,bFoo); + // pickCheckerboard(0,rFoo,FooBar); + // pickCheckerboard(1,bFoo,FooBar); + // setCheckerboard(FooBar,rFoo); + // setCheckerboard(FooBar,bFoo); double nrm=0; + + for(int dir=0;dir<4;dir++){ + for(int shift=0;shift coor(4); - for(coor[0]=0;coor[0]black + std::cout << "Shifting odd parities"<red + + ShiftedCheck=zero; + setCheckerboard(ShiftedCheck,bShifted); // Put them all together + setCheckerboard(ShiftedCheck,rShifted); // and check the results (later) + + // Check results + std::vector coor(4); + for(coor[3]=0;coor[3] diff; std::vector shiftcoor = coor; - shiftcoor[dir]=(shiftcoor[dir]-shift+latt_size[dir])%latt_size[dir]; - + shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir]; + + std::vector rl(4); + for(int dd=0;dd<4;dd++){ + rl[dd] = latt_size[dd]/simd_layout[dd]; + } + int lex = coor[0]%rl[0] + + (coor[1]%rl[1])*rl[0] + + (coor[2]%rl[2])*rl[0]*rl[1] + + (coor[3]%rl[3])*rl[0]*rl[1]*rl[2]; + lex += + +1000*(coor[0]/rl[0]) + +1000*(coor[1]/rl[1])*simd_layout[0] + +1000*(coor[2]/rl[2])*simd_layout[0]*simd_layout[1] + +1000*(coor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2]; + + int lex_coor = shiftcoor[0]%rl[0] + + (shiftcoor[1]%rl[1])*rl[0] + + (shiftcoor[2]%rl[2])*rl[0]*rl[1] + + (shiftcoor[3]%rl[3])*rl[0]*rl[1]*rl[2]; + lex_coor += + +1000*(shiftcoor[0]/rl[0]) + +1000*(shiftcoor[1]/rl[1])*simd_layout[0] + +1000*(shiftcoor[2]/rl[2])*simd_layout[0]*simd_layout[1] + +1000*(shiftcoor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2]; + ColourMatrix foo; ColourMatrix bar; ColourMatrix shifted1; @@ -242,7 +291,6 @@ int main (int argc, char ** argv) peekSite(shifted1,Shifted,coor); peekSite(shifted2,Foo,shiftcoor); peekSite(shifted3,ShiftedCheck,coor); - peekSite(foo,Foo,coor); mdiff = shifted1-shifted2; @@ -258,13 +306,13 @@ int main (int argc, char ** argv) diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c]; nn=real(conj(diff)*diff); if ( nn > 0 ) - cout<<"Shift fail "< 0 ) - cout<<"Shift rb fail "< + +namespace dpo { + + // Should error check all MPI calls. + +CartesianCommunicator::CartesianCommunicator(std::vector &processors) +{ + _ndimension = _processors.size(); + std::vector periodic(_ndimension,1); + + _processors = processors; + _processor_coords.resize(_ndimension); + + MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Comm_rank(communicator,&_processor); + MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]); + +} + +void CartesianCommunicator::GlobalSumF(float &f){ + MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator); +} +void CartesianCommunicator::GlobalSumFVector(float *f,int N) +{ + MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator); +} +void CartesianCommunicator::GlobalSumF(double &d) +{ + MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator); +} +void CartesianCommunicator::GlobalSumFVector(double *d,int N) +{ + MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator); +} + +int CartesianCommunicator::ShiftedRank(int dim,int shift) +{ + int rank; + MPI_Cart_shift(communicator,dim,shift,&_processor,&rank); + return rank; +} +int CartesianCommunicator::Rank(std::vector coor) +{ + int rank; + MPI_Cart_rank (communicator, &coor[0], &rank); + return rank; +} + +// Basic Halo comms primitive +void CartesianCommunicator::SendToRecvFrom(void *xmit, + std::vector &to_coordinate, + void *recv, + std::vector &from_coordinate, + int bytes) +{ + int dest = Rank(to_coordinate); + int from = Rank(from_coordinate); + + MPI_Request x_req; + MPI_Request r_req; + MPI_Status OkeyDokey; + + MPI_Isend(xmit, bytes, MPI_CHAR,dest,dest,communicator,&x_req); + MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req); + MPI_Wait(&x_req,&OkeyDokey); + MPI_Wait(&r_req,&OkeyDokey); + MPI_Barrier(); +} + +} + +#if 0 + +// Could possibly do a direct block strided send? + int MPI_Type_vector( + int count, + int blocklength, + int stride, + MPI_Datatype old_type, + MPI_Datatype *newtype_p + ); + +#endif diff --git a/Grid_mpi_cshift.h b/Grid_mpi_cshift.h new file mode 100644 index 00000000..9031170b --- /dev/null +++ b/Grid_mpi_cshift.h @@ -0,0 +1,384 @@ +#ifndef _GRID_MPI_CSHIFT_H_ +#define _GRID_MPI_CSHIFT_H_ + + +////////////////////////////////////////////// +// Q. Split this into seperate sub functions? +////////////////////////////////////////////// + +// CshiftCB_comms_splice +// CshiftCB_comms +// CshiftCB_local +// CshiftCB_local_permute + +// Cshift_comms_splice +// Cshift_comms +// Cshift_local +// Cshift_local_permute + +// Broadly I remain annoyed that the iteration is so painful +// for red black data layout, when simple block strided descriptors suffice for non-cb. +// +// The other option is to do it table driven, or perhaps store the CB of each site in a table. +// +// Must not lose sight that goal is to be able to construct really efficient +// gather to a point stencil code. CSHIFT is not the best way, so probably need +// additional stencil support. +// +// Could still do a templated syntax tree and make CSHIFT return lattice vector. +// +// Stencil based code could pre-exchange haloes and use a table lookup for neighbours +// +// Lattice could also allocate haloes which get used for stencil code. +// +// Grid could create a neighbour index table for a given stencil. +// Could also implement CovariantCshift. + +////////////////////////////////////////////////////// +//Non checkerboarded support functions +////////////////////////////////////////////////////// + +friend void Gather_plane (Lattice &rhs,std::vector &buffer, int dimension,int plane) +{ + const int Nsimd = vector_type::Nsimd(); + int rd = rhs._grid->_rdimensions[dimension]; + + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane +#pragma omp parallel for collapse(2) + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type); + } else { + ret._odata[ro+o+b]=rhs._odata[so+o+b]; + } + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + +} +//friend void Gather_plane_extract(Lattice &rhs,std::vector pointers,int dimension,int plane); +// +//friend void Scatter_plane (Lattice &rhs,std::vector face, int dimension,int plane); +//friend void Scatter_plane_merge (Lattice &rhs,std::vector pointers,int dimension,int plane); +// +//template friend void Copy_plane_permute(Lattice &rhs,std::vector face, int dimension,int plane); +// friend void Copy_plane(Lattice &rhs,std::vector face, int dimension,int plane); +// + +////////////////////////////////////////////////////// +//Checkerboarded support functions +////////////////////////////////////////////////////// + +//friend void GatherCB_plane (Lattice &rhs,std::vector face, int dimension,int plane); +//friend void GatherCB_plane_extract(Lattice &rhs,std::vector pointers,int dimension,int plane); +// +//friend void ScatterCB_plane (Lattice &rhs,std::vector face, int dimension,int plane); +//friend void ScatterCB_plane_merge (Lattice &rhs,std::vector pointers,int dimension,int plane); +// +//template friend void CopyCB_plane_permute(Lattice &rhs,std::vector face, int dimension,int plane); +// friend void Copy_plane(Lattice &rhs,std::vector face, int dimension,int plane); + + +friend Lattice Cshift(Lattice &rhs,int dimension,int shift) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + const int Nsimd = vector_type::Nsimd(); + + Lattice ret(rhs._grid); + + int fd = rhs._grid->_fdimensions[dimension]; + int rd = rhs._grid->_rdimensions[dimension]; + //int ld = rhs._grid->_ldimensions[dimension]; + //int gd = rhs._grid->_gdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + shift = (shift+fd)%fd; + + ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); + + // the permute type + int simd_layout = rhs._grid->_simd_layout[dimension]; + int comm_dim = rhs._grid->_processors[dimension] >1 ; + int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim); + int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim); + + int permute_type=0; + for(int d=0;d_simd_layout[d]>1 ) permute_type++; + } + + // Logic for non-distributed dimension + std::vector comm_offnode(simd_layout); + std::vector comm_to (simd_layout); + std::vector comm_from (simd_layout); + std::vector comm_rx (simd_layout); // reduced coordinate of neighbour plane + std::vector comm_simd_lane(simd_layout);// simd lane of neigbour plane + + /////////////////////////////////////////////// + // Move via a fake comms buffer + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + int words = sizeof(vobj)/sizeof(vector_type); + + std::vector > comm_buf(buffer_size); + std::vector > comm_buf_extract(Nsimd,std::vector(buffer_size*words) ); + std::vector pointers(Nsimd); + + + + if ( permute_dim ) { + + for(int x=0;x_ostride[dimension]; // base offset for result + int o = 0; // relative offset to base + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type); + } else { + ret._odata[ro+o+b]=rhs._odata[so+o+b]; + } + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + + } else if ( splice_dim ) { + + if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace + if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1); + + + for(int i=0;i_ostride[dimension]; // base offset for result + + o = 0; // relative offset to base + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + + // base offset for source + int so = sx*rhs._grid->_ostride[dimension]; + + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + extract(rhs._odata[so+o+b],pointers); + } + } + o +=rhs._grid->_slice_stride[dimension]; + } + + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + for(int s=0;s ld; + comm_send_rx[s] = shifted_x%rd; // which slice geton the other node + comm_send_simd_lane [s] = shifted_x/rd; // which slice on the other node + comm_from[s] = shifted_x/ld; + comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension]; + comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension]; + + } + + //////////////////////////////////////////////// + // Insert communication phase + //////////////////////////////////////////////// +#if 0 + } else if (comm_dim ) { + + // Packed gather sequence is clean + int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension]; + std::vector > send_buf(buffer_size); + std::vector > recv_buf(buffer_size); + + // off node; communcate slice (ld==rd) + if ( x+shift > rd ) { + int sb=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + send_buf[sb++]=rhs._odata[so+i]; + } + so+=rhs._grid->_slice_stride[dimension]; + } + + // Make a comm_fake them mimics comms in periodic case. + + // scatter face + int rb=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + ret._odata[so+i]=recv_buf[rb++]; + } + so+=rhs._grid->_slice_stride[dimension]; + } + + } else { + + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + ret._odata[o+i]=rhs._odata[so+i]; + } + o+=rhs._grid->_slice_stride[dimension]; + so+=rhs._grid->_slice_stride[dimension]; + } + } + +#endif + + //////////////////////////////////////////////// + // Pull receive buffers and permuted buffers in + //////////////////////////////////////////////// + for(int i=0;i_ostride[dimension]; // base offset for result + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + + // base offset for source + int so = sx*rhs._grid->_ostride[dimension]; + + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + merge(ret._odata[ro+o+b],pointers); + } + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + + + } else if ( comm_dim ) { + + + int co; // comm offset + int o; + + co=0; + for(int x=0;x_ostride[dimension]; // base offset for result + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + // This call in inner loop is annoying but necessary for dimension=0 + // in the case of RedBlack grids. Could optimise away with + // alternate code paths for all other cases. + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + + comm_buf[co++]=rhs._odata[so+o+b]; + + } + o +=rhs._grid->_slice_stride[dimension]; + } + + // Step through a copy into a comms buffer and pull back in. + // Genuine fake implementation could calculate if loops back + co=0; o=0; + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + ret._odata[ro+o+b]=comm_buf[co++]; + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + + } else { // Local dimension, no permute required + + for(int x=0;x_ostride[dimension]; // base offset for result + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + // This call in inner loop is annoying but necessary for dimension=0 + // in the case of RedBlack grids. Could optimise away with + // alternate code paths for all other cases. + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + ret._odata[bo+o+b]=rhs._odata[so+o+b]; + + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + + } + + return ret; +} + + +#endif diff --git a/Grid_none_cshift.h b/Grid_none_cshift.h new file mode 100644 index 00000000..87ae6a68 --- /dev/null +++ b/Grid_none_cshift.h @@ -0,0 +1,313 @@ +#ifndef _GRID_NONE_CSHIFT_H_ +#define _GRID_NONE_CSHIFT_H_ + // Work out whether to permute + // ABCDEFGH -> AE BF CG DH permute wrap num + // + // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0 + // Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1 + // Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2 + // Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3 + // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0 + // Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1 + // Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2 + // Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3 + + // Suppose 4way simd in one dim. + // ABCDEFGH -> AECG BFDH permute wrap num + + // Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0 + // Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1 + // Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0 + // Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1 + // Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0 + // Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1 + // Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0 + // Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1 + + // Generalisation to 8 way simd, 16 way simd required. + // + // Need log2 Nway masks. consisting of + // 1 bit 256 bit granule + // 2 bit 128 bit granule + // 4 bits 64 bit granule + // 8 bits 32 bit granules + // + // 15 bits.... + +// For optimisation: +// +// split into Cshift_none_rb_permute +// split into Cshift_none_rb_simple +// +// split into Cshift_none_permute +// split into Cshift_none_simple + +friend Lattice Cshift(Lattice &rhs,int dimension,int shift) +{ + Lattice ret(rhs._grid); + + int fd = rhs._grid->_fdimensions[dimension]; + int rd = rhs._grid->_rdimensions[dimension]; + int ld = rhs._grid->_ldimensions[dimension]; + int gd = rhs._grid->_gdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + shift = (shift+fd)%fd; + + ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); + + // the permute type + + int permute_dim =rhs._grid->_simd_layout[dimension]>1 ; + int permute_type=0; + for(int d=0;d_simd_layout[d]>1 ) permute_type++; + } + + for(int x=0;x_ostride[dimension]; + int o = 0; + + if ( permute_dim ) { + + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + + int permute_slice=0; + int wrap = sshift/rd; + int num = sshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + if ( permute_slice ) { + permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type); + } else { + ret._odata[bo+o+b]=rhs._odata[so+o+b]; + } + + } + o +=rhs._grid->_slice_stride[dimension]; + + } + } else { + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + // This call in inner loop is annoying but necessary for dimension=0 + // in the case of RedBlack grids. Could optimise away with + // alternate code paths for all other cases. + int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b); + + int sx = (x+sshift)%rd; + int so = sx*rhs._grid->_ostride[dimension]; + ret._odata[bo+o+b]=rhs._odata[so+o+b]; + + } + o +=rhs._grid->_slice_stride[dimension]; + } + } + } + return ret; +} + +#if 0 +// Collapse doesn't appear to work the way I think it should in icpc +friend Lattice Cshift(Lattice &rhs,int dimension,int shift) +{ + Lattice ret(rhs._grid); + + ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); + shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift); + int sx,so,o; + int rd = rhs._grid->_rdimensions[dimension]; + int ld = rhs._grid->_dimensions[dimension]; + // Map to always positive shift. + shift = (shift+ld)%ld; + // Work out whether to permute and the permute type + // ABCDEFGH -> AE BF CG DH permute + // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH + // Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG + // Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF + // Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE + // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD + // Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC + // Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB + // Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA + int permute_dim =rhs._grid->_layout[dimension]>1 ; + int permute_type=0; + for(int d=0;d_layout[d]>1 ) permute_type++; + + // loop over perp slices. + // Threading considerations: + // Need to map thread_num to + // + // x_min,x_max for Loop-A + // n_min,n_max for Loop-B + // b_min,b_max for Loop-C + // In a way that maximally load balances. + // + // Optimal: + // There are rd*n_block*block items of work. + // These serialise as item "w" + // b=w%block; w=w/block + // n=w%nblock; x=w/nblock. Perhaps 20 cycles? + // + // Logic: + // x_chunk = (rd+thread)/nthreads simply divide work across nodes. + // + // rd=5 , threads = 8; + // 0 1 2 3 4 5 6 7 + // 0 0 0 1 1 1 1 1 + for(int x=0;x_ostride[dimension]; + so =sx*rhs._grid->_ostride[dimension]; + int permute_slice=0; + if ( permute_dim ) { + permute_slice = shift/rd; + if ( x_slice_block[dimension]*internal; + + for(int n=0;n_slice_nblock[dimension];n++){ + vComplex *optr = (vComplex *)&ret._odata[o]; + vComplex *iptr = (vComplex *)&rhs._odata[so]; + for(int b=0;b_slice_stride[dimension]; + so+=rhs._grid->_slice_stride[dimension]; + } + } else { + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + ret._odata[o+i]=rhs._odata[so+i]; + } + o+=rhs._grid->_slice_stride[dimension]; + so+=rhs._grid->_slice_stride[dimension]; + } + + } +#else + if ( permute_slice ) { + int internal=sizeof(vobj)/sizeof(vComplex); + int num =rhs._grid->_slice_block[dimension]*internal; +#pragma omp parallel for collapse(2) + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_stride[dimension]]; + vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]]; + permute(optr[b],iptr[b],permute_type); + } + } + } else { +#pragma omp parallel for collapse(2) + for(int n=0;n_slice_nblock[dimension];n++){ + for(int i=0;i_slice_block[dimension];i++){ + int oo = o+ n*rhs._grid->_slice_stride[dimension]; + int soo=so+ n*rhs._grid->_slice_stride[dimension]; + ret._odata[oo+i]=rhs._odata[soo+i]; + } + } + + } +#endif + } + return ret; +} +friend Lattice Cshift(Lattice &rhs,int dimension,int shift) +{ + Lattice ret(rhs._grid); + + ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift); + shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift); + int rd = rhs._grid->_rdimensions[dimension]; + int ld = rhs._grid->_dimensions[dimension]; + + // Map to always positive shift. + shift = (shift+ld)%ld; + + // Work out whether to permute and the permute type + // ABCDEFGH -> AE BF CG DH permute + // Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH + // Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG + // Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF + // Shift 3 BF CG DH AE 1 1 1 0 FGHACBDE + // Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD + // Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC + // Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB + // Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA + int permute_dim =rhs._grid->_layout[dimension]>1 ; + int permute_type=0; + for(int d=0;d_layout[d]>1 ) permute_type++; + + + // loop over all work + int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + +#pragma omp parallel for + for(int ww=0;ww_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension]; + int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension]; + int x = w; + + int sx,so,o; + sx = (x-shift+ld)%rd; + o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert. + so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; + + int permute_slice=0; + if ( permute_dim ) { + permute_slice = shift/rd; + if ( x ComplexF; typedef std::complex ComplexD; typedef std::complex Complex; - + inline RealF adj(const RealF & r){ return r; } inline RealF conj(const RealF & r){ return r; } @@ -130,7 +130,7 @@ namespace dpo { #if defined (AVX1) || defined (AVX2) typedef __m256 fvec; typedef __m256d dvec; - typedef __m256 cvec; + typedef __m256 cvec; typedef __m256d zvec; #endif #if defined (AVX512) diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h index 71fa45ef..15dc00c6 100644 --- a/Grid_vComplexD.h +++ b/Grid_vComplexD.h @@ -8,6 +8,9 @@ namespace dpo { protected: zvec v; public: + typedef zvec vector_type; + typedef ComplexD scalar_type; + vComplexD & operator = ( Zero & z){ vzero(*this); return (*this); @@ -145,14 +148,43 @@ namespace dpo { ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1); /* Imag OK */ - - #endif #ifdef QPX ret.v = vec_mul(a.v,b.v); #endif return ret; }; + + ///////////////////////////////////////////////////////////////// + // Extract + ///////////////////////////////////////////////////////////////// + friend inline void extract(vComplexD &y,std::vector &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vComplexD::Nsimd(); + std::vector buf(Nsimd); + + vstore(y,&buf[0]); + + for(int i=0;i &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vComplexD::Nsimd(); + std::vector buf(Nsimd); + + for(int i=0;i BA i.e. ab cd =>cd ab #endif #ifdef SSE2 - //No cases here + break; #endif #ifdef AVX512 // 4 complex=>2 permute @@ -205,7 +237,7 @@ namespace dpo { #endif } - friend inline void vset(vComplexD &ret, std::complex *a){ + friend inline void vset(vComplexD &ret,ComplexD *a){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); #endif @@ -221,7 +253,7 @@ namespace dpo { #endif } -friend inline void vstore(vComplexD &ret, std::complex *a){ +friend inline void vstore(vComplexD &ret, ComplexD *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd((double *)a,ret.v); #endif @@ -273,10 +305,10 @@ friend inline void vstore(vComplexD &ret, std::complex *a){ // return std::complex(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v)); __attribute__ ((aligned(32))) double c_[4]; _mm256_store_pd(c_,in.v); - return std::complex(c_[0]+c_[2],c_[1]+c_[3]); + return ComplexD(c_[0]+c_[2],c_[1]+c_[3]); #endif #ifdef AVX512 - return std::complex(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v)); + return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v)); #endif #ifdef QPX #endif diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index c5c6e58f..0b4f5e61 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -8,6 +8,11 @@ namespace dpo { cvec v; public: + static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;} + public: + typedef cvec vector_type; + typedef ComplexF scalar_type; + vComplexF & operator = ( Zero & z){ vzero(*this); return (*this); @@ -125,6 +130,39 @@ namespace dpo { return ret; }; + ///////////////////////////////////////////////////////////////// + // Extract + ///////////////////////////////////////////////////////////////// + friend inline void extract(vComplexF &y,std::vector &extracted){ + // Bounce off heap is painful + // temporary hack while I figure out the right interface + vComplexF vbuf; + ComplexF *buf = (ComplexF *)&vbuf; + + vstore(y,&buf[0]); + for(int i=0;i &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vComplexF::Nsimd(); + vComplexF vbuf; + ComplexF *buf = (ComplexF *)&vbuf; + + for(int i=0;i *a){ +friend inline void vstore(vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps((float *)a,ret.v); #endif @@ -225,11 +263,11 @@ exit(-1); #if defined (AVX1) || defined(AVX2) __attribute__ ((aligned(32))) float c_[8]; _mm256_store_ps(c_,in.v); - return std::complex(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]); + return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]); #endif #ifdef AVX512 - return std::complex(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); + return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); #endif #ifdef QPX #endif @@ -321,9 +359,6 @@ exit(-1); return *this; } - - public: - static int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;} }; inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; } diff --git a/Grid_vRealD.h b/Grid_vRealD.h index d092d509..e5d3d4c6 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -7,7 +7,11 @@ namespace dpo{ class vRealD { protected: dvec v; // dvec is double precision vector + public: + typedef dvec vector_type; + typedef RealD scalar_type; + vRealD(){}; friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} @@ -94,6 +98,37 @@ namespace dpo{ #endif return ret; }; + + ///////////////////////////////////////////////////////////////// + // Extract + ///////////////////////////////////////////////////////////////// + friend inline void extract(vRealD &y,std::vector &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vRealD::Nsimd(); + RealD buf[Nsimd]; + + vstore(y,buf); + + for(int i=0;i &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vRealD::Nsimd(); + RealD buf[Nsimd]; + + for(int i=0;i BA DC FE HG diff --git a/Grid_vRealF.h b/Grid_vRealF.h index 72f7d9e6..db74b005 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -7,7 +7,12 @@ namespace dpo { class vRealF { protected: fvec v; + public: + + typedef fvec vector_type; + typedef RealF scalar_type; + vRealF(){}; //////////////////////////////////// // Arithmetic operator overloads +,-,* @@ -113,6 +118,37 @@ namespace dpo { ////////////////////////////////// friend inline void vone (vRealF &ret){vsplat(ret,1.0);} friend inline void vzero(vRealF &ret){vsplat(ret,0.0);} + + + ///////////////////////////////////////////////////////////////// + // Extract + ///////////////////////////////////////////////////////////////// + friend inline void extract(vRealF &y,std::vector &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vRealF::Nsimd(); + RealF buf[Nsimd]; + + vstore(y,buf); + + for(int i=0;i &extracted){ + // Bounce off stack is painful + // temporary hack while I figure out the right interface + const int Nsimd = vRealF::Nsimd(); + RealF buf[Nsimd]; + + for(int i=0;i