mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Updated from upstream and added halo benchmark
This commit is contained in:
commit
428b8ba907
56
.travis.yml
56
.travis.yml
@ -1,56 +0,0 @@
|
||||
language: cpp
|
||||
|
||||
cache:
|
||||
directories:
|
||||
- clang
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- os: osx
|
||||
osx_image: xcode8.3
|
||||
compiler: clang
|
||||
|
||||
before_install:
|
||||
- export GRIDDIR=`pwd`
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
|
||||
|
||||
install:
|
||||
- export CWD=`pwd`
|
||||
- echo $CWD
|
||||
- export CC=$CC$VERSION
|
||||
- export CXX=$CXX$VERSION
|
||||
- echo $PATH
|
||||
- which autoconf
|
||||
- autoconf --version
|
||||
- which automake
|
||||
- automake --version
|
||||
- which $CC
|
||||
- $CC --version
|
||||
- which $CXX
|
||||
- $CXX --version
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
|
||||
|
||||
script:
|
||||
- ./bootstrap.sh
|
||||
- mkdir build
|
||||
- cd build
|
||||
- mkdir lime
|
||||
- cd lime
|
||||
- mkdir build
|
||||
- cd build
|
||||
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
|
||||
- tar xf lime-1.3.2.tar.gz
|
||||
- cd lime-1.3.2
|
||||
- ./configure --prefix=$CWD/build/lime/install
|
||||
- make -j4
|
||||
- make install
|
||||
- cd $CWD/build
|
||||
- ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
|
||||
- make -j4
|
||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
||||
- make check
|
@ -35,6 +35,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#endif
|
||||
#ifdef GRID_SYCl
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
@ -446,6 +449,45 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbfs mapping intended
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
|
||||
#if defined(GRID_SYCL)
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
void * ShmCommBuf ;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// allocate the pointer array for shared windows for our group
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
MPI_Barrier(WorldShmComm);
|
||||
WorldShmCommBufs.resize(WorldShmSize);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
|
||||
assert(WorldShmSize == 1);
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
WorldShmCommBufs[r] = ShmCommBuf;
|
||||
}
|
||||
_ShmAllocBytes=bytes;
|
||||
_ShmAlloc=1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
@ -557,6 +599,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
_ShmAllocBytes=bytes;
|
||||
_ShmAlloc=1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
#ifdef GRID_MPI3_SHMMMAP
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
@ -727,16 +771,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
{
|
||||
#ifdef GRID_CUDA
|
||||
cudaMemset(dest,0,bytes);
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
acceleratorMemSet(dest,0,bytes);
|
||||
#else
|
||||
bzero(dest,bytes);
|
||||
#endif
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
#ifdef GRID_CUDA
|
||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
#else
|
||||
bcopy(src,dest,bytes);
|
||||
#endif
|
||||
|
@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
|
||||
std::time_t t = std::time(nullptr);
|
||||
std::tm tm_ = *std::localtime(&t);
|
||||
std::ostringstream oss;
|
||||
// oss << std::put_time(&tm_, "%c %Z");
|
||||
oss << std::put_time(&tm_, "%c %Z");
|
||||
header.creation_date = oss.str();
|
||||
header.archive_date = header.creation_date;
|
||||
|
||||
|
@ -205,11 +205,20 @@ public:
|
||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
||||
}
|
||||
|
||||
// Preferred interface
|
||||
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
|
||||
std::string file,
|
||||
std::string ens_label = std::string("DWF"))
|
||||
{
|
||||
writeConfiguration(Umu,file,0,1,ens_label);
|
||||
}
|
||||
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
|
||||
std::string file,
|
||||
int two_row,
|
||||
int bits32)
|
||||
int bits32,
|
||||
std::string ens_label = std::string("DWF"))
|
||||
{
|
||||
typedef vLorentzColourMatrixD vobj;
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
@ -219,8 +228,8 @@ public:
|
||||
// Following should become arguments
|
||||
///////////////////////////////////////////
|
||||
header.sequence_number = 1;
|
||||
header.ensemble_id = "UKQCD";
|
||||
header.ensemble_label = "DWF";
|
||||
header.ensemble_id = std::string("UKQCD");
|
||||
header.ensemble_label = ens_label;
|
||||
|
||||
typedef LorentzColourMatrixD fobj3D;
|
||||
typedef LorentzColour2x3D fobj2D;
|
||||
|
@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
||||
|
||||
#ifndef GRID_CUDA
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
////////////////////
|
||||
|
@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour one component spinors with colour index. 5d vec
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
|
||||
NAMESPACE_CHECK(ImplStaggered5dVec);
|
||||
// Deprecate Vec5d
|
||||
//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
|
||||
//NAMESPACE_CHECK(ImplStaggered5dVec);
|
||||
|
||||
|
||||
|
@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
|
||||
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
||||
gauge3 =(uint64_t)&UU[sU]( T );
|
||||
|
||||
|
||||
#undef STAG_VEC5D
|
||||
#ifdef STAG_VEC5D
|
||||
// This is the single precision 5th direction vectorised kernel
|
||||
#include <Grid/simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
|
||||
@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define PERMUTE_DIR3 __asm__ ( \
|
||||
|
@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define LOAD_CHI(b) \
|
||||
#ifdef GRID_SIMT
|
||||
|
||||
#define LOAD_CHI(ptype,b) \
|
||||
const SiteSpinor & ref (b[offset]); \
|
||||
Chi_0=ref()()(0);\
|
||||
Chi_1=ref()()(1);\
|
||||
Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane); \
|
||||
Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane); \
|
||||
Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
|
||||
|
||||
#define LOAD_CHI_COMMS(b) \
|
||||
const SiteSpinor & ref (b[offset]); \
|
||||
Chi_0=coalescedRead(ref()()(0),lane); \
|
||||
Chi_1=coalescedRead(ref()()(1),lane); \
|
||||
Chi_2=coalescedRead(ref()()(2),lane);
|
||||
|
||||
#define PERMUTE_DIR(dir) ;
|
||||
#else
|
||||
#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b)
|
||||
|
||||
#define LOAD_CHI_COMMS(b) \
|
||||
const SiteSpinor & ref (b[offset]); \
|
||||
Chi_0=ref()()(0); \
|
||||
Chi_1=ref()()(1); \
|
||||
Chi_2=ref()()(2);
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_0,Chi_0); \
|
||||
permute##dir(Chi_1,Chi_1); \
|
||||
permute##dir(Chi_2,Chi_2);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// To splat or not to splat depends on the implementation
|
||||
#define MULT(A,UChi) \
|
||||
auto & ref(U[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
Impl::loadLinkElement(U_02,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_12,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_22,ref()(2,2)); \
|
||||
U_00=coalescedRead(ref()(0,0),lane); \
|
||||
U_10=coalescedRead(ref()(1,0),lane); \
|
||||
U_20=coalescedRead(ref()(2,0),lane); \
|
||||
U_01=coalescedRead(ref()(0,1),lane); \
|
||||
U_11=coalescedRead(ref()(1,1),lane); \
|
||||
U_21=coalescedRead(ref()(2,1),lane); \
|
||||
U_02=coalescedRead(ref()(0,2),lane); \
|
||||
U_12=coalescedRead(ref()(1,2),lane); \
|
||||
U_22=coalescedRead(ref()(2,2),lane); \
|
||||
UChi ## _0 = U_00*Chi_0; \
|
||||
UChi ## _1 = U_10*Chi_0;\
|
||||
UChi ## _2 = U_20*Chi_0;\
|
||||
@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define MULT_ADD(U,A,UChi) \
|
||||
auto & ref(U[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
Impl::loadLinkElement(U_02,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_12,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_22,ref()(2,2)); \
|
||||
U_00=coalescedRead(ref()(0,0),lane); \
|
||||
U_10=coalescedRead(ref()(1,0),lane); \
|
||||
U_20=coalescedRead(ref()(2,0),lane); \
|
||||
U_01=coalescedRead(ref()(0,1),lane); \
|
||||
U_11=coalescedRead(ref()(1,1),lane); \
|
||||
U_21=coalescedRead(ref()(2,1),lane); \
|
||||
U_02=coalescedRead(ref()(0,2),lane); \
|
||||
U_12=coalescedRead(ref()(1,2),lane); \
|
||||
U_22=coalescedRead(ref()(2,2),lane); \
|
||||
UChi ## _0 += U_00*Chi_0; \
|
||||
UChi ## _1 += U_10*Chi_0;\
|
||||
UChi ## _2 += U_20*Chi_0;\
|
||||
@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
|
||||
UChi ## _2 += U_22*Chi_2;
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_0,Chi_0); \
|
||||
permute##dir(Chi_1,Chi_1); \
|
||||
permute##dir(Chi_2,Chi_2);
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHI(in); \
|
||||
LOAD_CHI(Perm,in); \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(Perm); \
|
||||
} \
|
||||
} else { \
|
||||
LOAD_CHI(buf); \
|
||||
LOAD_CHI_COMMS(buf); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||
@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHI(in); \
|
||||
LOAD_CHI(Perm,in); \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(Perm); \
|
||||
} \
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
LOAD_CHI(buf); \
|
||||
LOAD_CHI_COMMS(buf); \
|
||||
} \
|
||||
if (local || st.same_node[Dir] ) { \
|
||||
MULT_ADD(U,Dir,even); \
|
||||
@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid);
|
||||
local = SE->_is_local; \
|
||||
if ((!local) && (!st.same_node[Dir]) ) { \
|
||||
nmu++; \
|
||||
{ LOAD_CHI(buf); } \
|
||||
{ LOAD_CHI_COMMS(buf); } \
|
||||
{ MULT_ADD(U,Dir,even); } \
|
||||
}
|
||||
|
||||
#define HAND_DECLARATIONS(Simd) \
|
||||
Simd even_0; \
|
||||
Simd even_1; \
|
||||
Simd even_2; \
|
||||
Simd odd_0; \
|
||||
Simd odd_1; \
|
||||
Simd odd_2; \
|
||||
\
|
||||
Simd Chi_0; \
|
||||
Simd Chi_1; \
|
||||
Simd Chi_2; \
|
||||
\
|
||||
Simd U_00; \
|
||||
Simd U_10; \
|
||||
Simd U_20; \
|
||||
Simd U_01; \
|
||||
Simd U_11; \
|
||||
Simd U_21; \
|
||||
Simd U_02; \
|
||||
Simd U_12; \
|
||||
Simd U_22;
|
||||
|
||||
|
||||
template <class Impl>
|
||||
template <int Naik> accelerator_inline
|
||||
@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
|
||||
HAND_DECLARATIONS(Simt);
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
|
||||
calcSiteSpinor result;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
StencilEntry *SE;
|
||||
@ -215,7 +241,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
vstream(out[sF],result);
|
||||
coalescedWrite(out[sF],result);
|
||||
}
|
||||
}
|
||||
|
||||
@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
|
||||
HAND_DECLARATIONS(Simt);
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
|
||||
calcSiteSpinor result;
|
||||
int offset, ptype, local, perm;
|
||||
|
||||
StencilEntry *SE;
|
||||
@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
|
||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||
zeroit(even_0); zeroit(even_1); zeroit(even_2);
|
||||
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
|
||||
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||
@ -294,7 +305,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
vstream(out[sF],result);
|
||||
coalescedWrite(out[sF],result);
|
||||
}
|
||||
}
|
||||
|
||||
@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
const int lane=acceleratorSIMTlane(Nsimd);
|
||||
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
|
||||
HAND_DECLARATIONS(Simt);
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
|
||||
calcSiteSpinor result;
|
||||
int offset, ptype, local;
|
||||
|
||||
StencilEntry *SE;
|
||||
@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
// int sF=s+LLs*sU;
|
||||
{
|
||||
|
||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||
zeroit(even_0); zeroit(even_1); zeroit(even_2);
|
||||
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
|
||||
int nmu=0;
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||
@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
out[sF] = out[sF] + result;
|
||||
coalescedWrite(out[sF] , out(sF)+ result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
*/
|
||||
#undef LOAD_CHI
|
||||
#undef HAND_DECLARATIONS
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -85,21 +85,18 @@ public:
|
||||
|
||||
std::cout << GridLogDebug << "Stout smearing started\n";
|
||||
|
||||
// Smear the configurations
|
||||
// C contains the staples multiplied by some rho
|
||||
u_smr = U ; // set the smeared field to the current gauge field
|
||||
SmearBase->smear(C, U);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
if( mu == OrthogDim )
|
||||
tmp = 1.0; // Don't smear in the orthogonal direction
|
||||
else {
|
||||
tmp = peekLorentz(C, mu);
|
||||
if( mu == OrthogDim ) continue ;
|
||||
// u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
|
||||
Umu = peekLorentz(U, mu);
|
||||
iq_mu = Ta(
|
||||
tmp *
|
||||
adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper
|
||||
tmp = peekLorentz(C, mu);
|
||||
iq_mu = Ta( tmp * adj(Umu));
|
||||
exponentiate_iQ(tmp, iq_mu);
|
||||
}
|
||||
pokeLorentz(u_smr, tmp * Umu, mu); // u_smr = exp(iQ_mu)*U_mu
|
||||
pokeLorentz(u_smr, tmp * Umu, mu);
|
||||
}
|
||||
std::cout << GridLogDebug << "Stout smearing completed\n";
|
||||
};
|
||||
|
@ -65,7 +65,8 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
|
||||
#else
|
||||
|
||||
|
||||
#ifndef GRID_SYCL
|
||||
//#ifndef GRID_SYCL
|
||||
#if 1
|
||||
// Use the scalar as our own complex on GPU ... thrust::complex or std::complex
|
||||
template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
|
||||
typename vsimd::scalar_type
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*************************************************************************************
|
||||
|
||||
n
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/tensors/Tensor_extract_merge.h
|
||||
@ -153,7 +153,7 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
|
||||
// Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj, class sobj> accelerator
|
||||
void extract(const vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
|
||||
void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
|
||||
{
|
||||
typedef typename GridTypeMapper<sobj>::scalar_type sobj_scalar_type;
|
||||
typedef typename GridTypeMapper<vobj>::scalar_type scalar_type;
|
||||
@ -181,7 +181,7 @@ void extract(const vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
|
||||
// Merge bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj, class sobj> accelerator
|
||||
void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
|
||||
void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
|
||||
{
|
||||
typedef typename GridTypeMapper<sobj>::scalar_type sobj_scalar_type;
|
||||
typedef typename GridTypeMapper<vobj>::scalar_type scalar_type;
|
||||
|
@ -257,11 +257,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
unsigned long nt=acceleratorThreads(); \
|
||||
unsigned long unum1 = num1; \
|
||||
unsigned long unum2 = num2; \
|
||||
if(nt < 8)nt=8; \
|
||||
cl::sycl::range<3> local {nt,1,nsimd}; \
|
||||
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
|
||||
cgh.parallel_for<class dslash>( \
|
||||
cl::sycl::nd_range<3>(global,local), \
|
||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ { \
|
||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
|
||||
[[intel::reqd_sub_group_size(8)]] \
|
||||
{ \
|
||||
auto iter1 = item.get_global_id(0); \
|
||||
auto iter2 = item.get_global_id(1); \
|
||||
auto lane = item.get_global_id(2); \
|
||||
@ -457,7 +460,7 @@ accelerator_inline void acceleratorSynchronise(void)
|
||||
__syncwarp();
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
cl::sycl::detail::workGroupBarrier();
|
||||
//cl::sycl::detail::workGroupBarrier();
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
__syncthreads();
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
|
||||
# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview)
|
||||
|
||||
**Data parallel C++ mathematical object library.**
|
||||
|
||||
|
99
benchmarks/Benchmark_halo.cpp
Normal file
99
benchmarks/Benchmark_halo.cpp
Normal file
@ -0,0 +1,99 @@
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#define NCOL 2
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
constexpr int Ndim = 3;
|
||||
typedef ScalarAdjMatrixImplTypes<vComplex,NCOL>::Field SUNField;
|
||||
typedef typename SUNField::vector_object vobj;
|
||||
typedef CartesianStencil<vobj, vobj,int> Stencil;
|
||||
|
||||
int main(int argc, char **argv){
|
||||
|
||||
// Initialise grid //////////////////////////////////////////////
|
||||
Grid_init(&argc,&argv);
|
||||
int threads = GridThread::GetThreads();
|
||||
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
|
||||
|
||||
// Module ///////////////////////////////////////////////////////
|
||||
GridModule GridMod;
|
||||
if (GridDefaultLatt().size() != Ndim){
|
||||
std::cout << GridLogError << "Incorrect dimension of the grid\n. Expected dim=" << Ndim << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (GridDefaultMpi().size() != Ndim){
|
||||
std::cout << GridLogError << "Incorrect dimension of the mpi grid\n. Expected dim=" << Ndim << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
GridMod.set_full(new GridCartesian(GridDefaultLatt(),
|
||||
GridDefaultSimd(Ndim, vComplex::Nsimd()),
|
||||
GridDefaultMpi()));
|
||||
GridMod.set_rb(new GridRedBlackCartesian(GridMod.get_full()));
|
||||
auto grid = GridMod.get_full();
|
||||
|
||||
GridParallelRNG pRNG(grid);
|
||||
pRNG.SeedFixedIntegers({11,84,79,47,90});
|
||||
|
||||
// Stencil //////////////////////////////////////////////////////
|
||||
int npoint = 2 * Ndim;
|
||||
std::vector<int> directions(npoint);
|
||||
std::vector<int> displacements(npoint);
|
||||
|
||||
for (int mu = 0; mu < Ndim; mu++){
|
||||
directions[mu] = mu;
|
||||
directions[mu + Ndim] = mu;
|
||||
displacements[mu] = 1;
|
||||
displacements[mu + Ndim] = -1;
|
||||
}
|
||||
|
||||
Stencil Stencil_phi(grid, npoint, 0, directions, displacements,0);
|
||||
SimpleCompressor<vobj> compressor;
|
||||
|
||||
// Field /////////////////////////////////////////////////////////
|
||||
SUNField phi(grid);
|
||||
|
||||
// MPI sublattice surface area ///////////////////////////////////
|
||||
|
||||
int mpi_area = 0;
|
||||
int mpi_face;
|
||||
// Calculates the total surface area of an MPI hypercube
|
||||
for (int mu_ex=0;mu_ex<Ndim;++mu_ex){
|
||||
mpi_face = 1;
|
||||
|
||||
for (int mu=0; mu<Ndim; ++mu){
|
||||
if (mu != mu_ex) mpi_face *= GridDefaultLatt()[mu]/GridDefaultMpi()[mu];
|
||||
}
|
||||
|
||||
mpi_area += 2*mpi_face;
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Total MPI surface area = " << mpi_area << std::endl;
|
||||
|
||||
// Benchmarking //////////////////////////////////////////////////
|
||||
|
||||
int nloops = 100;
|
||||
double start;
|
||||
double time;
|
||||
double avgtime = 0;
|
||||
double bytes = sizeof(Complex)*NCOL*NCOL*mpi_area*4.;
|
||||
// 4 is for the two reads and writes in receiving and sending data
|
||||
// I don't know if I am to consider all data being sent and received in all mpi processes across all gpus
|
||||
double avgbandwidth = 0;
|
||||
|
||||
for (int i=0;i<nloops;++i){
|
||||
|
||||
start = usecond();
|
||||
Stencil_phi.HaloExchange(phi, compressor);
|
||||
time = usecond();
|
||||
std::cout << GridLogMessage << "Exchange " << i << " time (us) = " << time-start << " | " << "Bandwidth (GB/s) = " << (bytes/1e9)/(time/1e6) << std::endl;
|
||||
avgtime += time/double(nloops);
|
||||
avgbandwidth += (bytes/1e9)/(time/1e6)/double(nloops);
|
||||
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Average time (us) = " << avgtime << " | Average bandwidth (GB/s) = " << avgbandwidth << std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
|
||||
}
|
@ -66,7 +66,9 @@ int main(int argc, char** argv)
|
||||
// Set up RNGs
|
||||
std::vector<int> seeds4({1, 2, 3, 4});
|
||||
std::vector<int> seeds5({5, 6, 7, 8});
|
||||
GridSerialRNG sRNG;
|
||||
GridParallelRNG RNG5(FGrid);
|
||||
sRNG.SeedFixedIntegers(seeds5);
|
||||
RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid);
|
||||
RNG4.SeedFixedIntegers(seeds4);
|
||||
@ -84,7 +86,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu,sRNG, RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
@ -94,7 +96,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu,sRNG, RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
|
@ -74,6 +74,9 @@ int main(int argc, char** argv)
|
||||
RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid);
|
||||
RNG4.SeedFixedIntegers(seeds4);
|
||||
GridSerialRNG sRNG;
|
||||
RNG4.SeedFixedIntegers(seeds4);
|
||||
sRNG.SeedFixedIntegers(seeds5);
|
||||
|
||||
// Random gauge field
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
@ -90,7 +93,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu,sRNG, RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
@ -100,7 +103,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu,sRNG, RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
|
@ -68,8 +68,10 @@ int main(int argc, char** argv)
|
||||
// Set up RNGs
|
||||
std::vector<int> seeds4({1, 2, 3, 4});
|
||||
std::vector<int> seeds5({5, 6, 7, 8});
|
||||
GridSerialRNG sRNG;
|
||||
GridParallelRNG RNG5(FGrid);
|
||||
RNG5.SeedFixedIntegers(seeds5);
|
||||
sRNG.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid);
|
||||
RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
@ -86,7 +88,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu, sRNG,RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
@ -96,7 +98,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu, sRNG,RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
|
@ -73,7 +73,9 @@ int main(int argc, char** argv)
|
||||
std::vector<int> seeds4({1, 2, 3, 4});
|
||||
std::vector<int> seeds5({5, 6, 7, 8});
|
||||
GridParallelRNG RNG5(FGrid);
|
||||
GridSerialRNG sRNG;
|
||||
RNG5.SeedFixedIntegers(seeds5);
|
||||
sRNG.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid);
|
||||
RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
@ -91,7 +93,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu, sRNG, RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
@ -101,7 +103,7 @@ int main(int argc, char** argv)
|
||||
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
|
||||
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);
|
||||
|
||||
Meofa.refresh(Umu, RNG5);
|
||||
Meofa.refresh(Umu, sRNG, RNG5);
|
||||
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
|
||||
}
|
||||
|
||||
|
@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
;
|
||||
|
||||
|
||||
|
||||
@ -59,6 +58,10 @@ int main (int argc, char ** argv)
|
||||
double beta = 1.0;
|
||||
double c1 = 0.331;
|
||||
|
||||
const int nu = 1;
|
||||
std::vector<int> twists(Nd,0);
|
||||
twists[nu] = 1;
|
||||
ConjugateGimplD::setDirections(twists);
|
||||
ConjugatePlaqPlusRectangleActionR Action(beta,c1);
|
||||
//ConjugateWilsonGaugeActionR Action(beta);
|
||||
//WilsonGaugeActionR Action(beta);
|
||||
|
@ -61,7 +61,9 @@ int main (int argc, char ** argv)
|
||||
std::vector<int> seeds({1,2,3,4});
|
||||
|
||||
GridParallelRNG pRNG(&Grid);
|
||||
GridSerialRNG sRNG;
|
||||
pRNG.SeedFixedIntegers(seeds);
|
||||
sRNG.SeedFixedIntegers(seeds);
|
||||
|
||||
typedef PeriodicGimplR Gimpl;
|
||||
typedef WilsonGaugeAction<Gimpl> GaugeAction;
|
||||
@ -115,7 +117,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
integrator.setMomentumFilter(filter);
|
||||
|
||||
integrator.refresh(U, pRNG); //doesn't actually change the gauge field
|
||||
integrator.refresh(U, sRNG, pRNG); //doesn't actually change the gauge field
|
||||
|
||||
//Check the momentum is zero on the boundary
|
||||
const auto &P = integrator.getMomentum();
|
||||
|
Loading…
Reference in New Issue
Block a user