mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Performance regressed and is OK in icpx 2023.2
This commit is contained in:
parent
afc316f501
commit
d93eac7b1c
@ -464,7 +464,8 @@ public:
|
|||||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||||
//Cell: the padded cell class
|
//Cell: the padded cell class
|
||||||
//gStencil: the precomputed generalized local stencil for the staple
|
//gStencil: the precomputed generalized local stencil for the staple
|
||||||
static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
|
static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil)
|
||||||
|
{
|
||||||
double t0 = usecond();
|
double t0 = usecond();
|
||||||
assert(U_padded.size() == Nd); assert(staple.size() == Nd);
|
assert(U_padded.size() == Nd); assert(staple.size() == Nd);
|
||||||
assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
|
assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
|
||||||
@ -489,7 +490,7 @@ public:
|
|||||||
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
||||||
auto gStencil_v = gStencil.View();
|
auto gStencil_v = gStencil.View();
|
||||||
|
|
||||||
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
|
||||||
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
||||||
stencil_ss = Zero();
|
stencil_ss = Zero();
|
||||||
int off = outer_off;
|
int off = outer_off;
|
||||||
@ -1201,7 +1202,7 @@ public:
|
|||||||
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
||||||
auto gStencil_v = gStencil.View();
|
auto gStencil_v = gStencil.View();
|
||||||
|
|
||||||
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), {
|
||||||
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
||||||
stencil_ss = Zero();
|
stencil_ss = Zero();
|
||||||
int s=offset;
|
int s=offset;
|
||||||
|
@ -43,7 +43,7 @@ class GeneralLocalStencilView {
|
|||||||
int _npoints; // Move to template param?
|
int _npoints; // Move to template param?
|
||||||
GeneralStencilEntry* _entries_p;
|
GeneralStencilEntry* _entries_p;
|
||||||
|
|
||||||
accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) {
|
accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const {
|
||||||
return & this->_entries_p[point+this->_npoints*osite];
|
return & this->_entries_p[point+this->_npoints*osite];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ AC_PROG_RANLIB
|
|||||||
|
|
||||||
############### Get compiler informations
|
############### Get compiler informations
|
||||||
AC_LANG([C++])
|
AC_LANG([C++])
|
||||||
AX_CXX_COMPILE_STDCXX(14,noext,mandatory)
|
AX_CXX_COMPILE_STDCXX(17,noext,mandatory)
|
||||||
AX_COMPILER_VENDOR
|
AX_COMPILER_VENDOR
|
||||||
AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
|
AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
|
||||||
[vendor of C++ compiler that will compile the code])
|
[vendor of C++ compiler that will compile the code])
|
||||||
|
1018
m4/ax_cxx_compile_stdcxx.m4
Normal file
1018
m4/ax_cxx_compile_stdcxx.m4
Normal file
File diff suppressed because it is too large
Load Diff
34
m4/ax_cxx_compile_stdcxx_14.m4
Normal file
34
m4/ax_cxx_compile_stdcxx_14.m4
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
# =============================================================================
|
||||||
|
# https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_14.html
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# SYNOPSIS
|
||||||
|
#
|
||||||
|
# AX_CXX_COMPILE_STDCXX_14([ext|noext], [mandatory|optional])
|
||||||
|
#
|
||||||
|
# DESCRIPTION
|
||||||
|
#
|
||||||
|
# Check for baseline language coverage in the compiler for the C++14
|
||||||
|
# standard; if necessary, add switches to CXX and CXXCPP to enable
|
||||||
|
# support.
|
||||||
|
#
|
||||||
|
# This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX
|
||||||
|
# macro with the version set to C++14. The two optional arguments are
|
||||||
|
# forwarded literally as the second and third argument respectively.
|
||||||
|
# Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for
|
||||||
|
# more information. If you want to use this macro, you also need to
|
||||||
|
# download the ax_cxx_compile_stdcxx.m4 file.
|
||||||
|
#
|
||||||
|
# LICENSE
|
||||||
|
#
|
||||||
|
# Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
|
||||||
|
#
|
||||||
|
# Copying and distribution of this file, with or without modification, are
|
||||||
|
# permitted in any medium without royalty provided the copyright notice
|
||||||
|
# and this notice are preserved. This file is offered as-is, without any
|
||||||
|
# warranty.
|
||||||
|
|
||||||
|
#serial 5
|
||||||
|
|
||||||
|
AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX])
|
||||||
|
AC_DEFUN([AX_CXX_COMPILE_STDCXX_14], [AX_CXX_COMPILE_STDCXX([14], [$1], [$2])])
|
@ -20,7 +20,7 @@ unset OMP_PLACES
|
|||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
qsub jobscript.pbs
|
#qsub jobscript.pbs
|
||||||
|
|
||||||
echo Jobid: $PBS_JOBID
|
echo Jobid: $PBS_JOBID
|
||||||
echo Running on host `hostname`
|
echo Running on host `hostname`
|
||||||
@ -44,3 +44,4 @@ CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -enva
|
|||||||
./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
|
./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
|
||||||
|
$CMD
|
||||||
|
@ -45,8 +45,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A
|
|||||||
|
|
||||||
if [ $PALS_LOCAL_RANKID = 0 ]
|
if [ $PALS_LOCAL_RANKID = 0 ]
|
||||||
then
|
then
|
||||||
onetrace --chrome-device-timeline "$@"
|
# onetrace --chrome-device-timeline "$@"
|
||||||
# "$@"
|
"$@"
|
||||||
else
|
else
|
||||||
"$@"
|
"$@"
|
||||||
fi
|
fi
|
||||||
|
@ -11,6 +11,6 @@ TOOLS=$HOME/tools
|
|||||||
--enable-unified=no \
|
--enable-unified=no \
|
||||||
MPICXX=mpicxx \
|
MPICXX=mpicxx \
|
||||||
CXX=icpx \
|
CXX=icpx \
|
||||||
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
|
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
|
||||||
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
|
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user