mirror of https://github.com/paboyle/Grid.git
synced 2025-06-24 02:32:02 +01:00

Compare commits: 26 commits on branch feature/di... (head 477ebf24f4)
Commit SHA1s, newest first:

477ebf24f4
0d5639f707
413312f9a9
03508448f8
e1e5c75023
9296299b61
913fbca74a
60dfb49afa
554c238359
f922adf05e
188d2c7a4d
17d7177105
bb0a0da47a
84110166e4
d32b923b6c
2ab1af5754
5f8892bf03
f14e7e51e7
042ab1a052
2df98a99bc
315ea18be2
a9c2e1df03
da4daea57a
e346154c5d
3ca0de1c40
c7205d2a73
@@ -16,6 +16,7 @@
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
+#include <stdio.h>
 #include <signal.h>
 #include <ctime>
@@ -262,7 +262,7 @@ public:
     autoView( Tnp_v , (*Tnp), AcceleratorWrite);
     autoView( Tnm_v , (*Tnm), AcceleratorWrite);
     const int Nsimd = CComplex::Nsimd();
-    accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+    accelerator_for(ss, FineGrid->oSites(), Nsimd, {
       coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
       coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
     });
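Note: this hunk and the next both swap the non-blocking launch macro for the blocking one, so the Chebyshev recurrence now waits for each kernel before reusing its output. A host-only stand-in sketch of the assumed contract between the two macros (simplified: the Nsimd argument is dropped, and these are not Grid's actual definitions — the convention assumed is that the NB form only enqueues while the plain form appends a barrier):

#include <cstdio>

// Stand-ins for illustration only.
#define accelerator_barrier()         puts("(barrier: wait for queued work)")
#define accelerator_forNB(i, n, ...)  for (int i = 0; i < (int)(n); i++) { __VA_ARGS__; }
#define accelerator_for(i, n, ...)    do { accelerator_forNB(i, n, __VA_ARGS__) accelerator_barrier(); } while (0)

int main(void) {
  accelerator_for(ss, 2, printf("site %d\n", ss));    // returns only after the barrier
  accelerator_forNB(ss, 2, printf("site %d\n", ss));  // caller must barrier later
  accelerator_barrier();
  return 0;
}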
@@ -264,7 +264,7 @@ public:
     auto Tnp_v = Tnp->View();
     auto Tnm_v = Tnm->View();
     constexpr int Nsimd = vector_type::Nsimd();
-    accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+    accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
       coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
       coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
     });
@@ -392,9 +392,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
   }

-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-    this->StencilSendToRecvFromComplete(list,dir);
-  }
+  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  //    this->StencilSendToRecvFromComplete(list,dir);
+  //  }

   return off_node_bytes;
 }
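Note: with the sequential-policy completion commented out of Begin, the caller becomes responsible for invoking StencilSendToRecvFromComplete later, enabling compute/comms overlap. The split-phase shape of this API, sketched in plain MPI (illustrative only, not Grid's implementation, which also routes intra-node neighbours through shared-memory device copies as the hunk shows):

#include <mpi.h>
#include <vector>

// Begin: post non-blocking transfers, recording the requests.
static void exchangeBegin(std::vector<MPI_Request> &reqs,
                          double *xmit, int to, double *recv, int from,
                          int n, MPI_Comm comm) {
  MPI_Request r;
  MPI_Irecv(recv, n, MPI_DOUBLE, from, 0, comm, &r); reqs.push_back(r);
  MPI_Isend(xmit, n, MPI_DOUBLE, to,   0, comm, &r); reqs.push_back(r);
}

// Complete: wait on everything recorded by Begin.
static void exchangeComplete(std::vector<MPI_Request> &reqs) {
  MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
  reqs.clear();
}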
Grid/json/json.hpp: 23387 lines changed (diff suppressed because it is too large)
@@ -117,19 +117,19 @@ public:
     typedef decltype(coalescedRead(*in)) sobj;
     typedef decltype(coalescedRead(*out0)) hsobj;

-    unsigned int Nsimd = vobj::Nsimd();
+    constexpr unsigned int Nsimd = vobj::Nsimd();
     unsigned int mask = Nsimd >> (type + 1);
     int lane = acceleratorSIMTlane(Nsimd);
     int j0 = lane &(~mask); // inner coor zero
     int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];
-    const vobj *vp1 = &in[m];
-    const vobj *vp = (lane&mask) ? vp1:vp0;
-    auto sa = coalescedRead(*vp,j0);
-    auto sb = coalescedRead(*vp,j1);
+    const vobj *vp0 = &in[k]; // out0[j] = merge low bit of type from in[k] and in[m]
+    const vobj *vp1 = &in[m]; // out1[j] = merge hi bit of type from in[k] and in[m]
+    const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
+    auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
+    auto sb = coalescedRead(*vp,j1); // lane to read for out 1
     hsobj psa, psb;
-    projector::Proj(psa,sa,mu,dag);
-    projector::Proj(psb,sb,mu,dag);
+    projector::Proj(psa,sa,mu,dag); // spin project the result0
+    projector::Proj(psb,sb,mu,dag); // spin project the result1
     coalescedWrite(out0[j],psa);
     coalescedWrite(out1[j],psb);
 #else
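Note: the new comments describe a butterfly exchange across SIMD lanes — each lane pairs a low-half and a high-half partner. A standalone illustration of the j0/j1 index arithmetic (hypothetical Nsimd and type values, plain C++):

#include <cstdio>

int main(void) {
  const unsigned int Nsimd = 8;             // assumed SIMD width for illustration
  const int type = 0;                       // exchange the outermost coordinate
  unsigned int mask = Nsimd >> (type + 1);  // = 4: selects the high half
  for (int lane = 0; lane < (int)Nsimd; lane++) {
    int j0 = lane & (~mask);                // partner lane in the low half
    int j1 = lane | mask;                   // partner lane in the high half
    printf("lane %d: j0=%d j1=%d reads %s\n",
           lane, j0, j1, (lane & mask) ? "vp1" : "vp0");
  }
  return 0;
}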
@@ -498,6 +498,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
+    acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}

@@ -505,11 +506,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
+    acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
@@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);

   typedef Lattice<SiteLink>  LinkField;
   typedef Lattice<SiteField> Field;
-  typedef Field     ComplexField;
+  typedef LinkField ComplexField;
 };

 typedef QedGImpl<vComplex> QedGImplR;
@@ -26,7 +26,7 @@
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid/Grid.h>
-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+#ifndef GRID_HIP

 NAMESPACE_BEGIN(Grid);
@@ -82,7 +82,7 @@ void JSONWriter::writeDefault(const std::string &s, const std::string &x)
   if (s.size())
     ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ;
   else
-    ss_ << os.str() << " ," ;
+    ss_ << "\""<< os.str() << "\" ," ;
 }

 // Reader implementation ///////////////////////////////////////////////////////
@@ -54,7 +54,7 @@ namespace Grid
     void pop(void);
     template <typename U>
     void writeDefault(const std::string &s, const U &x);
-#ifdef __NVCC__
+#if defined(GRID_CUDA) || defined(GRID_HIP)
     void writeDefault(const std::string &s, const Grid::ComplexD &x)
     {
       std::complex<double> z(real(x),imag(x));

@@ -101,7 +101,7 @@ namespace Grid
     void readDefault(const std::string &s, std::vector<U> &output);
     template <typename U, typename P>
     void readDefault(const std::string &s, std::pair<U,P> &output);
-#ifdef __NVCC__
+#if defined(GRID_CUDA) || defined(GRID_HIP)
     void readDefault(const std::string &s, ComplexD &output)
     {
       std::complex<double> z;
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+#ifndef GRID_HIP
 #include "JSON_IO.h"
 #endif
@@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
 ///////////////////////////////////////////////////////////////////
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-                                 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+                                 commVector<cobj *> pointers,
+                                 int dimension,int plane,
+                                 int cbmask,compressor &compress,int type) __attribute__((noinline));

 template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-                                 Vector<cobj *> pointers,int dimension,int plane,int cbmask,
+void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
+                                 const Lattice<vobj> &rhs,
+                                 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
                                  compressor &compress,int type)
 {
   assert( (table.size()&0x1)==0);

@@ -92,11 +95,12 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
   int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane

   auto rhs_v = rhs.View(AcceleratorRead);
+  auto rhs_p = &rhs_v[0];
   auto p0=&pointers[0][0];
   auto p1=&pointers[1][0];
   auto tp=&table[0];
   accelerator_forNB(j, num, vobj::Nsimd(), {
-    compress.CompressExchange(p0,p1, &rhs_v[0], j,
+    compress.CompressExchange(p0,p1, rhs_p, j,
                               so+tp[2*j ].second,
                               so+tp[2*j+1].second,
                               type);
@@ -230,8 +234,8 @@ public:
   };
   struct Merge {
     cobj * mpointer;
-    Vector<scalar_object *> rpointers;
-    Vector<cobj *> vpointers;
+    //    std::vector<scalar_object *> rpointers;
+    std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
   };
@@ -406,6 +410,7 @@ public:
       comms_bytes+=bytes;
       shm_bytes  +=2*Packets[i].bytes-bytes;
     }
+    _grid->StencilBarrier();// Synch shared memory on a single nodes
   }

   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@@ -420,7 +425,7 @@ public:
   ////////////////////////////////////////////////////////////////////////
   void Communicate(void)
   {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
+    if ( 0 ){
       thread_region {
         // must be called in parallel region
         int mythread = thread_num();
@@ -569,7 +574,7 @@ public:
     d.buffer_size = buffer_size;
     dv.push_back(d);
   }
-  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
+  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
     m.type     = type;
     m.mpointer = merge_p;
@@ -582,6 +587,7 @@ public:
   }
   template<class decompressor> void CommsMergeSHM(decompressor decompress) {
     mpi3synctime-=usecond();
+    accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime+=usecond();
    shmmergetime-=usecond();
@@ -1114,8 +1120,8 @@ public:
     int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
     assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);

-    Vector<cobj *> rpointers(maxl);
-    Vector<cobj *> spointers(maxl);
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);

     ///////////////////////////////////////////
     // Work out what to send where
@@ -195,12 +195,15 @@ void acceleratorInit(void)
 #ifdef GRID_SYCL

 cl::sycl::queue *theGridAccelerator;
+cl::sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
   int nDevices = 1;
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
+  //  theCopyAccelerator = new sycl::queue (selectedDevice);
+  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.

 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
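Note: the intent of theCopyAccelerator is a dedicated queue for data transfers so they can overlap compute, although this commit still aliases it to theGridAccelerator. A minimal sketch of the two-queue pattern in SYCL 2020 spelling (newer than the cl::sycl names above; assumes a GPU supporting USM device allocations):

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue compute{sycl::gpu_selector_v};   // kernels go here
  sycl::queue copy{compute.get_device()};      // separate queue for transfers
  const size_t n = 1 << 20;
  float *a = sycl::malloc_device<float>(n, compute);
  float *b = sycl::malloc_device<float>(n, compute);
  std::vector<float> host(n, 1.0f);

  auto e = copy.memcpy(a, host.data(), n * sizeof(float));  // H2D on the copy queue
  compute.parallel_for(sycl::range<1>(n),
                       [=](sycl::id<1> i) { b[i] = 2.0f; }); // independent kernel
  e.wait();        // synchronise only the transfer
  compute.wait();  // then the compute queue
  sycl::free(a, compute);
  sycl::free(b, compute);
  return 0;
}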
@@ -247,7 +247,6 @@ inline int acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////

 #ifdef GRID_SYCL
 NAMESPACE_END(Grid);
 #include <CL/sycl.hpp>

@@ -262,6 +261,7 @@ NAMESPACE_END(Grid);
 NAMESPACE_BEGIN(Grid);

 extern cl::sycl::queue *theGridAccelerator;
+extern cl::sycl::queue *theCopyAccelerator;

 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@@ -289,7 +289,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     cgh.parallel_for( \
       cl::sycl::nd_range<3>(global,local), \
       [=] (cl::sycl::nd_item<3> item) /*mutable*/ \
-      [[intel::reqd_sub_group_size(8)]] \
+      [[intel::reqd_sub_group_size(16)]] \
       { \
         auto iter1 = item.get_global_id(0); \
         auto iter2 = item.get_global_id(1); \
@@ -298,19 +298,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
       }); \
     });

-#define accelerator_barrier(dummy) theGridAccelerator->wait();
+#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
-  theGridAccelerator->memcpy(to,from,bytes);
-}
-inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
+inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int acceleratorIsCommunicable(void *ptr)
 {
 #if 0
@@ -511,7 +511,16 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
 inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 #endif

+//////////////////////////////////////////////
+// Fencing needed ONLY for SYCL
+//////////////////////////////////////////////
+
+#ifdef GRID_SYCL
+inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
+#else
+// Ordering within a stream guaranteed on Nvidia & AMD
+inline void acceleratorFenceComputeStream(void){ };
+#endif

 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
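Note: the comment in the hunk gives the rationale — on CUDA and HIP, work issued to one stream executes in order, so the fence can be empty, whereas the SYCL build needs a real wait before another queue (or MPI) consumes kernel output. A host-only stand-in showing the same dispatch shape (not Grid's actual definitions):

#include <cstdio>

#ifdef GRID_SYCL
static void accelerator_barrier() { puts("barrier: wait on compute queue"); }
static void acceleratorFenceComputeStream(void) { accelerator_barrier(); }
#else
static void acceleratorFenceComputeStream(void) { /* in-stream ordering suffices */ }
#endif

int main(void) {
  // e.g. after launching a kernel that fills a comms buffer:
  acceleratorFenceComputeStream(); // no-op on CUDA/HIP builds, a barrier on SYCL
  puts("buffer now safe for the copy engine / MPI");
  return 0;
}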
@@ -27,6 +27,7 @@
 /* END LEGAL */
 extern "C" {
 #include <openssl/sha.h>
+#include <openssl/evp.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"

@@ -70,10 +71,8 @@ public:
   static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
   {
     std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
-    SHA256_CTX sha256;
-    SHA256_Init  (&sha256);
-    SHA256_Update(&sha256, data,bytes);
-    SHA256_Final (&hash[0], &sha256);
+    auto digest = EVP_get_digestbyname("SHA256");
+    EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL);
     return hash;
   }
   static inline std::vector<int> sha256_seeds(const std::string &s)
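Note: this hunk replaces the low-level SHA256_Init/Update/Final calls, deprecated since OpenSSL 3.0, with the one-shot EVP interface. A self-contained example of the same call sequence (EVP_sha256() would also work in place of the name lookup):

#include <cstdio>
#include <vector>
#include <openssl/evp.h>
#include <openssl/sha.h>   // for SHA256_DIGEST_LENGTH

int main() {
  const char data[] = "grid";
  std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
  const EVP_MD *digest = EVP_get_digestbyname("SHA256");
  if (!digest) return 1;
  unsigned int len = 0;
  // One-shot digest: init, update, and final in a single call.
  EVP_Digest(data, sizeof(data) - 1, &hash[0], &len, digest, NULL);
  for (unsigned int i = 0; i < len; i++) printf("%02x", hash[i]);
  printf("\n");
  return 0;
}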
@@ -148,7 +148,7 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
-- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
+- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 64 bytes).
 - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
benchmarks/Benchmark_halo.cc (new file, 131 lines)
@@ -0,0 +1,131 @@
/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./benchmarks/Benchmark_dwf.cc
    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif

#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif

using namespace std;
using namespace Grid;

template<class d>
struct scal {
  d internal;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  Coordinate latt4= GridDefaultLatt();
  Coordinate mpi  = GridDefaultMpi();
  Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());

  GridLogLayout();

  int Ls=16;
  for(int i=0;i<argc;i++)
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
    }


  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi);
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
  GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
  GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

  LatticeFermionF src (FGrid); random(RNG5,src);
  RealD N2 = 1.0/::sqrt(norm2(src));
  src = src*N2;

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  LatticeGaugeFieldF Umu(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

  RealD mass=0.1;
  RealD M5  =1.8;

  RealD NP = UGrid->_Nprocessors;
  RealD NN = UGrid->NodeCount();

  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  const int ncall = 500;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt         "<<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
    typename DomainWallFermionF::Compressor compressor(0);
    FGrid->Barrier();
    Dw.Stencil.HaloExchangeOptGather(src,compressor);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Stencil.HaloExchangeOptGather(src,compressor);
    }
    double t1=usecond();
    FGrid->Barrier();

    double bytes=0.0;
    if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3];
    if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3];
    if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3];
    if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2];
    bytes = bytes * Ls * 8.* (24.+12.)* 2.0;

    std::cout<<GridLogMessage << "Gather us /call = "<< (t1-t0)/ncall<<std::endl;
    std::cout<<GridLogMessage << "Gather MBs /call = "<< bytes*ncall/(t1-t0)<<std::endl;
  }

  Grid_finalize();
  exit(0);
}
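Note: the bytes formula above estimates the gathered halo volume from the face sizes of each direction. A standalone re-evaluation for one assumed run configuration (--grid 32.32.32.64 --mpi 1.1.1.2, Ls=16, matching the two-tile script below); the trailing factor 8.*(24.+12.)*2.0 is taken from the benchmark as given:

#include <cstdio>

int main() {
  int latt[4] = {32, 32, 32, 64};   // assumed global lattice
  int mpi[4]  = {1, 1, 1, 2};       // assumed rank decomposition
  int Ls = 16;
  double bytes = 0.0;
  // Same face-volume accumulation as the benchmark.
  if (mpi[0]) bytes += 1.0 * latt[1] * latt[2] * latt[3];
  if (mpi[1]) bytes += 1.0 * latt[0] * latt[2] * latt[3];
  if (mpi[2]) bytes += 1.0 * latt[0] * latt[1] * latt[3];
  if (mpi[3]) bytes += 1.0 * latt[0] * latt[1] * latt[2];
  bytes = bytes * Ls * 8. * (24. + 12.) * 2.0;
  printf("halo gather volume per call: %.0f bytes (%.1f MB)\n",
         bytes, bytes / 1.0e6);
  return 0;
}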
systems/PVC/benchmarks/run-1tile.sh (new file, 62 lines)
@@ -0,0 +1,62 @@
#!/bin/sh
##SBATCH -p PVC-SPR-QZEH
##SBATCH -p PVC-ICX-QZNW
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C

source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

export NT=16

export I_MPI_OFFLOAD=1
export I_MPI_OFFLOAD_TOPOLIB=level_zero
export I_MPI_OFFLOAD_DOMAIN_SIZE=-1

# export IGC_EnableLSCFenceUGMBeforeEOT=0
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
export SYCL_DEVICE_FILTER=gpu,level_zero
#export IGC_ShaderDumpEnable=1
#export IGC_DumpToCurrentDir=1
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
export ZE_AFFINITY_MASK=0.0
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8

export ZE_AFFINITY_MASK=0
export I_MPI_OFFLOAD_CELL=device
export EnableImplicitScaling=1
export EnableWalkerPartition=1

#mpiexec -launcher ssh -n 2 -host localhost vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1

#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1

#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1

#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1

#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0
#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1
systems/PVC/benchmarks/run-2tile-mpi.sh (new executable file, 33 lines)
@@ -0,0 +1,33 @@
#!/bin/bash
##SBATCH -p PVC-SPR-QZEH
##SBATCH -p PVC-ICX-QZNW

#SBATCH -p QZ1J-ICX-PVC

source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

export NT=16


# export IGC_EnableLSCFenceUGMBeforeEOT=0
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
#export IGC_ShaderDumpEnable=1
#export IGC_DumpToCurrentDir=1
export I_MPI_OFFLOAD=1
export I_MPI_OFFLOAD_TOPOLIB=level_zero
export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.1x2.log
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.2x1.log

mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
systems/PVC/benchmarks/wrap.sh (new executable file, 14 lines)
@@ -0,0 +1,14 @@
#!/bin/sh

export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID

echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK


if [ $MPI_LOCALRANKID = "0" ]
then
#  ~psteinbr/build_pti/ze_tracer -h $@
  onetrace --chrome-device-timeline $@
else
  $@
fi
systems/PVC/config-command (new file, 15 lines)
@@ -0,0 +1,15 @@
INSTALL=/nfs/site/home/azusayax/install
../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi \
	--disable-accelerator-cshift \
	--disable-gparity \
	--disable-fermion-reps \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-unified=yes \
	CXX=mpicxx \
	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
	CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
systems/PVC/setup.sh (new file, 11 lines)
@@ -0,0 +1,11 @@
export https_proxy=http://proxy-chain.intel.com:911
export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH

module load intel-release
source /opt/intel/oneapi/PVC_setup.sh
#source /opt/intel/oneapi/ATS_setup.sh
module load intel/mpich/pvc45.3
export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH

#clsh embargo-ci-neo-022845
#source /opt/intel/vtune_amplifier/amplxe-vars.sh
@@ -793,6 +793,7 @@ int main (int argc, char ** argv)
   }
   std::cout <<" OK ! "<<std::endl;

+#ifdef USE_FP16
   // Double to Half
   std::cout << GridLogMessage<< "Double to half" ;
   precisionChange(&H[0],&D[0],Ndp);

@@ -822,6 +823,7 @@ int main (int argc, char ** argv)
     assert( tmp < 1.0e-3 );
   }
   std::cout <<" OK ! "<<std::endl;
+#endif

 }
 Grid_finalize();