Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-09 23:45:36 +00:00.

Compare commits: 34 commits, dfc4d4f857 ... 927f8b800e

Commits in this range:
927f8b800e, ee1b8bbdbd, 3f1636637d, 2e570f5300, 9f89486df5, 22b43b86cb,
3c9012676a, b507fe209c, 6cd2d8fcd5, 0a816b5509, 1c8b807c2e, 66391f84f2,
97f7a9ecb3, 15878f7613, e0d5e3c6c7, 6f3455900e, b5659d106e, 4b43307402,
09af8c25a2, 9514035b87, 1514b4f137, ab2de131bd, 5af8da76d7, b8b9dc952d,
79a6ed32d8, caa5f97723, f2648e94b9, e6ed516052, e2a3dae1f2, 452bf2e907,
e8c29e2fe5, da9cbfc7cc, 6b9f07c1ed, 5f75735dab
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
     basis_v.push_back(basis[k].View(AcceleratorWrite));
   }

-#if ( (!defined(GRID_CUDA)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
   int max_threads = thread_max();
   Vector < vobj > Bt(Nm * max_threads);
   thread_region
@@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
+#include <Grid/lattice/Lattice_slicesum_core.h>

 NAMESPACE_BEGIN(Grid);

@@ -448,19 +449,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
   int e1= grid->_slice_nblock[orthogdim];
   int e2= grid->_slice_block [orthogdim];
   int stride=grid->_slice_stride[orthogdim];
+  int ostride=grid->_ostride[orthogdim];

-  // sum over reduced dimension planes, breaking out orthog dir
-  // Parallel over orthog direction
-  autoView( Data_v, Data, CpuRead);
-  thread_for( r,rd, {
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int ss= so+n*stride+b;
-        lvSum[r]=lvSum[r]+Data_v[ss];
-      }
-    }
-  });
+  //Reduce Data down to lvSum
+  sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);

   // Sum across simd lanes in the plane, breaking out orthog dir.
   Coordinate icoor(Nd);
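For orientation, both the removed inline loop and the new sliceSumReduction call compute the same quantity: one partial sum per plane of the lattice along the chosen orthogonal direction, later reduced across SIMD lanes and MPI ranks. As a sketch in continuum-style notation (not copied from the sources), with mu the orthogonal direction (orthogdim) and L_mu its global extent:

    \mathrm{result}[t] \;=\; \sum_{\substack{x \\ x_\mu = t}} \mathrm{Data}(x),
    \qquad t = 0,\dots,L_\mu - 1 .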
@@ -504,6 +496,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
   return result;
 }

+
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
Grid/lattice/Lattice_slicesum_core.h (new file, 213 lines)
@@ -0,0 +1,213 @@
#pragma once
#include <type_traits>

#if defined(GRID_CUDA)

#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess

#elif defined(GRID_HIP)

#include <hipcub/hipcub.hpp>
#define gpucub hipcub
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess

#endif

NAMESPACE_BEGIN(Grid);

#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  size_t subvol_size = e1*e2;
  commVector<vobj> reduction_buffer(rd*subvol_size);
  auto rb_p = &reduction_buffer[0];
  vobj zero_init;
  zeroit(zero_init);

  void *temp_storage_array = NULL;
  size_t temp_storage_bytes = 0;
  vobj *d_out;
  int* d_offsets;

  std::vector<int> offsets(rd+1,0);

  for (int i = 0; i < offsets.size(); i++) {
    offsets[i] = i*subvol_size;
  }

  //Allocate memory for output and offset arrays on device
  d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));

  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));

  //copy offsets to device
  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);

  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }

  //allocate memory for temp_storage_array
  temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);

  //prepare buffer for reduction
  //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
  //use 2d accelerator_for to avoid launch latencies found when serially looping over rd
  accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{

    int n = s / e2;
    int b = s % e2;
    int so=r*ostride; // base offset for start of plane
    int ss= so+n*stride+b;

    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));

  });

  //issue segmented reductions in computeStream
  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }

  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);

  //sync after copy
  accelerator_barrier();

  acceleratorFreeDevice(temp_storage_array);
  acceleratorFreeDevice(d_out);
  acceleratorFreeDevice(d_offsets);

}

template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  typedef typename vobj::vector_type vector;
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
  commVector<vector> buffer(osites);
  vector *dat = (vector *)Data;
  vector *buf = &buffer[0];
  Vector<vector> lvSum_small(rd);
  vector *lvSum_ptr = (vector *)&lvSum[0];

  for (int w = 0; w < words; w++) {
    accelerator_for(ss,osites,1,{
      buf[ss] = dat[ss*words+w];
    });

    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);

    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
    }
  }

}

template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
  if constexpr (sizeof(vobj) <= 256) {
    sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
  }
  else {
    sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
  }
}
#endif

#if defined(GRID_SYCL)
template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
  typedef typename vobj::scalar_object sobj;
  size_t subvol_size = e1*e2;

  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
  vobj vobj_zero;
  zeroit(vobj_zero);

  commVector<vobj> reduction_buffer(rd*subvol_size);

  auto rb_p = &reduction_buffer[0];

  autoView(Data_v, Data, AcceleratorRead);

  //prepare reduction buffer
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{

    int n = s / e2;
    int b = s % e2;
    int so=r*ostride; // base offset for start of plane
    int ss= so+n*stride+b;

    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));

  });

  for (int r = 0; r < rd; r++) {
    mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
      auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
      cgh.parallel_for(cl::sycl::range<1>{subvol_size},
                       Reduction,
                       [=](cl::sycl::id<1> item, auto &sum) {
                         auto s = item[0];
                         sum += rb_p[r*subvol_size+s];
                       });
    });
    theGridAccelerator->wait();
    lvSum[r] = mysum[0];
  }

  free(mysum,*theGridAccelerator);
}
#endif

template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
  autoView( Data_v, Data, CpuRead);
  thread_for( r,rd, {
    int so=r*ostride; // base offset for start of plane
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        lvSum[r]=lvSum[r]+Data_v[ss];
      }
    }
  });
}

template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP)

  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);

#elif defined(GRID_SYCL)

  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);

#else

  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);

#endif
}

NAMESPACE_END(Grid);
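The CUDA/HIP path above relies on cub's two-phase segmented reduction: the first DeviceSegmentedReduce call, made with a null temporary-storage pointer, only reports how many scratch bytes are needed; the second call performs the reduction on the supplied stream. A minimal standalone CUDA sketch of that pattern with plain float data (num_segments, seg_len and the other names are illustrative, not taken from Grid):

    // Minimal sketch of cub's two-phase segmented reduction, using plain floats.
    #include <cub/cub.cuh>
    #include <vector>
    #include <cstdio>

    int main() {
      const int num_segments = 4, seg_len = 256;           // illustrative sizes
      std::vector<float> h_in(num_segments * seg_len, 1.0f);
      std::vector<int>   h_off(num_segments + 1);
      for (int i = 0; i <= num_segments; i++) h_off[i] = i * seg_len;

      float *d_in, *d_out; int *d_off;
      cudaMalloc(&d_in,  h_in.size() * sizeof(float));
      cudaMalloc(&d_out, num_segments * sizeof(float));
      cudaMalloc(&d_off, h_off.size() * sizeof(int));
      cudaMemcpy(d_in,  h_in.data(),  h_in.size() * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(d_off, h_off.data(), h_off.size() * sizeof(int),  cudaMemcpyHostToDevice);

      // Pass 1: null temp storage, cub only reports the bytes it needs.
      void *d_temp = nullptr; size_t temp_bytes = 0;
      cub::DeviceSegmentedReduce::Sum(d_temp, temp_bytes, d_in, d_out,
                                      num_segments, d_off, d_off + 1);
      cudaMalloc(&d_temp, temp_bytes);

      // Pass 2: the actual segmented reduction.
      cub::DeviceSegmentedReduce::Sum(d_temp, temp_bytes, d_in, d_out,
                                      num_segments, d_off, d_off + 1);

      std::vector<float> h_out(num_segments);
      cudaMemcpy(h_out.data(), d_out, num_segments * sizeof(float), cudaMemcpyDeviceToHost);
      for (int s = 0; s < num_segments; s++) printf("segment %d sum = %f\n", s, h_out[s]);

      cudaFree(d_temp); cudaFree(d_in); cudaFree(d_out); cudaFree(d_off);
      return 0;
    }

The new header uses the generic Reduce overload with ::gpucub::Sum() and an explicit zero_init identity rather than the Sum shortcut shown here, presumably because vobj is not a builtin arithmetic type.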
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
   Coordinate fine_rdimensions = fine->_rdimensions;
   Coordinate coarse_rdimensions = coarse->_rdimensions;

-  vobj zz = Zero();
-
   accelerator_for(sc,coarse->oSites(),1,{

     // One thread per sub block
     Coordinate coor_c(_ndimension);
     Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-    vobj cd = zz;
+    vobj cd = Zero();

     for(int sb=0;sb<blockVol;sb++){

@@ -45,6 +45,7 @@ public:
   };
   // Host only
   GridBase * getGrid(void) const { return _grid; };
+  vobj* getHostPointer(void) const { return _odata; };
 };

 /////////////////////////////////////////////////////////////////////////////////////////
@@ -34,7 +34,7 @@ class GridTracer {
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
-inline int traceStart(const char *name) { roctxRangeStart(name); }
+inline int traceStart(const char *name) { return roctxRangeStart(name); }
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif

@@ -63,7 +63,9 @@ public:
   virtual void MooeeDag(const FermionField &in, FermionField &out) ;
   virtual void MooeeInv(const FermionField &in, FermionField &out) ;
   virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
+  virtual void M(const FermionField &in, FermionField &out) ;
+  virtual void Mdag(const FermionField &in, FermionField &out) ;

 private:
   RealD mu; // TwistedMass parameter

@@ -93,5 +93,25 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
   RealD b = tm /sq;
   axpibg5x(out,in,a,b);
 }
+template<class Impl>
+void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerNo);
+  FermionField tmp(out.Grid());
+  RealD a = 4.0+this->mass;
+  RealD b = this->mu;
+  axpibg5x(tmp,in,a,b);
+  axpy(out, 1.0, tmp, out);
+}
+template<class Impl>
+void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerYes);
+  FermionField tmp(out.Grid());
+  RealD a = 4.0+this->mass;
+  RealD b = -this->mu;
+  axpibg5x(tmp,in,a,b);
+  axpy(out, 1.0, tmp, out);
+}

 NAMESPACE_END(Grid);
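Read off the added code (Dhop, then axpibg5x with a = 4 + mass and b = +/- mu, then an axpy accumulation), the operator being implemented is the unpreconditioned twisted-mass Wilson operator. As a sketch, with conventions taken from the code above rather than from any reference:

    M\,\psi \;=\; D_{\rm hop}\,\psi \;+\; \bigl[(4+m) + i\,\mu\,\gamma_5\bigr]\psi ,
    \qquad
    M^{\dagger}\psi \;=\; D_{\rm hop}^{\dagger}\,\psi \;+\; \bigl[(4+m) - i\,\mu\,\gamma_5\bigr]\psi ,

where the i*mu*gamma_5 term is supplied by axpibg5x and the dagger of the hopping term by Dhop(..., DaggerYes).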
@@ -237,7 +237,7 @@ public:

     for (int level = 0; level < as.size(); ++level) {
       int multiplier = as.at(level).multiplier;
-      ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier);
+      ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
       Level->push_back(new EmptyAction<Field>);
       LevelForces.push_back(*Level);
       // does it copy by value or reference??
@@ -706,7 +706,7 @@ public:
       }
      }
     }
-    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
   }
   /// Introduce a block structure and switch off comms on boundaries
   void DirichletBlock(const Coordinate &dirichlet_block)
@@ -761,7 +761,8 @@ public:
          int checkerboard,
          const std::vector<int> &directions,
          const std::vector<int> &distances,
-         Parameters p=Parameters())
+         Parameters p=Parameters(),
+         bool preserve_shm=false)
   {
     face_table_computed=0;
     _grid = grid;
@@ -855,7 +856,9 @@ public:
     /////////////////////////////////////////////////////////////////////////////////
     const int Nsimd = grid->Nsimd();

-    _grid->ShmBufferFreeAll();
+    // Allow for multiple stencils to exist simultaneously
+    if (!preserve_shm)
+      _grid->ShmBufferFreeAll();

     int maxl=2;
     u_simd_send_buf.resize(maxl);
@@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
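The two new wrappers are thin veneers over cudaMemcpyAsync on a caller-supplied stream (defaulting to Grid's copyStream); the slice-sum code above pairs them with kernels submitted to the same stream and a final barrier. A standalone CUDA sketch of that host-to-device copy / compute / device-to-host copy pattern, independent of Grid and with illustrative names:

    // Sketch: async H2D copy, kernel, async D2H copy, all ordered on one stream.
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void scale(float *x, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] *= 2.0f;
    }

    int main() {
      const int n = 1 << 20;
      float *h = nullptr, *d = nullptr;
      cudaMallocHost(&h, n * sizeof(float));            // pinned host memory so the copies can overlap
      cudaMalloc(&d, n * sizeof(float));
      for (int i = 0; i < n; i++) h[i] = 1.0f;

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      cudaMemcpyAsync(d, h, n * sizeof(float), cudaMemcpyHostToDevice, stream);
      scale<<<(n + 255) / 256, 256, 0, stream>>>(d, n); // same stream: runs after the H2D copy
      cudaMemcpyAsync(h, d, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);                    // the role accelerator_barrier() plays in Grid

      printf("h[0] = %f\n", h[0]);                      // expect 2.0
      cudaStreamDestroy(stream);
      cudaFree(d); cudaFreeHost(h);
      return 0;
    }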
@@ -287,23 +289,24 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
     unsigned long nt=acceleratorThreads(); \
+    if(nt < 8)nt=8; \
     unsigned long unum1 = num1; \
     unsigned long unum2 = num2; \
-    if(nt < 8)nt=8; \
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
     cl::sycl::range<3> local {nt,1,nsimd}; \
-    cl::sycl::range<3> global{unum1,unum2,nsimd}; \
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
     cgh.parallel_for( \
       cl::sycl::nd_range<3>(global,local), \
       [=] (cl::sycl::nd_item<3> item) /*mutable*/ \
       [[intel::reqd_sub_group_size(16)]] \
       { \
         auto iter1 = item.get_global_id(0); \
         auto iter2 = item.get_global_id(1); \
         auto lane  = item.get_global_id(2); \
-        { __VA_ARGS__ }; \
+        { if (iter1 < unum1){ __VA_ARGS__ } }; \
       }); \
   });

 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

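The macro change rounds the first global dimension up to a multiple of the work-group size, so the SYCL nd_range is always divisible, and then masks off the padding work items inside the kernel body. A small host-side C++ sketch of the same round-up-and-guard idea (round_up and the loop-bound names are illustrative):

    // Sketch: iterate over a padded, divisible range and guard out-of-range work items.
    #include <cstdio>

    // Smallest multiple of nt that is >= n: the ((n + nt - 1) / nt) * nt idiom from the macro.
    unsigned long round_up(unsigned long n, unsigned long nt) {
      return ((n + nt - 1) / nt) * nt;
    }

    int main() {
      const unsigned long num1 = 1000, nt = 64;
      const unsigned long padded = round_up(num1, nt);   // 1024

      unsigned long executed = 0;
      for (unsigned long iter1 = 0; iter1 < padded; iter1++) {
        if (iter1 < num1) {     // the "if (iter1 < unum1)" guard inside the macro
          executed++;           // { __VA_ARGS__ } would run here
        }
      }
      printf("padded range %lu, body executed %lu times\n", padded, executed);
      return 0;
    }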
@@ -442,6 +445,8 @@ inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) { }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
tests/core/Test_sliceSum.cc (new file, 321 lines)
@@ -0,0 +1,321 @@
#include <Grid/Grid.h>

template<class vobj> inline void sliceSumCPU(const Grid::Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
  using namespace Grid;
  ///////////////////////////////////////////////////////
  // FIXME precision promoted summation
  // may be important for correlation functions
  // But easily avoided by using double precision fields
  ///////////////////////////////////////////////////////
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_object::scalar_type scalar_type;
  GridBase *grid = Data.Grid();
  assert(grid!=NULL);

  const int Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();

  assert(orthogdim >= 0);
  assert(orthogdim < Nd);

  int fd=grid->_fdimensions[orthogdim];
  int ld=grid->_ldimensions[orthogdim];
  int rd=grid->_rdimensions[orthogdim];

  Vector<vobj> lvSum(rd);               // will locally sum vectors first
  Vector<sobj> lsSum(ld,Zero());        // sum across these down to scalars
  ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD

  result.resize(fd); // And then global sum to return the same vector to every node
  for(int r=0;r<rd;r++){
    lvSum[r]=Zero();
  }

  int e1= grid->_slice_nblock[orthogdim];
  int e2= grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
  int ostride=grid->_ostride[orthogdim];

  //Reduce Data down to lvSum
  sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);

  // Sum across simd lanes in the plane, breaking out orthog dir.
  Coordinate icoor(Nd);

  for(int rt=0;rt<rd;rt++){

    extract(lvSum[rt],extracted);

    for(int idx=0;idx<Nsimd;idx++){

      grid->iCoorFromIindex(icoor,idx);

      int ldx =rt+icoor[orthogdim]*rd;

      lsSum[ldx]=lsSum[ldx]+extracted[idx];

    }
  }

  // sum over nodes.
  for(int t=0;t<fd;t++){
    int pt = t/ld; // processor plane
    int lt = t%ld;
    if ( pt == grid->_processor_coor[orthogdim] ) {
      result[t]=lsSum[lt];
    } else {
      result[t]=Zero();
    }
  }
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
}


int main (int argc, char ** argv) {

  using namespace Grid;

  Grid_init(&argc,&argv);

  Coordinate latt_size({64,64,64,16});
  auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
  auto mpi_layout = GridDefaultMpi();
  GridCartesian Grid(latt_size, simd_layout, mpi_layout);

  std::vector<int> seeds({1, 2, 3, 4});

  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);

  LatticeComplexD test_data(&Grid);
  gaussian(pRNG,test_data);

  std::vector<TComplexD> reduction_reference;
  std::vector<TComplexD> reduction_result;

  //warmup
  for (int sweeps = 0; sweeps < 5; sweeps++) {
    reduction_result = sliceSum(test_data,0);
  }

  int trace_id = traceStart("sliceSum benchmark - ComplexD");
  std::cout << GridLogMessage << "Testing ComplexD" << std::endl;
  std::cout << GridLogMessage << "sizeof(ComplexD) = " << sizeof(ComplexD) << std::endl;
  std::cout << GridLogMessage << "sizeof(vComplexD) = " << sizeof(vComplexD) << std::endl;
  for (int i = 0; i < Nd; i++) {

    RealD t=-usecond();

    tracePush("sliceSum");
    sliceSumCPU(test_data,reduction_reference,i);
    tracePop("sliceSum");

    t+=usecond();
    std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
    std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;

    RealD tgpu=-usecond();

    tracePush("sliceSumGpu");
    reduction_result = sliceSum(test_data,i);
    tracePop("sliceSumGpu");

    tgpu+=usecond();

    std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;

    for(int t=0;t<reduction_reference.size();t++) {

      auto diff = reduction_reference[t]-reduction_result[t];
      assert(abs(TensorRemove(diff)) < 1e-8 );

    }

  }
  traceStop(trace_id);

  LatticeSpinVectorD test_data_cv(&Grid);
  gaussian(pRNG,test_data_cv);

  std::vector<SpinVectorD> reduction_reference_cv;
  std::vector<SpinVectorD> reduction_result_cv;

  //warmup
  for (int sweeps = 0; sweeps < 5; sweeps++) {
    reduction_result_cv = sliceSum(test_data_cv,0);
  }
  trace_id = traceStart("sliceSum benchmark - SpinVectorD");

  std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl;
  std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl;
  std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl;
  for (int i = 0; i < Nd; i++) {

    RealD t=-usecond();

    tracePush("sliceSum");
    sliceSumCPU(test_data_cv,reduction_reference_cv,i);
    tracePop("sliceSum");

    t+=usecond();
    std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
    std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;

    RealD tgpu=-usecond();

    tracePush("sliceSumGpu");
    reduction_result_cv = sliceSum(test_data_cv,i);
    tracePop("sliceSumGpu");

    tgpu+=usecond();

    std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;

    for(int t=0;t<reduction_reference_cv.size();t++) {

      auto diff = reduction_reference_cv[t]-reduction_result_cv[t];
      assert(abs(diff()(0)()) < 1e-8 );
      assert(abs(diff()(1)()) < 1e-8 );
      assert(abs(diff()(2)()) < 1e-8 );
      assert(abs(diff()(3)()) < 1e-8 );

    }

  }
  traceStop(trace_id);

  LatticeSpinColourVectorD test_data_scv(&Grid);
  gaussian(pRNG,test_data_scv);

  std::vector<SpinColourVectorD> reduction_reference_scv;
  std::vector<SpinColourVectorD> reduction_result_scv;

  //warmup
  for (int sweeps = 0; sweeps < 5; sweeps++) {
    reduction_result_scv = sliceSum(test_data_scv,0);
  }
  trace_id = traceStart("sliceSum benchmark - SpinColourVectorD");

  std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl;
  std::cout << GridLogMessage << "sizeof(SpinColourVectorD) = " << sizeof(SpinColourVectorD) << std::endl;
  std::cout << GridLogMessage << "sizeof(vSpinColourVectorD) = " << sizeof(vSpinColourVectorD) << std::endl;
  for (int i = 0; i < Nd; i++) {

    RealD t=-usecond();

    tracePush("sliceSum");
    sliceSumCPU(test_data_scv,reduction_reference_scv,i);
    tracePop("sliceSum");

    t+=usecond();
    std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
    std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;

    RealD tgpu=-usecond();

    tracePush("sliceSumGpu");
    reduction_result_scv = sliceSum(test_data_scv,i);
    tracePop("sliceSumGpu");

    tgpu+=usecond();

    std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;

    for(int t=0;t<reduction_reference_scv.size();t++) {

      auto diff = reduction_reference_scv[t]-reduction_result_scv[t];
      // std::cout << diff <<std::endl;
      assert(abs(diff()(0)(0)) < 1e-8 );
      assert(abs(diff()(0)(1)) < 1e-8 );
      assert(abs(diff()(0)(2)) < 1e-8 );
      assert(abs(diff()(1)(0)) < 1e-8 );
      assert(abs(diff()(1)(1)) < 1e-8 );
      assert(abs(diff()(1)(2)) < 1e-8 );
      assert(abs(diff()(2)(0)) < 1e-8 );
      assert(abs(diff()(2)(1)) < 1e-8 );
      assert(abs(diff()(2)(2)) < 1e-8 );
      assert(abs(diff()(3)(0)) < 1e-8 );
      assert(abs(diff()(3)(1)) < 1e-8 );
      assert(abs(diff()(3)(2)) < 1e-8 );

    }

  }
  traceStop(trace_id);

  LatticeSpinColourMatrixD test_data_scm(&Grid);
  gaussian(pRNG,test_data_scm);

  std::vector<SpinColourMatrixD> reduction_reference_scm;
  std::vector<SpinColourMatrixD> reduction_result_scm;

  //warmup
  for (int sweeps = 0; sweeps < 5; sweeps++) {
    reduction_result_scm = sliceSum(test_data_scm,0);
  }
  trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD");

  std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl;
  std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl;
  std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl;
  for (int i = 0; i < Nd; i++) {

    RealD t=-usecond();

    tracePush("sliceSum");
    sliceSumCPU(test_data_scm,reduction_reference_scm,i);
    tracePop("sliceSum");

    t+=usecond();
    std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
    std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;

    RealD tgpu=-usecond();

    tracePush("sliceSumGpu");
    reduction_result_scm = sliceSum(test_data_scm,i);
    tracePop("sliceSumGpu");

    tgpu+=usecond();

    std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;

    for(int t=0;t<reduction_reference_scm.size();t++) {

      auto diff = reduction_reference_scm[t]-reduction_result_scm[t];
      // std::cout << diff <<std::endl;
      for (int is = 0; is < Ns; is++) {
        for (int js = 0; js < Ns; js++) {
          for (int ic = 0; ic < Nc; ic++) {
            for (int jc = 0; jc < Nc; jc++) {
              assert(abs(diff()(is,js)(ic,jc)) < 1e-8);
            }
          }
        }
      }

    }

  }
  traceStop(trace_id);

  Grid_finalize();
  return 0;
}