mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Compare commits
36 Commits
73c0b29535
...
fce3852dff
Author | SHA1 | Date | |
---|---|---|---|
|
fce3852dff | ||
|
ee1b8bbdbd | ||
|
3f1636637d | ||
|
2e570f5300 | ||
|
9f89486df5 | ||
|
22b43b86cb | ||
|
3c9012676a | ||
|
b507fe209c | ||
|
6cd2d8fcd5 | ||
|
0a816b5509 | ||
|
1c8b807c2e | ||
|
66391f84f2 | ||
|
97f7a9ecb3 | ||
|
15878f7613 | ||
|
e0d5e3c6c7 | ||
|
6f3455900e | ||
e4a641b64e | |||
8849f187f1 | |||
|
b5659d106e | ||
|
4b43307402 | ||
|
09af8c25a2 | ||
|
9514035b87 | ||
|
1514b4f137 | ||
|
ab2de131bd | ||
|
5af8da76d7 | ||
|
b8b9dc952d | ||
|
79a6ed32d8 | ||
|
caa5f97723 | ||
|
f2648e94b9 | ||
|
e6ed516052 | ||
|
e2a3dae1f2 | ||
|
452bf2e907 | ||
|
e8c29e2fe5 | ||
|
da9cbfc7cc | ||
|
6b9f07c1ed | ||
|
5f75735dab |
@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||
}
|
||||
|
||||
#if ( (!defined(GRID_CUDA)) )
|
||||
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
|
||||
int max_threads = thread_max();
|
||||
Vector < vobj > Bt(Nm * max_threads);
|
||||
thread_region
|
||||
|
@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#if defined(GRID_SYCL)
|
||||
#include <Grid/lattice/Lattice_reduction_sycl.h>
|
||||
#endif
|
||||
#include <Grid/lattice/Lattice_slicesum_core.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
@ -448,19 +449,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
int e1= grid->_slice_nblock[orthogdim];
|
||||
int e2= grid->_slice_block [orthogdim];
|
||||
int stride=grid->_slice_stride[orthogdim];
|
||||
int ostride=grid->_ostride[orthogdim];
|
||||
|
||||
// sum over reduced dimension planes, breaking out orthog dir
|
||||
// Parallel over orthog direction
|
||||
autoView( Data_v, Data, CpuRead);
|
||||
thread_for( r,rd, {
|
||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int ss= so+n*stride+b;
|
||||
lvSum[r]=lvSum[r]+Data_v[ss];
|
||||
}
|
||||
}
|
||||
});
|
||||
//Reduce Data down to lvSum
|
||||
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
|
||||
|
||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||
Coordinate icoor(Nd);
|
||||
@ -504,6 +496,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
template<class vobj>
|
||||
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
||||
{
|
||||
|
213
Grid/lattice/Lattice_slicesum_core.h
Normal file
213
Grid/lattice/Lattice_slicesum_core.h
Normal file
@ -0,0 +1,213 @@
|
||||
#pragma once
|
||||
#include <type_traits>
|
||||
#if defined(GRID_CUDA)
|
||||
|
||||
#include <cub/cub.cuh>
|
||||
#define gpucub cub
|
||||
#define gpuError_t cudaError_t
|
||||
#define gpuSuccess cudaSuccess
|
||||
|
||||
#elif defined(GRID_HIP)
|
||||
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#define gpucub hipcub
|
||||
#define gpuError_t hipError_t
|
||||
#define gpuSuccess hipSuccess
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||
size_t subvol_size = e1*e2;
|
||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
auto rb_p = &reduction_buffer[0];
|
||||
vobj zero_init;
|
||||
zeroit(zero_init);
|
||||
|
||||
|
||||
void *temp_storage_array = NULL;
|
||||
size_t temp_storage_bytes = 0;
|
||||
vobj *d_out;
|
||||
int* d_offsets;
|
||||
|
||||
std::vector<int> offsets(rd+1,0);
|
||||
|
||||
for (int i = 0; i < offsets.size(); i++) {
|
||||
offsets[i] = i*subvol_size;
|
||||
}
|
||||
|
||||
//Allocate memory for output and offset arrays on device
|
||||
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
|
||||
|
||||
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||
|
||||
//copy offsets to device
|
||||
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
|
||||
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||
if (gpuErr!=gpuSuccess) {
|
||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
//allocate memory for temp_storage_array
|
||||
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
|
||||
|
||||
//prepare buffer for reduction
|
||||
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
|
||||
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
|
||||
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
|
||||
|
||||
int n = s / e2;
|
||||
int b = s % e2;
|
||||
int so=r*ostride; // base offset for start of plane
|
||||
int ss= so+n*stride+b;
|
||||
|
||||
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
|
||||
|
||||
});
|
||||
|
||||
//issue segmented reductions in computeStream
|
||||
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
|
||||
if (gpuErr!=gpuSuccess) {
|
||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
|
||||
//sync after copy
|
||||
accelerator_barrier();
|
||||
|
||||
acceleratorFreeDevice(temp_storage_array);
|
||||
acceleratorFreeDevice(d_out);
|
||||
acceleratorFreeDevice(d_offsets);
|
||||
|
||||
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||
typedef typename vobj::vector_type vector;
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
const int osites = rd*e1*e2;
|
||||
commVector<vector>buffer(osites);
|
||||
vector *dat = (vector *)Data;
|
||||
vector *buf = &buffer[0];
|
||||
Vector<vector> lvSum_small(rd);
|
||||
vector *lvSum_ptr = (vector *)&lvSum[0];
|
||||
|
||||
for (int w = 0; w < words; w++) {
|
||||
accelerator_for(ss,osites,1,{
|
||||
buf[ss] = dat[ss*words+w];
|
||||
});
|
||||
|
||||
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
|
||||
|
||||
for (int r = 0; r < rd; r++) {
|
||||
lvSum_ptr[w+words*r]=lvSum_small[r];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
||||
{
|
||||
autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
|
||||
if constexpr (sizeof(vobj) <= 256) {
|
||||
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
}
|
||||
else {
|
||||
sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(GRID_SYCL)
|
||||
template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
size_t subvol_size = e1*e2;
|
||||
|
||||
vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
|
||||
vobj vobj_zero;
|
||||
zeroit(vobj_zero);
|
||||
|
||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
|
||||
auto rb_p = &reduction_buffer[0];
|
||||
|
||||
autoView(Data_v, Data, AcceleratorRead);
|
||||
|
||||
//prepare reduction buffer
|
||||
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
|
||||
|
||||
int n = s / e2;
|
||||
int b = s % e2;
|
||||
int so=r*ostride; // base offset for start of plane
|
||||
int ss= so+n*stride+b;
|
||||
|
||||
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
|
||||
|
||||
});
|
||||
|
||||
for (int r = 0; r < rd; r++) {
|
||||
mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
||||
Reduction,
|
||||
[=](cl::sycl::id<1> item, auto &sum) {
|
||||
auto s = item[0];
|
||||
sum += rb_p[r*subvol_size+s];
|
||||
});
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
lvSum[r] = mysum[0];
|
||||
}
|
||||
|
||||
free(mysum,*theGridAccelerator);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
{
|
||||
// sum over reduced dimension planes, breaking out orthog dir
|
||||
// Parallel over orthog direction
|
||||
autoView( Data_v, Data, CpuRead);
|
||||
thread_for( r,rd, {
|
||||
int so=r*ostride; // base offset for start of plane
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int ss= so+n*stride+b;
|
||||
lvSum[r]=lvSum[r]+Data_v[ss];
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
|
||||
sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#elif defined(GRID_SYCL)
|
||||
|
||||
sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#else
|
||||
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
||||
Coordinate fine_rdimensions = fine->_rdimensions;
|
||||
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
||||
|
||||
vobj zz = Zero();
|
||||
|
||||
accelerator_for(sc,coarse->oSites(),1,{
|
||||
|
||||
// One thread per sub block
|
||||
Coordinate coor_c(_ndimension);
|
||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
|
||||
|
||||
vobj cd = zz;
|
||||
vobj cd = Zero();
|
||||
|
||||
for(int sb=0;sb<blockVol;sb++){
|
||||
|
||||
|
@ -45,6 +45,7 @@ public:
|
||||
};
|
||||
// Host only
|
||||
GridBase * getGrid(void) const { return _grid; };
|
||||
vobj* getHostPointer(void) const { return _odata; };
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -34,7 +34,7 @@ class GridTracer {
|
||||
};
|
||||
inline void tracePush(const char *name) { roctxRangePushA(name); }
|
||||
inline void tracePop(const char *name) { roctxRangePop(); }
|
||||
inline int traceStart(const char *name) { roctxRangeStart(name); }
|
||||
inline int traceStart(const char *name) { return roctxRangeStart(name); }
|
||||
inline void traceStop(int ID) { roctxRangeStop(ID); }
|
||||
#endif
|
||||
|
||||
|
@ -63,6 +63,8 @@ public:
|
||||
virtual void MooeeDag(const FermionField &in, FermionField &out) ;
|
||||
virtual void MooeeInv(const FermionField &in, FermionField &out) ;
|
||||
virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
|
||||
virtual void M(const FermionField &in, FermionField &out) ;
|
||||
virtual void Mdag(const FermionField &in, FermionField &out) ;
|
||||
|
||||
private:
|
||||
RealD mu; // TwistedMass parameter
|
||||
|
@ -93,5 +93,25 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
|
||||
RealD b = tm /sq;
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
this->Dhop(in, out, DaggerNo);
|
||||
FermionField tmp(out.Grid());
|
||||
RealD a = 4.0+this->mass;
|
||||
RealD b = this->mu;
|
||||
axpibg5x(tmp,in,a,b);
|
||||
axpy(out, 1.0, tmp, out);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
this->Dhop(in, out, DaggerYes);
|
||||
FermionField tmp(out.Grid());
|
||||
RealD a = 4.0+this->mass;
|
||||
RealD b = -this->mu;
|
||||
axpibg5x(tmp,in,a,b);
|
||||
axpy(out, 1.0, tmp, out);
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -237,7 +237,7 @@ public:
|
||||
|
||||
for (int level = 0; level < as.size(); ++level) {
|
||||
int multiplier = as.at(level).multiplier;
|
||||
ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier);
|
||||
ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
|
||||
Level->push_back(new EmptyAction<Field>);
|
||||
LevelForces.push_back(*Level);
|
||||
// does it copy by value or reference??
|
||||
|
@ -706,7 +706,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
//std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
}
|
||||
/// Introduce a block structure and switch off comms on boundaries
|
||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||
@ -761,7 +761,8 @@ public:
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances,
|
||||
Parameters p=Parameters())
|
||||
Parameters p=Parameters(),
|
||||
bool preserve_shm=false)
|
||||
{
|
||||
face_table_computed=0;
|
||||
_grid = grid;
|
||||
@ -855,6 +856,8 @@ public:
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
// Allow for multiple stencils to exist simultaneously
|
||||
if (!preserve_shm)
|
||||
_grid->ShmBufferFreeAll();
|
||||
|
||||
int maxl=2;
|
||||
|
@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
@ -288,11 +290,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
|
||||
unsigned long nt=acceleratorThreads(); \
|
||||
if(nt < 8)nt=8; \
|
||||
unsigned long unum1 = num1; \
|
||||
unsigned long unum2 = num2; \
|
||||
if(nt < 8)nt=8; \
|
||||
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
|
||||
cl::sycl::range<3> local {nt,1,nsimd}; \
|
||||
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
|
||||
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
|
||||
cgh.parallel_for( \
|
||||
cl::sycl::nd_range<3>(global,local), \
|
||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
|
||||
@ -301,7 +304,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
auto iter1 = item.get_global_id(0); \
|
||||
auto iter2 = item.get_global_id(1); \
|
||||
auto lane = item.get_global_id(2); \
|
||||
{ __VA_ARGS__ }; \
|
||||
{ if (iter1 < unum1){ __VA_ARGS__ } }; \
|
||||
}); \
|
||||
});
|
||||
|
||||
@ -442,6 +445,8 @@ inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
|
||||
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
//inline void acceleratorCopySynchronise(void) { }
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
|
||||
|
15
bootstrap.sh
15
bootstrap.sh
@ -1,12 +1,12 @@
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
|
||||
EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
|
||||
EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
|
||||
EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'
|
||||
|
||||
|
||||
echo "-- deploying Eigen source..."
|
||||
ARC=`basename ${EIGEN_URL}`
|
||||
ARC=$(basename ${EIGEN_URL})
|
||||
wget ${EIGEN_URL} --no-check-certificate
|
||||
if command -v sha256sum; then
|
||||
echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \
|
||||
@ -14,13 +14,8 @@ if command -v sha256sum; then
|
||||
else
|
||||
echo "WARNING: could not verify checksum, please install sha256sum" >&2
|
||||
fi
|
||||
./scripts/update_eigen.sh ${ARC}
|
||||
rm ${ARC}
|
||||
# patch for non-portable includes in Eigen 3.3.5
|
||||
# apparently already fixed in Eigen HEAD so it should not be
|
||||
# a problem in the future (A.P.)
|
||||
patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
|
||||
|
||||
./scripts/update_eigen.sh "${ARC}"
|
||||
rm "${ARC}"
|
||||
echo '-- generating Make.inc files...'
|
||||
./scripts/filelist
|
||||
echo '-- generating configure script...'
|
||||
|
@ -1,19 +0,0 @@
|
||||
--- ./Eigen/unsupported/Eigen/CXX11/Tensor 2018-07-23 10:33:42.000000000 +0100
|
||||
+++ Tensor 2018-08-28 16:15:56.000000000 +0100
|
||||
@@ -25,7 +25,7 @@
|
||||
#include <utility>
|
||||
#endif
|
||||
|
||||
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "../SpecialFunctions"
|
||||
#include "src/util/CXX11Meta.h"
|
||||
@@ -147,6 +147,6 @@
|
||||
|
||||
#include "src/Tensor/TensorIO.h"
|
||||
|
||||
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
//#endif // EIGEN_CXX11_TENSOR_MODULE
|
321
tests/core/Test_sliceSum.cc
Normal file
321
tests/core/Test_sliceSum.cc
Normal file
@ -0,0 +1,321 @@
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
template<class vobj> inline void sliceSumCPU(const Grid::Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
||||
{
|
||||
using namespace Grid;
|
||||
///////////////////////////////////////////////////////
|
||||
// FIXME precision promoted summation
|
||||
// may be important for correlation functions
|
||||
// But easily avoided by using double precision fields
|
||||
///////////////////////////////////////////////////////
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_object::scalar_type scalar_type;
|
||||
GridBase *grid = Data.Grid();
|
||||
assert(grid!=NULL);
|
||||
|
||||
const int Nd = grid->_ndimension;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
assert(orthogdim >= 0);
|
||||
assert(orthogdim < Nd);
|
||||
|
||||
int fd=grid->_fdimensions[orthogdim];
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
|
||||
Vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node
|
||||
for(int r=0;r<rd;r++){
|
||||
lvSum[r]=Zero();
|
||||
}
|
||||
|
||||
int e1= grid->_slice_nblock[orthogdim];
|
||||
int e2= grid->_slice_block [orthogdim];
|
||||
int stride=grid->_slice_stride[orthogdim];
|
||||
int ostride=grid->_ostride[orthogdim];
|
||||
|
||||
//Reduce Data down to lvSum
|
||||
sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
|
||||
|
||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||
Coordinate icoor(Nd);
|
||||
|
||||
for(int rt=0;rt<rd;rt++){
|
||||
|
||||
extract(lvSum[rt],extracted);
|
||||
|
||||
for(int idx=0;idx<Nsimd;idx++){
|
||||
|
||||
grid->iCoorFromIindex(icoor,idx);
|
||||
|
||||
int ldx =rt+icoor[orthogdim]*rd;
|
||||
|
||||
lsSum[ldx]=lsSum[ldx]+extracted[idx];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// sum over nodes.
|
||||
for(int t=0;t<fd;t++){
|
||||
int pt = t/ld; // processor plane
|
||||
int lt = t%ld;
|
||||
if ( pt == grid->_processor_coor[orthogdim] ) {
|
||||
result[t]=lsSum[lt];
|
||||
} else {
|
||||
result[t]=Zero();
|
||||
}
|
||||
|
||||
}
|
||||
scalar_type * ptr = (scalar_type *) &result[0];
|
||||
int words = fd*sizeof(sobj)/sizeof(scalar_type);
|
||||
grid->GlobalSumVector(ptr, words);
|
||||
}
|
||||
|
||||
|
||||
int main (int argc, char ** argv) {
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
|
||||
Coordinate latt_size({64,64,64,16});
|
||||
auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
|
||||
auto mpi_layout = GridDefaultMpi();
|
||||
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
|
||||
|
||||
std::vector<int> seeds({1, 2, 3, 4});
|
||||
|
||||
GridParallelRNG pRNG(&Grid);
|
||||
pRNG.SeedFixedIntegers(seeds);
|
||||
|
||||
LatticeComplexD test_data(&Grid);
|
||||
gaussian(pRNG,test_data);
|
||||
|
||||
std::vector<TComplexD> reduction_reference;
|
||||
std::vector<TComplexD> reduction_result;
|
||||
|
||||
//warmup
|
||||
for (int sweeps = 0; sweeps < 5; sweeps++) {
|
||||
reduction_result = sliceSum(test_data,0);
|
||||
}
|
||||
|
||||
int trace_id = traceStart("sliceSum benchmark - ComplexD");
|
||||
std::cout << GridLogMessage << "Testing ComplexD" << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(ComplexD) = " << sizeof(ComplexD) << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(vComplexD) = " << sizeof(vComplexD) << std::endl;
|
||||
for (int i = 0; i < Nd; i++) {
|
||||
|
||||
RealD t=-usecond();
|
||||
|
||||
tracePush("sliceSum");
|
||||
sliceSumCPU(test_data,reduction_reference,i);
|
||||
tracePop("sliceSum");
|
||||
|
||||
t+=usecond();
|
||||
std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
|
||||
std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
|
||||
|
||||
|
||||
RealD tgpu=-usecond();
|
||||
|
||||
tracePush("sliceSumGpu");
|
||||
reduction_result = sliceSum(test_data,i);
|
||||
tracePop("sliceSumGpu");
|
||||
|
||||
tgpu+=usecond();
|
||||
|
||||
std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
|
||||
|
||||
|
||||
for(int t=0;t<reduction_reference.size();t++) {
|
||||
|
||||
auto diff = reduction_reference[t]-reduction_result[t];
|
||||
assert(abs(TensorRemove(diff)) < 1e-8 );
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
traceStop(trace_id);
|
||||
|
||||
LatticeSpinVectorD test_data_cv(&Grid);
|
||||
gaussian(pRNG,test_data_cv);
|
||||
|
||||
std::vector<SpinVectorD> reduction_reference_cv;
|
||||
std::vector<SpinVectorD> reduction_result_cv;
|
||||
|
||||
//warmup
|
||||
for (int sweeps = 0; sweeps < 5; sweeps++) {
|
||||
reduction_result_cv = sliceSum(test_data_cv,0);
|
||||
}
|
||||
trace_id = traceStart("sliceSum benchmark - SpinVectorD");
|
||||
|
||||
std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl;
|
||||
for (int i = 0; i < Nd; i++) {
|
||||
|
||||
RealD t=-usecond();
|
||||
|
||||
tracePush("sliceSum");
|
||||
sliceSumCPU(test_data_cv,reduction_reference_cv,i);
|
||||
tracePop("sliceSum");
|
||||
|
||||
t+=usecond();
|
||||
std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
|
||||
std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
|
||||
|
||||
|
||||
RealD tgpu=-usecond();
|
||||
|
||||
tracePush("sliceSumGpu");
|
||||
reduction_result_cv = sliceSum(test_data_cv,i);
|
||||
tracePop("sliceSumGpu");
|
||||
|
||||
tgpu+=usecond();
|
||||
|
||||
std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
|
||||
|
||||
|
||||
for(int t=0;t<reduction_reference_cv.size();t++) {
|
||||
|
||||
auto diff = reduction_reference_cv[t]-reduction_result_cv[t];
|
||||
assert(abs(diff()(0)()) < 1e-8 );
|
||||
assert(abs(diff()(1)()) < 1e-8 );
|
||||
assert(abs(diff()(2)()) < 1e-8 );
|
||||
assert(abs(diff()(3)()) < 1e-8 );
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
traceStop(trace_id);
|
||||
|
||||
LatticeSpinColourVectorD test_data_scv(&Grid);
|
||||
gaussian(pRNG,test_data_scv);
|
||||
|
||||
std::vector<SpinColourVectorD> reduction_reference_scv;
|
||||
std::vector<SpinColourVectorD> reduction_result_scv;
|
||||
|
||||
//warmup
|
||||
for (int sweeps = 0; sweeps < 5; sweeps++) {
|
||||
reduction_result_scv = sliceSum(test_data_scv,0);
|
||||
}
|
||||
trace_id = traceStart("sliceSum benchmark - SpinColourVectorD");
|
||||
|
||||
std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(SpinColourVectorD) = " << sizeof(SpinColourVectorD) << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(vSpinColourVectorD) = " << sizeof(vSpinColourVectorD) << std::endl;
|
||||
for (int i = 0; i < Nd; i++) {
|
||||
|
||||
RealD t=-usecond();
|
||||
|
||||
tracePush("sliceSum");
|
||||
sliceSumCPU(test_data_scv,reduction_reference_scv,i);
|
||||
tracePop("sliceSum");
|
||||
|
||||
t+=usecond();
|
||||
std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
|
||||
std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
|
||||
|
||||
|
||||
RealD tgpu=-usecond();
|
||||
|
||||
tracePush("sliceSumGpu");
|
||||
reduction_result_scv = sliceSum(test_data_scv,i);
|
||||
tracePop("sliceSumGpu");
|
||||
|
||||
tgpu+=usecond();
|
||||
|
||||
std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
|
||||
|
||||
|
||||
for(int t=0;t<reduction_reference_scv.size();t++) {
|
||||
|
||||
auto diff = reduction_reference_scv[t]-reduction_result_scv[t];
|
||||
// std::cout << diff <<std::endl;
|
||||
assert(abs(diff()(0)(0)) < 1e-8 );
|
||||
assert(abs(diff()(0)(1)) < 1e-8 );
|
||||
assert(abs(diff()(0)(2)) < 1e-8 );
|
||||
assert(abs(diff()(1)(0)) < 1e-8 );
|
||||
assert(abs(diff()(1)(1)) < 1e-8 );
|
||||
assert(abs(diff()(1)(2)) < 1e-8 );
|
||||
assert(abs(diff()(2)(0)) < 1e-8 );
|
||||
assert(abs(diff()(2)(1)) < 1e-8 );
|
||||
assert(abs(diff()(2)(2)) < 1e-8 );
|
||||
assert(abs(diff()(3)(0)) < 1e-8 );
|
||||
assert(abs(diff()(3)(1)) < 1e-8 );
|
||||
assert(abs(diff()(3)(2)) < 1e-8 );
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
traceStop(trace_id);
|
||||
|
||||
LatticeSpinColourMatrixD test_data_scm(&Grid);
|
||||
gaussian(pRNG,test_data_scm);
|
||||
|
||||
std::vector<SpinColourMatrixD> reduction_reference_scm;
|
||||
std::vector<SpinColourMatrixD> reduction_result_scm;
|
||||
|
||||
//warmup
|
||||
for (int sweeps = 0; sweeps < 5; sweeps++) {
|
||||
reduction_result_scm = sliceSum(test_data_scm,0);
|
||||
}
|
||||
trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD");
|
||||
|
||||
std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl;
|
||||
std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl;
|
||||
for (int i = 0; i < Nd; i++) {
|
||||
|
||||
RealD t=-usecond();
|
||||
|
||||
tracePush("sliceSum");
|
||||
sliceSumCPU(test_data_scm,reduction_reference_scm,i);
|
||||
tracePop("sliceSum");
|
||||
|
||||
t+=usecond();
|
||||
std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
|
||||
std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
|
||||
|
||||
|
||||
RealD tgpu=-usecond();
|
||||
|
||||
tracePush("sliceSumGpu");
|
||||
reduction_result_scm = sliceSum(test_data_scm,i);
|
||||
tracePop("sliceSumGpu");
|
||||
|
||||
tgpu+=usecond();
|
||||
|
||||
std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
|
||||
|
||||
|
||||
for(int t=0;t<reduction_reference_scm.size();t++) {
|
||||
|
||||
auto diff = reduction_reference_scm[t]-reduction_result_scm[t];
|
||||
// std::cout << diff <<std::endl;
|
||||
for (int is = 0; is < Ns; is++) {
|
||||
for (int js = 0; js < Ns; js++) {
|
||||
for (int ic = 0; ic < Nc; ic++) {
|
||||
for (int jc = 0; jc < Nc; jc++) {
|
||||
assert(abs(diff()(is,js)(ic,jc)) < 1e-8);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
traceStop(trace_id);
|
||||
|
||||
Grid_finalize();
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user