mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-13 20:57:06 +01:00
Merge branch 'develop' into feature/json-fix
This commit is contained in:
@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
endif
|
||||
|
||||
if BUILD_COMMS_MPI3L
|
||||
extra_sources+=communicator/Communicator_mpi3_leader.cc
|
||||
if BUILD_COMMS_MPIT
|
||||
extra_sources+=communicator/Communicator_mpit.cc
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
endif
|
||||
|
||||
|
@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Cholesky from Eigen
|
||||
// There exists a ldlt that is documented as more stable
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||
// Force manifest hermitian to avoid rounding related
|
||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||
|
||||
#if 0
|
||||
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
|
||||
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
|
||||
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
|
||||
auto D_ldlt = m_rr.ldlt().vectorD();
|
||||
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
|
||||
#endif
|
||||
|
||||
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
|
||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
|
||||
C = L.adjoint();
|
||||
Cinv = C.inverse();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Q = R C^{-1}
|
||||
//
|
||||
@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
||||
//
|
||||
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// FIXME:: make a sliceMulMatrix to avoid zero vector
|
||||
sliceMulMatrix(Q,Cinv,R,Orthog);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
|
||||
Linop.HermOp(X, AD);
|
||||
tmp = B - AD;
|
||||
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
|
||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
|
||||
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
|
||||
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
|
||||
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
|
||||
D=Q;
|
||||
|
||||
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
|
||||
@ -221,13 +232,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
MatrixTimer.Start();
|
||||
Linop.HermOp(D, Z);
|
||||
MatrixTimer.Stop();
|
||||
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
|
||||
|
||||
//4. M = [D^dag Z]^{-1}
|
||||
sliceInnerTimer.Start();
|
||||
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
|
||||
sliceInnerTimer.Stop();
|
||||
m_M = m_DZ.inverse();
|
||||
|
||||
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
|
||||
|
||||
//5. X = X + D MC
|
||||
m_tmp = m_M * m_C;
|
||||
sliceMaddTimer.Start();
|
||||
|
@ -1,7 +1,5 @@
|
||||
|
||||
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
@ -11,7 +9,7 @@ int PointerCache::victim;
|
||||
|
||||
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
||||
|
||||
if (bytes < 4096 ) return NULL;
|
||||
if (bytes < 4096 ) return ptr;
|
||||
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
{
|
||||
#ifdef __linux__
|
||||
int fd = open("/proc/self/pagemap", O_RDONLY);
|
||||
assert(fd >= 0);
|
||||
const int page_size = 4096;
|
||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||
uint64_t pagedata[npages];
|
||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||
assert(ret == offset);
|
||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||
assert(ret == sizeof(uint64_t) * npages);
|
||||
int nhugepages = npages / 512;
|
||||
int n4ktotal, nnothuge;
|
||||
n4ktotal = 0;
|
||||
nnothuge = 0;
|
||||
for (int i = 0; i < nhugepages; ++i) {
|
||||
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
||||
for (int j = 0; j < 512; ++j) {
|
||||
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
||||
++n4ktotal;
|
||||
if (pageaddr != baseaddr + j * page_size)
|
||||
++nnothuge;
|
||||
}
|
||||
}
|
||||
int rank = CartesianCommunicator::RankWorld();
|
||||
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -64,6 +64,8 @@ namespace Grid {
|
||||
|
||||
};
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// A lattice of something, but assume the something is SIMDized.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
@ -92,12 +94,20 @@ public:
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
|
||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||
|
||||
// if ( ptr != NULL )
|
||||
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
|
||||
|
||||
//////////////////
|
||||
// Hack 2MB align; could make option probably doesn't need configurability
|
||||
//////////////////
|
||||
//define GRID_ALLOC_ALIGN (128)
|
||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
|
||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
||||
#else
|
||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
|
||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
||||
#endif
|
||||
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
|
||||
// First touch optimise in threaded loop
|
||||
uint8_t *cp = (uint8_t *)ptr;
|
||||
#ifdef GRID_OMP
|
||||
@ -111,6 +121,7 @@ public:
|
||||
|
||||
void deallocate(pointer __p, size_type __n) {
|
||||
size_type bytes = __n * sizeof(_Tp);
|
||||
|
||||
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
||||
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
@ -189,16 +200,18 @@ public:
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
|
||||
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
|
||||
#else
|
||||
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
|
||||
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
|
||||
#endif
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
uint8_t *cp = (uint8_t *)ptr;
|
||||
if ( ptr ) {
|
||||
// One touch per 4k page, static OMP loop to catch same loop order
|
||||
#pragma omp parallel for schedule(static)
|
||||
for(size_type n=0;n<bytes;n+=4096){
|
||||
cp[n]=0;
|
||||
for(size_type n=0;n<bytes;n+=4096){
|
||||
cp[n]=0;
|
||||
}
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
@ -185,17 +185,18 @@ public:
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
void show_decomposition(){
|
||||
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
|
||||
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
|
||||
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
|
||||
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
|
||||
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
|
||||
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
|
||||
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
|
||||
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
|
||||
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
|
||||
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
|
||||
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
|
||||
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
|
||||
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
|
||||
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
|
||||
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
|
||||
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
|
||||
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
|
||||
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
|
||||
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
|
||||
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
|
||||
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
|
||||
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
|
||||
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
@ -62,77 +62,81 @@ public:
|
||||
return shift;
|
||||
}
|
||||
GridCartesian(const std::vector<int> &dimensions,
|
||||
const std::vector<int> &simd_layout,
|
||||
const std::vector<int> &processor_grid
|
||||
) : GridBase(processor_grid)
|
||||
const std::vector<int> &simd_layout,
|
||||
const std::vector<int> &processor_grid) : GridBase(processor_grid)
|
||||
{
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
_ndimension = dimensions.size();
|
||||
|
||||
_fdimensions.resize(_ndimension);
|
||||
_gdimensions.resize(_ndimension);
|
||||
_ldimensions.resize(_ndimension);
|
||||
_rdimensions.resize(_ndimension);
|
||||
_simd_layout.resize(_ndimension);
|
||||
_lstart.resize(_ndimension);
|
||||
_lend.resize(_ndimension);
|
||||
|
||||
_ostride.resize(_ndimension);
|
||||
_istride.resize(_ndimension);
|
||||
|
||||
_fsites = _gsites = _osites = _isites = 1;
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
_ndimension = dimensions.size();
|
||||
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
_fsites = _fsites * _fdimensions[d];
|
||||
_gsites = _gsites * _gdimensions[d];
|
||||
_fdimensions.resize(_ndimension);
|
||||
_gdimensions.resize(_ndimension);
|
||||
_ldimensions.resize(_ndimension);
|
||||
_rdimensions.resize(_ndimension);
|
||||
_simd_layout.resize(_ndimension);
|
||||
_lstart.resize(_ndimension);
|
||||
_lend.resize(_ndimension);
|
||||
|
||||
//FIXME check for exact division
|
||||
_ostride.resize(_ndimension);
|
||||
_istride.resize(_ndimension);
|
||||
|
||||
// Use a reduced simd grid
|
||||
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
|
||||
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
|
||||
_lstart[d] = _processor_coor[d]*_ldimensions[d];
|
||||
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _simd_layout[d];
|
||||
|
||||
// Addressing support
|
||||
if ( d==0 ) {
|
||||
_ostride[d] = 1;
|
||||
_istride[d] = 1;
|
||||
} else {
|
||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
||||
}
|
||||
_fsites = _gsites = _osites = _isites = 1;
|
||||
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
_fsites = _fsites * _fdimensions[d];
|
||||
_gsites = _gsites * _gdimensions[d];
|
||||
|
||||
// Use a reduced simd grid
|
||||
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
|
||||
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
|
||||
|
||||
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
|
||||
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
|
||||
|
||||
_lstart[d] = _processor_coor[d] * _ldimensions[d];
|
||||
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _simd_layout[d];
|
||||
|
||||
// Addressing support
|
||||
if (d == 0)
|
||||
{
|
||||
_ostride[d] = 1;
|
||||
_istride[d] = 1;
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
// subplane information
|
||||
///////////////////////
|
||||
_slice_block.resize(_ndimension);
|
||||
_slice_stride.resize(_ndimension);
|
||||
_slice_nblock.resize(_ndimension);
|
||||
|
||||
int block =1;
|
||||
int nblock=1;
|
||||
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
|
||||
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
nblock/=_rdimensions[d];
|
||||
_slice_block[d] =block;
|
||||
_slice_stride[d]=_ostride[d]*_rdimensions[d];
|
||||
_slice_nblock[d]=nblock;
|
||||
block = block*_rdimensions[d];
|
||||
else
|
||||
{
|
||||
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
|
||||
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
// subplane information
|
||||
///////////////////////
|
||||
_slice_block.resize(_ndimension);
|
||||
_slice_stride.resize(_ndimension);
|
||||
_slice_nblock.resize(_ndimension);
|
||||
|
||||
int block = 1;
|
||||
int nblock = 1;
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
nblock *= _rdimensions[d];
|
||||
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
nblock /= _rdimensions[d];
|
||||
_slice_block[d] = block;
|
||||
_slice_stride[d] = _ostride[d] * _rdimensions[d];
|
||||
_slice_nblock[d] = nblock;
|
||||
block = block * _rdimensions[d];
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -131,21 +131,21 @@ public:
|
||||
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
|
||||
}
|
||||
void Init(const std::vector<int> &dimensions,
|
||||
const std::vector<int> &simd_layout,
|
||||
const std::vector<int> &processor_grid,
|
||||
const std::vector<int> &checker_dim_mask,
|
||||
int checker_dim)
|
||||
const std::vector<int> &simd_layout,
|
||||
const std::vector<int> &processor_grid,
|
||||
const std::vector<int> &checker_dim_mask,
|
||||
int checker_dim)
|
||||
{
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
_checker_dim = checker_dim;
|
||||
assert(checker_dim_mask[checker_dim]==1);
|
||||
assert(checker_dim_mask[checker_dim] == 1);
|
||||
_ndimension = dimensions.size();
|
||||
assert(checker_dim_mask.size()==_ndimension);
|
||||
assert(processor_grid.size()==_ndimension);
|
||||
assert(simd_layout.size()==_ndimension);
|
||||
|
||||
assert(checker_dim_mask.size() == _ndimension);
|
||||
assert(processor_grid.size() == _ndimension);
|
||||
assert(simd_layout.size() == _ndimension);
|
||||
|
||||
_fdimensions.resize(_ndimension);
|
||||
_gdimensions.resize(_ndimension);
|
||||
_ldimensions.resize(_ndimension);
|
||||
@ -153,114 +153,133 @@ public:
|
||||
_simd_layout.resize(_ndimension);
|
||||
_lstart.resize(_ndimension);
|
||||
_lend.resize(_ndimension);
|
||||
|
||||
|
||||
_ostride.resize(_ndimension);
|
||||
_istride.resize(_ndimension);
|
||||
|
||||
|
||||
_fsites = _gsites = _osites = _isites = 1;
|
||||
|
||||
_checker_dim_mask=checker_dim_mask;
|
||||
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
_fdimensions[d] = dimensions[d];
|
||||
_gdimensions[d] = _fdimensions[d];
|
||||
_fsites = _fsites * _fdimensions[d];
|
||||
_gsites = _gsites * _gdimensions[d];
|
||||
|
||||
if (d==_checker_dim) {
|
||||
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
|
||||
}
|
||||
_ldimensions[d] = _gdimensions[d]/_processors[d];
|
||||
_lstart[d] = _processor_coor[d]*_ldimensions[d];
|
||||
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
|
||||
_checker_dim_mask = checker_dim_mask;
|
||||
|
||||
// Use a reduced simd grid
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
|
||||
assert(_rdimensions[d]>0);
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
_fdimensions[d] = dimensions[d];
|
||||
_gdimensions[d] = _fdimensions[d];
|
||||
_fsites = _fsites * _fdimensions[d];
|
||||
_gsites = _gsites * _gdimensions[d];
|
||||
|
||||
// all elements of a simd vector must have same checkerboard.
|
||||
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
|
||||
if ( _simd_layout[d]>1 ) {
|
||||
if ( checker_dim_mask[d] ) {
|
||||
assert( (_rdimensions[d]&0x1) == 0 );
|
||||
}
|
||||
}
|
||||
if (d == _checker_dim)
|
||||
{
|
||||
assert((_gdimensions[d] & 0x1) == 0);
|
||||
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
|
||||
}
|
||||
_ldimensions[d] = _gdimensions[d] / _processors[d];
|
||||
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
|
||||
_lstart[d] = _processor_coor[d] * _ldimensions[d];
|
||||
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
|
||||
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _simd_layout[d];
|
||||
|
||||
// Addressing support
|
||||
if ( d==0 ) {
|
||||
_ostride[d] = 1;
|
||||
_istride[d] = 1;
|
||||
} else {
|
||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
||||
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
||||
}
|
||||
// Use a reduced simd grid
|
||||
_simd_layout[d] = simd_layout[d];
|
||||
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
|
||||
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
|
||||
assert(_rdimensions[d] > 0);
|
||||
|
||||
// all elements of a simd vector must have same checkerboard.
|
||||
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
|
||||
if (_simd_layout[d] > 1)
|
||||
{
|
||||
if (checker_dim_mask[d])
|
||||
{
|
||||
assert((_rdimensions[d] & 0x1) == 0);
|
||||
}
|
||||
}
|
||||
|
||||
_osites *= _rdimensions[d];
|
||||
_isites *= _simd_layout[d];
|
||||
|
||||
// Addressing support
|
||||
if (d == 0)
|
||||
{
|
||||
_ostride[d] = 1;
|
||||
_istride[d] = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
|
||||
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// subplane information
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
_slice_block.resize(_ndimension);
|
||||
_slice_stride.resize(_ndimension);
|
||||
_slice_nblock.resize(_ndimension);
|
||||
|
||||
int block =1;
|
||||
int nblock=1;
|
||||
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
|
||||
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
nblock/=_rdimensions[d];
|
||||
_slice_block[d] =block;
|
||||
_slice_stride[d]=_ostride[d]*_rdimensions[d];
|
||||
_slice_nblock[d]=nblock;
|
||||
block = block*_rdimensions[d];
|
||||
|
||||
int block = 1;
|
||||
int nblock = 1;
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
nblock *= _rdimensions[d];
|
||||
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
nblock /= _rdimensions[d];
|
||||
_slice_block[d] = block;
|
||||
_slice_stride[d] = _ostride[d] * _rdimensions[d];
|
||||
_slice_nblock[d] = nblock;
|
||||
block = block * _rdimensions[d];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Create a checkerboard lookup table
|
||||
////////////////////////////////////////////////
|
||||
int rvol = 1;
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
rvol=rvol * _rdimensions[d];
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
rvol = rvol * _rdimensions[d];
|
||||
}
|
||||
_checker_board.resize(rvol);
|
||||
for(int osite=0;osite<_osites;osite++){
|
||||
_checker_board[osite] = CheckerBoardFromOindex (osite);
|
||||
for (int osite = 0; osite < _osites; osite++)
|
||||
{
|
||||
_checker_board[osite] = CheckerBoardFromOindex(osite);
|
||||
}
|
||||
|
||||
};
|
||||
protected:
|
||||
|
||||
protected:
|
||||
virtual int oIndex(std::vector<int> &coor)
|
||||
{
|
||||
int idx=0;
|
||||
for(int d=0;d<_ndimension;d++) {
|
||||
if( d==_checker_dim ) {
|
||||
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
|
||||
} else {
|
||||
idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
|
||||
}
|
||||
int idx = 0;
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
if (d == _checker_dim)
|
||||
{
|
||||
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
|
||||
}
|
||||
else
|
||||
{
|
||||
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
};
|
||||
|
||||
|
||||
virtual int iIndex(std::vector<int> &lcoor)
|
||||
{
|
||||
int idx=0;
|
||||
for(int d=0;d<_ndimension;d++) {
|
||||
if( d==_checker_dim ) {
|
||||
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
|
||||
} else {
|
||||
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
int idx = 0;
|
||||
for (int d = 0; d < _ndimension; d++)
|
||||
{
|
||||
if (d == _checker_dim)
|
||||
{
|
||||
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
|
||||
}
|
||||
else
|
||||
{
|
||||
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/GridCore.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
@ -33,8 +37,11 @@ namespace Grid {
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////
|
||||
void * CartesianCommunicator::ShmCommBuf;
|
||||
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
||||
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
||||
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
|
||||
CartesianCommunicator::CommunicatorPolicy_t
|
||||
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
||||
int CartesianCommunicator::nCommThreads = -1;
|
||||
int CartesianCommunicator::Hugepages = 0;
|
||||
|
||||
/////////////////////////////////
|
||||
// Alloc, free shmem region
|
||||
@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
||||
GlobalSumVector((double *)c,2*N);
|
||||
}
|
||||
|
||||
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
|
||||
#if !defined( GRID_COMMS_MPI3)
|
||||
|
||||
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
|
||||
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes)
|
||||
#endif
|
||||
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
SendToRecvFromComplete(list);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined( GRID_COMMS_MPI3)
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void){};
|
||||
|
||||
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
|
||||
@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
|
||||
return NULL;
|
||||
}
|
||||
void CartesianCommunicator::ShmInitGeneric(void){
|
||||
#if 1
|
||||
|
||||
int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
|
||||
#ifdef MAP_HUGETLB
|
||||
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
|
||||
if (ShmCommBuf == (void *)MAP_FAILED) {
|
||||
perror("mmap failed ");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#ifdef MADV_HUGEPAGE
|
||||
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
|
||||
#endif
|
||||
#else
|
||||
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
|
||||
ShmCommBuf=(void *)&ShmBufStorageVector[0];
|
||||
#endif
|
||||
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#ifdef GRID_COMMS_MPI3
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifdef GRID_COMMS_MPI3L
|
||||
#ifdef GRID_COMMS_MPIT
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifdef GRID_COMMS_SHMEM
|
||||
@ -50,12 +50,24 @@ namespace Grid {
|
||||
class CartesianCommunicator {
|
||||
public:
|
||||
|
||||
// 65536 ranks per node adequate for now
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Isend/Irecv/Wait, or Sendrecv blocking
|
||||
////////////////////////////////////////////
|
||||
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
||||
static CommunicatorPolicy_t CommunicatorPolicy;
|
||||
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Up to 65536 ranks per node adequate for now
|
||||
// 128MB shared memory for comms enought for 48^4 local vol comms
|
||||
// Give external control (command line override?) of this
|
||||
|
||||
static const int MAXLOG2RANKSPERNODE = 16;
|
||||
static uint64_t MAX_MPI_SHM_BYTES;
|
||||
///////////////////////////////////////////
|
||||
static const int MAXLOG2RANKSPERNODE = 16;
|
||||
static uint64_t MAX_MPI_SHM_BYTES;
|
||||
static int nCommThreads;
|
||||
// use explicit huge pages
|
||||
static int Hugepages;
|
||||
|
||||
// Communicator should know nothing of the physics grid, only processor grid.
|
||||
int _Nprocessors; // How many in all
|
||||
@ -64,14 +76,18 @@ class CartesianCommunicator {
|
||||
std::vector<int> _processor_coor; // linear processor coordinate
|
||||
unsigned long _ndimension;
|
||||
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||
static MPI_Comm communicator_world;
|
||||
MPI_Comm communicator;
|
||||
|
||||
MPI_Comm communicator;
|
||||
std::vector<MPI_Comm> communicator_halo;
|
||||
|
||||
typedef MPI_Request CommsRequest_t;
|
||||
#else
|
||||
typedef int CommsRequest_t;
|
||||
#endif
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Helper functionality for SHM Windows common to all other impls
|
||||
////////////////////////////////////////////////////////////////////
|
||||
@ -117,11 +133,7 @@ class CartesianCommunicator {
|
||||
/////////////////////////////////
|
||||
static void * ShmCommBuf;
|
||||
|
||||
// Isend/Irecv/Wait, or Sendrecv blocking
|
||||
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
||||
static CommunicatorPolicy_t CommunicatorPolicy;
|
||||
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
||||
|
||||
|
||||
size_t heap_top;
|
||||
size_t heap_bytes;
|
||||
|
||||
@ -211,14 +223,21 @@ class CartesianCommunicator {
|
||||
|
||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||
|
||||
double StencilSendToRecvFrom(void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes,int dir);
|
||||
|
||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes,int dir);
|
||||
|
||||
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||
|
||||
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
|
||||
void StencilBarrier(void);
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
|
@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
#ifndef SHM_HUGETLB
|
||||
#define SHM_HUGETLB 04000
|
||||
#endif
|
||||
|
||||
|
||||
namespace Grid {
|
||||
|
||||
@ -200,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
ShmCommBuf = 0;
|
||||
ShmCommBufs.resize(ShmSize);
|
||||
|
||||
#if 1
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbf and others map filesystems as mappable huge pages
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMMMAP
|
||||
char shm_name [NAME_MAX];
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||
// printf("Opening file %s \n",shm_name);
|
||||
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
|
||||
if ( fd == -1) {
|
||||
printf("open %s failed\n",shm_name);
|
||||
perror("open hugetlbfs");
|
||||
exit(0);
|
||||
}
|
||||
int mmap_flag = MAP_SHARED ;
|
||||
#ifdef MAP_POPULATE
|
||||
mmap_flag|=MAP_POPULATE;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
|
||||
if ( ptr == (void *)MAP_FAILED ) {
|
||||
printf("mmap %s failed\n",shm_name);
|
||||
perror("failed mmap"); assert(0);
|
||||
}
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
ShmCommBufs[r] =ptr;
|
||||
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
|
||||
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
|
||||
// the posix shm virtual file system
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMOPEN
|
||||
char shm_name [NAME_MAX];
|
||||
if ( ShmRank == 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
@ -213,13 +250,22 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
|
||||
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
||||
ftruncate(fd, size);
|
||||
|
||||
int mmap_flag = MAP_SHARED;
|
||||
#ifdef MAP_POPULATE
|
||||
mmap_flag |= MAP_POPULATE;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if (Hugepages) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||
|
||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
|
||||
// Try to force numa domain on the shm segment if we have numaif.h
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
|
||||
#if 0
|
||||
//#ifdef HAVE_NUMAIF_H
|
||||
int status;
|
||||
int flags=MPOL_MF_MOVE;
|
||||
#ifdef KNL
|
||||
@ -236,7 +282,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
if (ierr && (page==0)) perror("numa relocate command failed");
|
||||
}
|
||||
#endif
|
||||
ShmCommBufs[r] =ptr;
|
||||
ShmCommBufs[r] =ptr;
|
||||
|
||||
}
|
||||
}
|
||||
@ -258,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
ShmCommBufs[r] =ptr;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SHMGET SHMAT and SHM_HUGETLB flag
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMGET
|
||||
std::vector<int> shmids(ShmSize);
|
||||
|
||||
if ( ShmRank == 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||
key_t key = 0x4545 + r;
|
||||
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
|
||||
key_t key = IPC_PRIVATE;
|
||||
int flags = IPC_CREAT | SHM_R | SHM_W;
|
||||
#ifdef SHM_HUGETLB
|
||||
if (Hugepages) flags|=SHM_HUGETLB;
|
||||
#endif
|
||||
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
|
||||
int errsv = errno;
|
||||
printf("Errno %d\n",errsv);
|
||||
printf("key %d\n",key);
|
||||
printf("size %lld\n",size);
|
||||
printf("flags %d\n",flags);
|
||||
perror("shmget");
|
||||
exit(1);
|
||||
} else {
|
||||
printf("shmid: 0x%x\n", shmids[r]);
|
||||
}
|
||||
printf("shmid: 0x%x\n", shmids[r]);
|
||||
}
|
||||
}
|
||||
MPI_Barrier(ShmComm);
|
||||
@ -397,8 +454,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
int ierr;
|
||||
communicator=communicator_world;
|
||||
|
||||
_ndimension = processors.size();
|
||||
|
||||
communicator_halo.resize (2*_ndimension);
|
||||
for(int i=0;i<_ndimension*2;i++){
|
||||
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Assert power of two shm_size.
|
||||
////////////////////////////////////////////////////////////////
|
||||
@ -621,13 +684,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
}
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes,int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
|
||||
StencilSendToRecvFromComplete(list,dir);
|
||||
return offbytes;
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes,int dir)
|
||||
{
|
||||
assert(dir < communicator_halo.size());
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
@ -646,26 +723,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
gfrom = MPI_UNDEFINED;
|
||||
#endif
|
||||
if ( gfrom ==MPI_UNDEFINED) {
|
||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=bytes;
|
||||
}
|
||||
|
||||
if ( gdest == MPI_UNDEFINED ) {
|
||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=bytes;
|
||||
}
|
||||
|
||||
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
||||
this->StencilSendToRecvFromComplete(list);
|
||||
this->StencilSendToRecvFromComplete(list,dir);
|
||||
}
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
|
286
lib/communicator/Communicator_mpit.cc
Normal file
286
lib/communicator/Communicator_mpit.cc
Normal file
@ -0,0 +1,286 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/GridCore.h>
|
||||
#include <Grid/GridQCDcore.h>
|
||||
#include <Grid/qcd/action/ActionCore.h>
|
||||
#include <mpi.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
// Should error check all MPI calls.
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
int flag;
|
||||
int provided;
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
if ( provided != MPI_THREAD_MULTIPLE ) {
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||
}
|
||||
}
|
||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||
ShmInitGeneric();
|
||||
}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
_ndimension = processors.size();
|
||||
std::vector<int> periodic(_ndimension,1);
|
||||
|
||||
_Nprocessors=1;
|
||||
_processors = processors;
|
||||
_processor_coor.resize(_ndimension);
|
||||
|
||||
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||
MPI_Comm_rank(communicator,&_processor);
|
||||
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||
|
||||
for(int i=0;i<_ndimension;i++){
|
||||
_Nprocessors*=_processors[i];
|
||||
}
|
||||
|
||||
communicator_halo.resize (2*_ndimension);
|
||||
for(int i=0;i<_ndimension*2;i++){
|
||||
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||
}
|
||||
|
||||
int Size;
|
||||
MPI_Comm_size(communicator,&Size);
|
||||
|
||||
assert(Size==_Nprocessors);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||
assert(ierr==0);
|
||||
}
|
||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||
{
|
||||
int rank;
|
||||
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||
assert(ierr==0);
|
||||
return rank;
|
||||
}
|
||||
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||
{
|
||||
coor.resize(_ndimension);
|
||||
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Status stat;
|
||||
assert(sender != receiver);
|
||||
int tag = sender;
|
||||
if ( _processor == sender ) {
|
||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||
}
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
} else {
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||
recv,bytes,MPI_CHAR,from, from,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
int nreq=list.size();
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
///////////////////////////////////////////////////////
|
||||
// Should only be used prior to Grid Init finished.
|
||||
// Check for this?
|
||||
///////////////////////////////////////////////////////
|
||||
int CartesianCommunicator::RankWorld(void){
|
||||
int r;
|
||||
MPI_Comm_rank(communicator_world,&r);
|
||||
return r;
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator_world);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes,int dir)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
assert(dir < communicator_halo.size());
|
||||
|
||||
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
MPI_Request req[2];
|
||||
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||
|
||||
list.push_back(req[0]);
|
||||
list.push_back(req[1]);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
int nreq=waitall.size();
|
||||
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
|
||||
};
|
||||
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes,int dir)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
assert(dir < communicator_halo.size());
|
||||
|
||||
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
MPI_Request req[2];
|
||||
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/cshift/Cshift_mpi.h>
|
||||
#endif
|
||||
|
||||
#ifdef GRID_COMMS_MPI3L
|
||||
#ifdef GRID_COMMS_MPIT
|
||||
#include <Grid/cshift/Cshift_mpi.h>
|
||||
#endif
|
||||
|
||||
|
@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
||||
{
|
||||
int NN = BlockSolverGrid->_ndimension;
|
||||
@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
|
||||
}
|
||||
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||
}
|
||||
*/
|
||||
|
||||
template<class vobj>
|
||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||
@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
||||
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
||||
|
||||
GridBase *FullGrid = X._grid;
|
||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
|
||||
Lattice<vobj> Xslice(SliceGrid);
|
||||
Lattice<vobj> Rslice(SliceGrid);
|
||||
// Lattice<vobj> Xslice(SliceGrid);
|
||||
// Lattice<vobj> Rslice(SliceGrid);
|
||||
|
||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||
int nh = FullGrid->_ndimension;
|
||||
int nl = SliceGrid->_ndimension;
|
||||
// int nl = SliceGrid->_ndimension;
|
||||
int nl = nh-1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
||||
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
||||
|
||||
GridBase *FullGrid = X._grid;
|
||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
|
||||
Lattice<vobj> Xslice(SliceGrid);
|
||||
Lattice<vobj> Rslice(SliceGrid);
|
||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
// Lattice<vobj> Xslice(SliceGrid);
|
||||
// Lattice<vobj> Rslice(SliceGrid);
|
||||
|
||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||
int nh = FullGrid->_ndimension;
|
||||
int nl = SliceGrid->_ndimension;
|
||||
// int nl = SliceGrid->_ndimension;
|
||||
int nl=1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
GridBase *FullGrid = lhs._grid;
|
||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
|
||||
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
||||
|
||||
Lattice<vobj> Lslice(SliceGrid);
|
||||
Lattice<vobj> Rslice(SliceGrid);
|
||||
// Lattice<vobj> Lslice(SliceGrid);
|
||||
// Lattice<vobj> Rslice(SliceGrid);
|
||||
|
||||
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||
|
||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||
int nh = FullGrid->_ndimension;
|
||||
int nl = SliceGrid->_ndimension;
|
||||
// int nl = SliceGrid->_ndimension;
|
||||
int nl = nh-1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
@ -550,6 +554,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
||||
mat += mat_thread;
|
||||
}
|
||||
}
|
||||
|
||||
for(int i=0;i<Nblock;i++){
|
||||
for(int j=0;j<Nblock;j++){
|
||||
ComplexD sum = mat(i,j);
|
||||
FullGrid->GlobalSum(sum);
|
||||
mat(i,j)=sum;
|
||||
}}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
|
||||
////////////////////////////////////////////////////////////
|
||||
void Grid_quiesce_nodes(void) {
|
||||
int me = 0;
|
||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
|
||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
#endif
|
||||
#ifdef GRID_COMMS_SHMEM
|
||||
|
@ -29,7 +29,7 @@
|
||||
#ifndef GRID_BINARY_IO_H
|
||||
#define GRID_BINARY_IO_H
|
||||
|
||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
|
||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
|
||||
#define USE_MPI_IO
|
||||
#else
|
||||
#undef USE_MPI_IO
|
||||
@ -98,35 +98,39 @@ class BinaryIO {
|
||||
|
||||
NerscChecksum(grid,scalardata,nersc_csum);
|
||||
}
|
||||
|
||||
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
|
||||
|
||||
template <class fobj>
|
||||
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
|
||||
{
|
||||
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
|
||||
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
|
||||
|
||||
|
||||
uint64_t lsites =grid->lSites();
|
||||
if (fbuf.size()==1) {
|
||||
lsites=1;
|
||||
uint64_t lsites = grid->lSites();
|
||||
if (fbuf.size() == 1)
|
||||
{
|
||||
lsites = 1;
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
uint32_t nersc_csum_thr=0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
uint32_t nersc_csum_thr = 0;
|
||||
|
||||
#pragma omp for
|
||||
for(uint64_t local_site=0;local_site<lsites;local_site++){
|
||||
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
|
||||
for(uint64_t j=0;j<size32;j++){
|
||||
nersc_csum_thr=nersc_csum_thr+site_buf[j];
|
||||
}
|
||||
#pragma omp for
|
||||
for (uint64_t local_site = 0; local_site < lsites; local_site++)
|
||||
{
|
||||
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
|
||||
for (uint64_t j = 0; j < size32; j++)
|
||||
{
|
||||
nersc_csum_thr = nersc_csum_thr + site_buf[j];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp critical
|
||||
#pragma omp critical
|
||||
{
|
||||
nersc_csum += nersc_csum_thr;
|
||||
nersc_csum += nersc_csum_thr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
|
||||
{
|
||||
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
|
||||
@ -266,7 +270,7 @@ class BinaryIO {
|
||||
grid->Barrier();
|
||||
GridStopWatch timer;
|
||||
GridStopWatch bstimer;
|
||||
|
||||
|
||||
nersc_csum=0;
|
||||
scidac_csuma=0;
|
||||
scidac_csumb=0;
|
||||
@ -362,18 +366,22 @@ class BinaryIO {
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
} else {
|
||||
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
|
||||
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
|
||||
std::ifstream fin;
|
||||
fin.open(file,std::ios::binary|std::ios::in);
|
||||
if ( control & BINARYIO_MASTER_APPEND ) {
|
||||
fin.seekg(-sizeof(fobj),fin.end);
|
||||
} else {
|
||||
fin.seekg(offset+myrank*lsites*sizeof(fobj));
|
||||
}
|
||||
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
|
||||
fin.close();
|
||||
} else {
|
||||
std::cout << GridLogMessage << "C++ read I/O " << file << " : "
|
||||
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
|
||||
std::ifstream fin;
|
||||
fin.open(file, std::ios::binary | std::ios::in);
|
||||
if (control & BINARYIO_MASTER_APPEND)
|
||||
{
|
||||
fin.seekg(-sizeof(fobj), fin.end);
|
||||
}
|
||||
else
|
||||
{
|
||||
fin.seekg(offset + myrank * lsites * sizeof(fobj));
|
||||
}
|
||||
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
|
||||
assert(fin.fail() == 0);
|
||||
fin.close();
|
||||
}
|
||||
timer.Stop();
|
||||
|
||||
@ -405,30 +413,78 @@ class BinaryIO {
|
||||
timer.Start();
|
||||
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
|
||||
#ifdef USE_MPI_IO
|
||||
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
|
||||
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
|
||||
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
|
||||
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
|
||||
MPI_File_close(&fh);
|
||||
MPI_Type_free(&fileArray);
|
||||
MPI_Type_free(&localArray);
|
||||
std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
|
||||
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
|
||||
std::cout << GridLogMessage << "Checking for errors" << std::endl;
|
||||
if (ierr != MPI_SUCCESS)
|
||||
{
|
||||
char error_string[BUFSIZ];
|
||||
int length_of_error_string, error_class;
|
||||
|
||||
MPI_Error_class(ierr, &error_class);
|
||||
MPI_Error_string(error_class, error_string, &length_of_error_string);
|
||||
fprintf(stderr, "%3d: %s\n", myrank, error_string);
|
||||
MPI_Error_string(ierr, error_string, &length_of_error_string);
|
||||
fprintf(stderr, "%3d: %s\n", myrank, error_string);
|
||||
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
|
||||
}
|
||||
|
||||
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
|
||||
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
|
||||
assert(ierr == 0);
|
||||
|
||||
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
|
||||
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
|
||||
assert(ierr == 0);
|
||||
|
||||
MPI_File_close(&fh);
|
||||
MPI_Type_free(&fileArray);
|
||||
MPI_Type_free(&localArray);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
} else {
|
||||
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
|
||||
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
|
||||
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
|
||||
if ( control & BINARYIO_MASTER_APPEND ) {
|
||||
|
||||
std::ofstream fout;
|
||||
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
|
||||
try {
|
||||
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
|
||||
} catch (const std::fstream::failure& exc) {
|
||||
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
|
||||
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
|
||||
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
|
||||
#ifdef USE_MPI_IO
|
||||
MPI_Abort(MPI_COMM_WORLD,1);
|
||||
#else
|
||||
exit(1);
|
||||
#endif
|
||||
}
|
||||
std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
|
||||
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
|
||||
|
||||
if ( control & BINARYIO_MASTER_APPEND ) {
|
||||
fout.seekp(0,fout.end);
|
||||
} else {
|
||||
fout.seekp(offset+myrank*lsites*sizeof(fobj));
|
||||
}
|
||||
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
|
||||
|
||||
try {
|
||||
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
|
||||
}
|
||||
catch (const std::fstream::failure& exc) {
|
||||
std::cout << "Exception in writing file " << file << std::endl;
|
||||
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
|
||||
#ifdef USE_MPI_IO
|
||||
MPI_Abort(MPI_COMM_WORLD,1);
|
||||
#else
|
||||
exit(1);
|
||||
#endif
|
||||
}
|
||||
|
||||
fout.close();
|
||||
}
|
||||
timer.Stop();
|
||||
}
|
||||
}
|
||||
timer.Stop();
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage<<"IOobject: ";
|
||||
if ( control & BINARYIO_READ) std::cout << " read ";
|
||||
@ -442,11 +498,14 @@ class BinaryIO {
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Safety check
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
grid->Barrier();
|
||||
grid->GlobalSum(nersc_csum);
|
||||
grid->GlobalXOR(scidac_csuma);
|
||||
grid->GlobalXOR(scidac_csumb);
|
||||
grid->Barrier();
|
||||
// if the data size is 1 we do not want to sum over the MPI ranks
|
||||
if (iodata.size() != 1){
|
||||
grid->Barrier();
|
||||
grid->GlobalSum(nersc_csum);
|
||||
grid->GlobalXOR(scidac_csuma);
|
||||
grid->GlobalXOR(scidac_csumb);
|
||||
grid->Barrier();
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
@ -546,9 +605,9 @@ class BinaryIO {
|
||||
int gsites = grid->gSites();
|
||||
int lsites = grid->lSites();
|
||||
|
||||
uint32_t nersc_csum_tmp;
|
||||
uint32_t scidac_csuma_tmp;
|
||||
uint32_t scidac_csumb_tmp;
|
||||
uint32_t nersc_csum_tmp = 0;
|
||||
uint32_t scidac_csuma_tmp = 0;
|
||||
uint32_t scidac_csumb_tmp = 0;
|
||||
|
||||
GridStopWatch timer;
|
||||
|
||||
|
@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
||||
for(int i=0; i < Ls; i++){
|
||||
as[i] = 1.0;
|
||||
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
|
||||
// assert(fabs(omega[i])>0.0);
|
||||
assert(omega[i]!=Coeff_t(0.0));
|
||||
bs[i] = 0.5*(bpc/omega[i] + bmc);
|
||||
cs[i] = 0.5*(bpc/omega[i] - bmc);
|
||||
}
|
||||
@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
||||
|
||||
for(int i=0;i<Ls;i++){
|
||||
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
|
||||
// assert(fabs(bee[i])>0.0);
|
||||
assert(bee[i]!=Coeff_t(0.0));
|
||||
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
|
||||
beo[i]=as[i]*bs[i];
|
||||
ceo[i]=-as[i]*cs[i];
|
||||
@ -455,11 +455,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
||||
dee[i] = bee[i];
|
||||
|
||||
if ( i < Ls-1 ) {
|
||||
|
||||
assert(bee[i]!=Coeff_t(0.0));
|
||||
assert(bee[0]!=Coeff_t(0.0));
|
||||
|
||||
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
||||
|
||||
leem[i]=mass*cee[Ls-1]/bee[0];
|
||||
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
|
||||
for(int j=0;j<i;j++) {
|
||||
assert(bee[j+1]!=Coeff_t(0.0));
|
||||
leem[i]*= aee[j]/bee[j+1];
|
||||
}
|
||||
|
||||
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
||||
|
||||
@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
||||
{
|
||||
Coeff_t delta_d=mass*cee[Ls-1];
|
||||
for(int j=0;j<Ls-1;j++) {
|
||||
// assert(fabs(bee[j])>0.0);
|
||||
assert(bee[j] != Coeff_t(0.0));
|
||||
delta_d *= cee[j]/bee[j];
|
||||
}
|
||||
dee[Ls-1] += delta_d;
|
||||
|
@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
||||
{
|
||||
Compressor compressor;
|
||||
int LLs = in._grid->_rdimensions[0];
|
||||
|
||||
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
DhopCommTime -= usecond();
|
||||
st.HaloExchange(in,compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
if (dag == DaggerYes) {
|
||||
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||
@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||
}
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||
conformable(in._grid,out._grid); // drops the cb check
|
||||
|
||||
@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||
conformable(in._grid,out._grid); // drops the cb check
|
||||
|
||||
@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopCalls+=2;
|
||||
conformable(in._grid,FermionGrid()); // verifies full grid
|
||||
conformable(in._grid,out._grid);
|
||||
|
||||
@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
|
||||
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Report(void)
|
||||
{
|
||||
std::vector<int> latt = GridDefaultLatt();
|
||||
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _FourDimGrid->_Nprocessors;
|
||||
RealD NN = _FourDimGrid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
|
||||
<< DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
|
||||
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
// Average the compute time
|
||||
_FourDimGrid->GlobalSum(DhopComputeTime);
|
||||
DhopComputeTime/=NP;
|
||||
|
||||
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||
|
||||
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
||||
{
|
||||
DhopCalls = 0;
|
||||
DhopTotalTime = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Implement the general interface. Here we use SAME mass on all slices
|
||||
|
@ -55,6 +55,16 @@ namespace QCD {
|
||||
FermionField _tmp;
|
||||
FermionField &tmp(void) { return _tmp; }
|
||||
|
||||
////////////////////////////////////////
|
||||
// Performance monitoring
|
||||
////////////////////////////////////////
|
||||
void Report(void);
|
||||
void ZeroCounters(void);
|
||||
double DhopTotalTime;
|
||||
double DhopCalls;
|
||||
double DhopCommTime;
|
||||
double DhopComputeTime;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
|
||||
template<class vobj,class cobj>
|
||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
||||
public:
|
||||
|
||||
double timer0;
|
||||
double timer1;
|
||||
double timer2;
|
||||
double timer3;
|
||||
double timer4;
|
||||
double timer5;
|
||||
double timer6;
|
||||
uint64_t callsi;
|
||||
void ZeroCountersi(void)
|
||||
{
|
||||
timer0=0;
|
||||
timer1=0;
|
||||
timer2=0;
|
||||
timer3=0;
|
||||
timer4=0;
|
||||
timer5=0;
|
||||
timer6=0;
|
||||
callsi=0;
|
||||
}
|
||||
void Reporti(int calls)
|
||||
{
|
||||
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
|
||||
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
|
||||
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
|
||||
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
|
||||
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||
}
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
|
||||
std::vector<int> same_node;
|
||||
@ -252,6 +278,7 @@ public:
|
||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
||||
same_node(npoints)
|
||||
{
|
||||
ZeroCountersi();
|
||||
surface_list.resize(0);
|
||||
};
|
||||
|
||||
@ -261,7 +288,6 @@ public:
|
||||
// Here we know the distance is 1 for WilsonStencil
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
same_node[point] = this->SameNode(point);
|
||||
// std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
|
||||
}
|
||||
|
||||
for(int site = 0 ;site< vol4;site++){
|
||||
@ -282,17 +308,28 @@ public:
|
||||
{
|
||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||
this->HaloExchangeOptGather(source,compress);
|
||||
this->CommunicateBegin(reqs);
|
||||
this->CommunicateComplete(reqs);
|
||||
double t1=usecond();
|
||||
// Asynchronous MPI calls multidirectional, Isend etc...
|
||||
// this->CommunicateBegin(reqs);
|
||||
// this->CommunicateComplete(reqs);
|
||||
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
|
||||
this->Communicate();
|
||||
double t2=usecond(); timer1 += t2-t1;
|
||||
this->CommsMerge(compress);
|
||||
double t3=usecond(); timer2 += t3-t2;
|
||||
this->CommsMergeSHM(compress);
|
||||
double t4=usecond(); timer3 += t4-t3;
|
||||
}
|
||||
|
||||
template <class compressor>
|
||||
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
this->Prepare();
|
||||
double t0=usecond();
|
||||
this->HaloGatherOpt(source,compress);
|
||||
double t1=usecond();
|
||||
timer0 += t1-t0;
|
||||
callsi++;
|
||||
}
|
||||
|
||||
template <class compressor>
|
||||
@ -304,7 +341,9 @@ public:
|
||||
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
|
||||
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
|
||||
|
||||
this->mpi3synctime_g-=usecond();
|
||||
this->_grid->StencilBarrier();
|
||||
this->mpi3synctime_g+=usecond();
|
||||
|
||||
assert(source._grid==this->_grid);
|
||||
this->halogtime-=usecond();
|
||||
@ -323,7 +362,6 @@ public:
|
||||
int dag = compress.dag;
|
||||
int face_idx=0;
|
||||
if ( dag ) {
|
||||
// std::cout << " Optimised Dagger compress " <<std::endl;
|
||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||
|
@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
int vol4;
|
||||
vol4=FourDimGrid.oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol4);
|
||||
|
||||
vol4=FourDimRedBlackGrid.oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||
|
||||
std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
|
||||
<<" " << StencilEven.surface_list.size()<<std::endl;
|
||||
// std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
|
||||
// <<" " << StencilEven.surface_list.size()<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Report(void)
|
||||
{
|
||||
std::vector<int> latt = GridDefaultLatt();
|
||||
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _FourDimGrid->_Nprocessors;
|
||||
RealD NN = _FourDimGrid->NodeCount();
|
||||
RealD NP = _FourDimGrid->_Nprocessors;
|
||||
RealD NN = _FourDimGrid->NodeCount();
|
||||
RealD volume = Ls;
|
||||
std::vector<int> latt = _FourDimGrid->GlobalDimensions();
|
||||
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
|
||||
if ( DhopCalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
if ( DhopCalls > 0){
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
|
||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
Stencil.ZeroCountersi();
|
||||
StencilEven.ZeroCountersi();
|
||||
StencilOdd.ZeroCountersi();
|
||||
}
|
||||
|
||||
|
||||
@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.HaloExchangeOptGather(in,compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||
|
||||
// Rely on async comms; start comms before merge of local data
|
||||
DhopCommTime-=usecond();
|
||||
st.CommunicateBegin(reqs);
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMergeSHM(compressor);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
// Perhaps use omp task and region
|
||||
#pragma omp parallel
|
||||
double ctime=0;
|
||||
double ptime=0;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int me = omp_get_thread_num();
|
||||
int myoff, mywork;
|
||||
|
||||
GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
|
||||
int sF = LLs * myoff;
|
||||
|
||||
if ( me == 0 ) {
|
||||
st.CommunicateComplete(reqs);
|
||||
DhopCommTime+=usecond();
|
||||
} else {
|
||||
// Interior links in stencil
|
||||
if ( me==1 ) DhopComputeTime-=usecond();
|
||||
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
|
||||
else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
|
||||
if ( me==1 ) DhopComputeTime+=usecond();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
double start = usecond();
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = U._grid->oSites();
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
|
||||
// do the compute
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
int sF = LLs * sU;
|
||||
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
int sF = LLs * sU;
|
||||
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
||||
}
|
||||
}
|
||||
ptime = usecond() - start;
|
||||
}
|
||||
{
|
||||
double start = usecond();
|
||||
st.CommunicateThreaded();
|
||||
ctime = usecond() - start;
|
||||
}
|
||||
}
|
||||
DhopCommTime += ctime;
|
||||
DhopComputeTime+=ptime;
|
||||
|
||||
// First to enter, last to leave timing
|
||||
st.CollateThreads();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
// Load imbalance alert. Should use dynamic schedule OMP for loop
|
||||
// Perhaps create a list of only those sites with face work, and
|
||||
// load balance process the list.
|
||||
DhopComputeTime2-=usecond();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
|
@ -165,7 +165,7 @@ class HMCResourceManager {
|
||||
// Grids
|
||||
//////////////////////////////////////////////////////////////
|
||||
|
||||
void AddGrid(std::string s, GridModule& M) {
|
||||
void AddGrid(const std::string s, GridModule& M) {
|
||||
// Check for name clashes
|
||||
auto search = Grids.find(s);
|
||||
if (search != Grids.end()) {
|
||||
@ -174,14 +174,24 @@ class HMCResourceManager {
|
||||
exit(1);
|
||||
}
|
||||
Grids[s] = std::move(M);
|
||||
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
|
||||
std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
|
||||
std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
|
||||
Grids[s].show_full_decomposition();
|
||||
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
|
||||
}
|
||||
|
||||
// Add a named grid set, 4d shortcut
|
||||
void AddFourDimGrid(std::string s) {
|
||||
void AddFourDimGrid(const std::string s) {
|
||||
GridFourDimModule<vComplex> Mod;
|
||||
AddGrid(s, Mod);
|
||||
}
|
||||
|
||||
// Add a named grid set, 4d shortcut + tweak simd lanes
|
||||
void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
|
||||
GridFourDimModule<vComplex> Mod(simd_decomposition);
|
||||
AddGrid(s, Mod);
|
||||
}
|
||||
|
||||
|
||||
GridCartesian* GetCartesian(std::string s = "") {
|
||||
|
@ -33,28 +33,29 @@ directory
|
||||
namespace Grid {
|
||||
|
||||
// Resources
|
||||
// Modules for grids
|
||||
// Modules for grids
|
||||
|
||||
// Introduce another namespace HMCModules?
|
||||
|
||||
class GridModuleParameters: Serializable{
|
||||
class GridModuleParameters: Serializable{
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
|
||||
std::string, lattice,
|
||||
std::string, mpi);
|
||||
|
||||
std::vector<int> getLattice(){return strToVec<int>(lattice);}
|
||||
std::vector<int> getMpi() {return strToVec<int>(mpi);}
|
||||
std::vector<int> getLattice() const {return strToVec<int>(lattice);}
|
||||
std::vector<int> getMpi() const {return strToVec<int>(mpi);}
|
||||
|
||||
void check(){
|
||||
if (getLattice().size() != getMpi().size()) {
|
||||
std::cout << GridLogError
|
||||
|
||||
void check() const {
|
||||
if (getLattice().size() != getMpi().size() ) {
|
||||
std::cout << GridLogError
|
||||
<< "Error in GridModuleParameters: lattice and mpi dimensions "
|
||||
"do not match"
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class ReaderClass>
|
||||
GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
|
||||
@ -75,51 +76,94 @@ private:
|
||||
// Lower level class
|
||||
class GridModule {
|
||||
public:
|
||||
GridCartesian* get_full() {
|
||||
GridCartesian* get_full() {
|
||||
std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
|
||||
return grid_.get(); }
|
||||
GridRedBlackCartesian* get_rb() {
|
||||
GridRedBlackCartesian* get_rb() {
|
||||
std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
|
||||
return rbgrid_.get(); }
|
||||
|
||||
void set_full(GridCartesian* grid) { grid_.reset(grid); }
|
||||
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
|
||||
void show_full_decomposition(){ grid_->show_decomposition(); }
|
||||
void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
|
||||
|
||||
protected:
|
||||
std::unique_ptr<GridCartesian> grid_;
|
||||
std::unique_ptr<GridRedBlackCartesian> rbgrid_;
|
||||
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////
|
||||
// Classes for the user
|
||||
////////////////////////////////////
|
||||
// Note: the space time grid should be out of the QCD namespace
|
||||
template< class vector_type>
|
||||
class GridFourDimModule : public GridModule {
|
||||
public:
|
||||
GridFourDimModule() {
|
||||
template <class vector_type>
|
||||
class GridFourDimModule : public GridModule
|
||||
{
|
||||
public:
|
||||
GridFourDimModule()
|
||||
{
|
||||
using namespace QCD;
|
||||
set_full(SpaceTimeGrid::makeFourDimGrid(
|
||||
GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
|
||||
GridDefaultLatt(),
|
||||
GridDefaultSimd(4, vector_type::Nsimd()),
|
||||
GridDefaultMpi()));
|
||||
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
||||
}
|
||||
|
||||
GridFourDimModule(GridModuleParameters Params) {
|
||||
GridFourDimModule(const std::vector<int> tweak_simd)
|
||||
{
|
||||
using namespace QCD;
|
||||
if (tweak_simd.size() != 4)
|
||||
{
|
||||
std::cout << GridLogError
|
||||
<< "Error in GridFourDimModule: SIMD size different from 4"
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Checks that the product agrees with the expectation
|
||||
int simd_sum = 1;
|
||||
for (auto &n : tweak_simd)
|
||||
simd_sum *= n;
|
||||
std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Sum: " << simd_sum << std::endl;
|
||||
|
||||
if (simd_sum == vector_type::Nsimd())
|
||||
{
|
||||
set_full(SpaceTimeGrid::makeFourDimGrid(
|
||||
GridDefaultLatt(),
|
||||
tweak_simd,
|
||||
GridDefaultMpi()));
|
||||
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << GridLogError
|
||||
<< "Error in GridFourDimModule: SIMD lanes must sum to "
|
||||
<< vector_type::Nsimd()
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
GridFourDimModule(const GridModuleParameters Params)
|
||||
{
|
||||
using namespace QCD;
|
||||
Params.check();
|
||||
std::vector<int> lattice_v = Params.getLattice();
|
||||
std::vector<int> mpi_v = Params.getMpi();
|
||||
if (lattice_v.size() == 4) {
|
||||
if (lattice_v.size() == 4)
|
||||
{
|
||||
set_full(SpaceTimeGrid::makeFourDimGrid(
|
||||
lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
|
||||
lattice_v,
|
||||
GridDefaultSimd(4, vector_type::Nsimd()),
|
||||
mpi_v));
|
||||
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
||||
} else {
|
||||
std::cout << GridLogError
|
||||
<< "Error in GridFourDimModule: lattice dimension different from 4"
|
||||
<< std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << GridLogError
|
||||
<< "Error in GridFourDimModule: lattice dimension different from 4"
|
||||
<< std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
|
||||
typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
|
||||
using ObsBase::ObsBase; // for constructors
|
||||
|
||||
|
||||
|
||||
// acquire resource
|
||||
virtual void initialize(){
|
||||
this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
|
||||
@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
|
||||
PlaquetteMod(): ObsBase(NoParameters()){}
|
||||
};
|
||||
|
||||
|
||||
template < class Impl >
|
||||
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
|
||||
typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
|
||||
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
|
||||
typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
|
||||
using ObsBase::ObsBase; // for constructors
|
||||
|
||||
|
||||
|
||||
// acquire resource
|
||||
virtual void initialize(){
|
||||
this->ObservablePtr.reset(new TopologicalCharge<Impl>());
|
||||
this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
|
||||
}
|
||||
public:
|
||||
TopologicalChargeMod(): ObsBase(NoParameters()){}
|
||||
TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
|
||||
TopologicalChargeMod(): ObsBase(){}
|
||||
};
|
||||
|
||||
|
||||
|
||||
}// QCD temporarily here
|
||||
|
||||
|
||||
|
@ -33,9 +33,45 @@ directory
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
struct TopologySmearingParameters : Serializable {
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
|
||||
int, steps,
|
||||
float, step_size,
|
||||
int, meas_interval,
|
||||
float, maxTau);
|
||||
|
||||
TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
|
||||
steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
|
||||
|
||||
template < class ReaderClass >
|
||||
TopologySmearingParameters(Reader<ReaderClass>& Reader){
|
||||
read(Reader, "Smearing", *this);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct TopologyObsParameters : Serializable {
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
|
||||
int, interval,
|
||||
bool, do_smearing,
|
||||
TopologySmearingParameters, Smearing);
|
||||
|
||||
TopologyObsParameters(int interval = 1, bool smearing = false):
|
||||
interval(interval), Smearing(smearing){}
|
||||
|
||||
template <class ReaderClass >
|
||||
TopologyObsParameters(Reader<ReaderClass>& Reader){
|
||||
read(Reader, "TopologyMeasurement", *this);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// this is only defined for a gauge theory
|
||||
template <class Impl>
|
||||
class TopologicalCharge : public HmcObservable<typename Impl::Field> {
|
||||
TopologyObsParameters Pars;
|
||||
|
||||
public:
|
||||
// here forces the Impl to be of gauge fields
|
||||
// if not the compiler will complain
|
||||
@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
|
||||
// necessary for HmcObservable compatibility
|
||||
typedef typename Impl::Field Field;
|
||||
|
||||
TopologicalCharge(int interval = 1, bool do_smearing = false):
|
||||
Pars(interval, do_smearing){}
|
||||
|
||||
TopologicalCharge(TopologyObsParameters P):Pars(P){
|
||||
std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj,
|
||||
Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
|
||||
Real q = WilsonLoops<Impl>::TopologicalCharge(U);
|
||||
if (traj%Pars.interval == 0){
|
||||
// Smearing
|
||||
Field Usmear = U;
|
||||
int def_prec = std::cout.precision();
|
||||
|
||||
if (Pars.do_smearing){
|
||||
// using wilson flow by default here
|
||||
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
|
||||
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
|
||||
Real T0 = WF.energyDensityPlaquette(Usmear);
|
||||
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
|
||||
<< "T0 : [ " << traj << " ] "<< T0 << std::endl;
|
||||
}
|
||||
|
||||
int def_prec = std::cout.precision();
|
||||
Real q = WilsonLoops<Impl>::TopologicalCharge(Usmear);
|
||||
std::cout << GridLogMessage
|
||||
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
|
||||
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
|
||||
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
|
||||
|
||||
std::cout.precision(def_prec);
|
||||
std::cout.precision(def_prec);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
//#include <Grid/Grid.h>
|
||||
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
#ifndef GRID_QCD_GAUGE_FIX_H
|
||||
#define GRID_QCD_GAUGE_FIX_H
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
template <class Gimpl>
|
||||
class FourierAcceleratedGaugeFixer : public Gimpl {
|
||||
public:
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
typedef typename Gimpl::GaugeLinkField GaugeMat;
|
||||
@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -82,11 +82,11 @@ namespace Optimization {
|
||||
double tmp[2]={a,b};
|
||||
return vld1q_f64(tmp);
|
||||
}
|
||||
//Real double // N:tbc
|
||||
//Real double
|
||||
inline float64x2_t operator()(double a){
|
||||
return vdupq_n_f64(a);
|
||||
}
|
||||
//Integer // N:tbc
|
||||
//Integer
|
||||
inline uint32x4_t operator()(Integer a){
|
||||
return vdupq_n_u32(a);
|
||||
}
|
||||
@ -124,33 +124,32 @@ namespace Optimization {
|
||||
// Nils: Vset untested; not used currently in Grid at all;
|
||||
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
|
||||
struct Vset{
|
||||
// Complex float // N:ok
|
||||
// Complex float
|
||||
inline float32x4_t operator()(Grid::ComplexF *a){
|
||||
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
|
||||
return vld1q_f32(tmp);
|
||||
}
|
||||
// Complex double // N:ok
|
||||
// Complex double
|
||||
inline float64x2_t operator()(Grid::ComplexD *a){
|
||||
double tmp[2]={a[0].imag(),a[0].real()};
|
||||
return vld1q_f64(tmp);
|
||||
}
|
||||
// Real float // N:ok
|
||||
// Real float
|
||||
inline float32x4_t operator()(float *a){
|
||||
float tmp[4]={a[3],a[2],a[1],a[0]};
|
||||
return vld1q_f32(tmp);
|
||||
}
|
||||
// Real double // N:ok
|
||||
// Real double
|
||||
inline float64x2_t operator()(double *a){
|
||||
double tmp[2]={a[1],a[0]};
|
||||
return vld1q_f64(tmp);
|
||||
}
|
||||
// Integer // N:ok
|
||||
// Integer
|
||||
inline uint32x4_t operator()(Integer *a){
|
||||
return vld1q_dup_u32(a);
|
||||
}
|
||||
};
|
||||
|
||||
// N:leaving as is
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
@ -249,9 +248,9 @@ namespace Optimization {
|
||||
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
|
||||
|
||||
// no fma, use mul and add
|
||||
//float32x4_t r5;
|
||||
//r5 = vmulq_f32(r0, a);
|
||||
//return vaddq_f32(r4, r5);
|
||||
// float32x4_t r5;
|
||||
// r5 = vmulq_f32(r0, a);
|
||||
// return vaddq_f32(r4, r5);
|
||||
}
|
||||
// Complex double
|
||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||
@ -272,9 +271,9 @@ namespace Optimization {
|
||||
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
|
||||
|
||||
// no fma, use mul and add
|
||||
//float64x2_t r5;
|
||||
//r5 = vmulq_f64(r0, a);
|
||||
//return vaddq_f64(r4, r5);
|
||||
// float64x2_t r5;
|
||||
// r5 = vmulq_f64(r0, a);
|
||||
// return vaddq_f64(r4, r5);
|
||||
}
|
||||
};
|
||||
|
||||
@ -421,11 +420,6 @@ namespace Optimization {
|
||||
}
|
||||
}
|
||||
|
||||
// working, but no restriction on n
|
||||
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
|
||||
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
|
||||
|
||||
// restriction on n
|
||||
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
|
||||
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
|
||||
|
||||
@ -441,7 +435,7 @@ namespace Optimization {
|
||||
sb = vcvt_high_f32_f16(h);
|
||||
// there is no direct conversion from lower float32x4_t to float64x2_t
|
||||
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
|
||||
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
|
||||
// float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
|
||||
// workaround for clang
|
||||
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
|
||||
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
|
||||
@ -547,7 +541,7 @@ namespace Optimization {
|
||||
|
||||
|
||||
//Complex double Reduce
|
||||
template<> // N:by Boyle
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||
u128d conv; conv.v = in;
|
||||
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||
@ -562,9 +556,7 @@ namespace Optimization {
|
||||
//Integer Reduce
|
||||
template<>
|
||||
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
||||
// FIXME unimplemented
|
||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
||||
assert(0);
|
||||
return vaddvq_u32(in);
|
||||
}
|
||||
}
|
||||
|
||||
@ -603,4 +595,5 @@ namespace Optimization {
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
// Timing info; ugly; possibly temporary
|
||||
/////////////////////////////////////////
|
||||
double commtime;
|
||||
double mpi3synctime;
|
||||
double mpi3synctime_g;
|
||||
double shmmergetime;
|
||||
double gathertime;
|
||||
double gathermtime;
|
||||
double halogtime;
|
||||
@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
double splicetime;
|
||||
double nosplicetime;
|
||||
double calls;
|
||||
std::vector<double> comm_bytes_thr;
|
||||
std::vector<double> comm_time_thr;
|
||||
std::vector<double> comm_enter_thr;
|
||||
std::vector<double> comm_leave_thr;
|
||||
|
||||
////////////////////////////////////////
|
||||
// Stencil query
|
||||
@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
//////////////////////////////////////////
|
||||
// Comms packet queue for asynch thread
|
||||
//////////////////////////////////////////
|
||||
void CommunicateThreaded()
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
// must be called in parallel region
|
||||
int mythread = omp_get_thread_num();
|
||||
int nthreads = CartesianCommunicator::nCommThreads;
|
||||
#else
|
||||
int mythread = 0;
|
||||
int nthreads = 1;
|
||||
#endif
|
||||
if (nthreads == -1) nthreads = 1;
|
||||
if (mythread < nthreads) {
|
||||
comm_enter_thr[mythread] = usecond();
|
||||
for (int i = mythread; i < Packets.size(); i += nthreads) {
|
||||
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes,i);
|
||||
comm_bytes_thr[mythread] += bytes;
|
||||
}
|
||||
comm_leave_thr[mythread]= usecond();
|
||||
comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
|
||||
}
|
||||
}
|
||||
|
||||
void CollateThreads(void)
|
||||
{
|
||||
int nthreads = CartesianCommunicator::nCommThreads;
|
||||
double first=0.0;
|
||||
double last =0.0;
|
||||
|
||||
for(int t=0;t<nthreads;t++) {
|
||||
|
||||
double t0 = comm_enter_thr[t];
|
||||
double t1 = comm_leave_thr[t];
|
||||
comms_bytes+=comm_bytes_thr[t];
|
||||
|
||||
comm_enter_thr[t] = 0.0;
|
||||
comm_leave_thr[t] = 0.0;
|
||||
comm_time_thr[t] = 0.0;
|
||||
comm_bytes_thr[t]=0;
|
||||
|
||||
if ( first == 0.0 ) first = t0; // first is t0
|
||||
if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
|
||||
|
||||
if ( t1 > last ) last = t1; // max time seen
|
||||
|
||||
}
|
||||
commtime+= last-first;
|
||||
}
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
reqs.resize(Packets.size());
|
||||
commtime-=usecond();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes,i);
|
||||
}
|
||||
}
|
||||
|
||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
_grid->StencilSendToRecvFromComplete(reqs[i]);
|
||||
_grid->StencilSendToRecvFromComplete(reqs[i],i);
|
||||
}
|
||||
commtime+=usecond();
|
||||
}
|
||||
void Communicate(void)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
#pragma omp parallel
|
||||
{
|
||||
// must be called in parallel region
|
||||
int mythread = omp_get_thread_num();
|
||||
int maxthreads= omp_get_max_threads();
|
||||
int nthreads = CartesianCommunicator::nCommThreads;
|
||||
assert(nthreads <= maxthreads);
|
||||
|
||||
if (nthreads == -1) nthreads = 1;
|
||||
#else
|
||||
int mythread = 0;
|
||||
int nthreads = 1;
|
||||
#endif
|
||||
if (mythread < nthreads) {
|
||||
for (int i = mythread; i < Packets.size(); i += nthreads) {
|
||||
double start = usecond();
|
||||
comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes,i);
|
||||
comm_time_thr[mythread] += usecond() - start;
|
||||
}
|
||||
}
|
||||
#ifdef GRID_OMP
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||
Prepare();
|
||||
HaloGather(source,compress);
|
||||
CommunicateBegin(reqs);
|
||||
CommunicateComplete(reqs);
|
||||
// Concurrent
|
||||
//CommunicateBegin(reqs);
|
||||
//CommunicateComplete(reqs);
|
||||
// Sequential, possibly threaded
|
||||
Communicate();
|
||||
CommsMergeSHM(compress);
|
||||
CommsMerge(compress);
|
||||
}
|
||||
@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
template<class compressor>
|
||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
mpi3synctime_g-=usecond();
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
mpi3synctime_g+=usecond();
|
||||
|
||||
// conformable(source._grid,_grid);
|
||||
assert(source._grid==_grid);
|
||||
@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
CommsMerge(decompress,Mergers,Decompressions);
|
||||
}
|
||||
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
||||
mpi3synctime-=usecond();
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
mpi3synctime+=usecond();
|
||||
shmmergetime-=usecond();
|
||||
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
|
||||
shmmergetime+=usecond();
|
||||
}
|
||||
|
||||
template<class decompressor>
|
||||
@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances)
|
||||
: _permute_type(npoints), _comm_buf_size(npoints)
|
||||
: _permute_type(npoints),
|
||||
_comm_buf_size(npoints),
|
||||
comm_bytes_thr(npoints),
|
||||
comm_enter_thr(npoints),
|
||||
comm_leave_thr(npoints),
|
||||
comm_time_thr(npoints)
|
||||
{
|
||||
face_table_computed=0;
|
||||
_npoints = npoints;
|
||||
@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
void ZeroCounters(void) {
|
||||
gathertime = 0.;
|
||||
commtime = 0.;
|
||||
mpi3synctime=0.;
|
||||
mpi3synctime_g=0.;
|
||||
shmmergetime=0.;
|
||||
for(int i=0;i<_npoints;i++){
|
||||
comm_time_thr[i]=0;
|
||||
comm_bytes_thr[i]=0;
|
||||
comm_enter_thr[i]=0;
|
||||
comm_leave_thr[i]=0;
|
||||
}
|
||||
halogtime = 0.;
|
||||
mergetime = 0.;
|
||||
decompresstime = 0.;
|
||||
@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
||||
RealD NP = _grid->_Nprocessors;
|
||||
RealD NN = _grid->NodeCount();
|
||||
double t = 0;
|
||||
// if comm_time_thr is set they were all done in parallel so take the max
|
||||
// but add up the bytes
|
||||
int threaded = 0 ;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if ( comm_time_thr[i]>0.0 ) {
|
||||
threaded = 1;
|
||||
comms_bytes += comm_bytes_thr[i];
|
||||
if (t < comm_time_thr[i]) t = comm_time_thr[i];
|
||||
}
|
||||
}
|
||||
if (threaded) commtime += t;
|
||||
|
||||
_grid->GlobalSum(commtime); commtime/=NP;
|
||||
if ( calls > 0. ) {
|
||||
@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
|
||||
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||
}
|
||||
PRINTIT(mpi3synctime);
|
||||
PRINTIT(mpi3synctime_g);
|
||||
PRINTIT(shmmergetime);
|
||||
PRINTIT(splicetime);
|
||||
PRINTIT(nosplicetime);
|
||||
}
|
||||
|
@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
|
||||
strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
|
||||
const iVector<vtype,N> * __restrict__ rhs,
|
||||
const iScalar<mtype> * __restrict__ lhs){
|
||||
mult(ret,lhs,rhs);
|
||||
for(int c1=0;c1<N;c1++){
|
||||
mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -219,9 +219,15 @@ void Grid_init(int *argc,char ***argv)
|
||||
int MB;
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
|
||||
GridCmdOptionInt(arg,MB);
|
||||
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
|
||||
uint64_t MB64 = MB;
|
||||
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
|
||||
CartesianCommunicator::Hugepages = 1;
|
||||
}
|
||||
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
||||
Grid_debug_handler_init();
|
||||
}
|
||||
@ -304,6 +310,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
@ -317,7 +324,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
||||
@ -356,10 +363,15 @@ void Grid_init(int *argc,char ***argv)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
|
||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||
LebesgueOrder::UseLebesgueOrder=1;
|
||||
}
|
||||
|
||||
CartesianCommunicator::nCommThreads = -1;
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
|
||||
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
||||
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
||||
@ -374,10 +386,13 @@ void Grid_init(int *argc,char ***argv)
|
||||
Grid_default_latt,
|
||||
Grid_default_mpi);
|
||||
|
||||
std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
||||
std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
||||
if ( CartesianCommunicator::Hugepages) {
|
||||
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||
std::cout<<GridLogMessage<<"Grid Decomposition\n";
|
||||
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
||||
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
||||
@ -393,7 +408,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
|
||||
void Grid_finalize(void)
|
||||
{
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||
MPI_Finalize();
|
||||
Grid_unquiesce_nodes();
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user