mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Merge branch 'develop' into feature/hadrons

2017-08-24 17:05:45 +01:00
42 changed files with 1906 additions and 512 deletions

View File

@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@ -221,13 +226,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();

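For orientation: ThinQRfact computes a thin QR factorisation of the block residual, tmp = Q C with Q having orthonormal columns in the slice inner product, and m_Cinv = C^{-1}. A minimal dense analogue using Eigen's Householder QR (a sketch; the name thinQR and the dense types are illustrative, the real routine acts on Lattice fields sliced along Orthog):

#include <Eigen/Dense>

// Thin QR sketch: tmp = Q * C with Q orthonormal, C upper triangular.
void thinQR(const Eigen::MatrixXcd &tmp,
            Eigen::MatrixXcd &Q,
            Eigen::MatrixXcd &C,
            Eigen::MatrixXcd &Cinv)
{
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(tmp);
  const int n = tmp.cols();
  Q    = qr.householderQ() * Eigen::MatrixXcd::Identity(tmp.rows(), n);
  C    = qr.matrixQR().topRows(n).triangularView<Eigen::Upper>();
  Cinv = C.inverse();
}

BlockCGrQ then carries m_C and m_Cinv through steps 4-5 above (m_M = [D^dag Z]^{-1}, X updated by D M C).
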
View File

@ -11,7 +11,7 @@ int PointerCache::victim;
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return NULL;
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);

View File

@ -98,7 +98,14 @@ public:
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
#endif
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
@ -186,6 +193,13 @@ public:
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
size_type bytes = __n*sizeof(_Tp);
uint8_t *cp = (uint8_t *)ptr;
// One touch per 4k page, static OMP loop to catch same loop order
#pragma omp parallel for schedule(static)
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
void deallocate(pointer __p, size_type) {

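Both loops above rely on first-touch NUMA placement: on Linux a page is physically backed on the node of the thread that first writes it, so touching one byte per 4 KiB page under the same static OpenMP schedule as later compute loops co-locates data with the threads that use it. A standalone sketch of the idiom (assumes OpenMP; numa_aware_alloc is an illustrative name):

#include <cstdint>
#include <cstdlib>

// Allocate aligned memory and first-touch one byte per 4 KiB page in a
// static OpenMP loop, so pages fault in on their consumers' NUMA nodes.
void *numa_aware_alloc(std::size_t bytes)
{
  void *ptr = nullptr;
  if (posix_memalign(&ptr, 128, bytes) != 0) return nullptr;
  uint8_t *cp = static_cast<uint8_t *>(ptr);
#pragma omp parallel for schedule(static)
  for (std::size_t n = 0; n < bytes; n += 4096) cp[n] = 0;
  return ptr;
}
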
View File

@ -185,17 +185,18 @@ public:
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////

View File

@ -62,77 +62,81 @@ public:
return shift;
}
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid
) : GridBase(processor_grid)
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
//FIXME check for exact division
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
}
#endif

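The stride recursion above (_ostride[d] = _ostride[d-1]*_rdimensions[d-1], and likewise for _istride) is plain lexicographic addressing. A small sketch of how a coordinate maps to a site index with such strides:

#include <vector>

// Lexicographic site index: stride[0]=1, stride[d]=stride[d-1]*dim[d-1].
int lex_index(const std::vector<int> &coor,
              const std::vector<int> &dims,
              const std::vector<int> &stride)
{
  int idx = 0;
  for (std::size_t d = 0; d < coor.size(); d++)
    idx += stride[d] * (coor[d] % dims[d]);
  return idx;
}

This matches the oIndex/iIndex pattern of the red-black grid below, up to the halving of the checkerboard dimension.
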
View File

@ -131,21 +131,21 @@ public:
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
}
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
{
///////////////////////
// Grid information
///////////////////////
///////////////////////
// Grid information
///////////////////////
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim]==1);
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size()==_ndimension);
assert(processor_grid.size()==_ndimension);
assert(simd_layout.size()==_ndimension);
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
@ -153,114 +153,133 @@ public:
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask=checker_dim_mask;
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
if (d==_checker_dim) {
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
}
_ldimensions[d] = _gdimensions[d]/_processors[d];
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_checker_dim_mask = checker_dim_mask;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
assert(_rdimensions[d]>0);
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( checker_dim_mask[d] ) {
assert( (_rdimensions[d]&0x1) == 0 );
}
}
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // integer division; exactness asserted below
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for(int d=0;d<_ndimension;d++){
rvol=rvol * _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){
_checker_board[osite] = CheckerBoardFromOindex (osite);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
protected:
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
} else {
idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
}
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
};
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
} else {
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
}
}
return idx;
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
};
}
#endif

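The _checker_board table caches, per outer site, the parity that CheckerBoardFromOindex derives from the site coordinate. The underlying rule is coordinate-sum parity over the masked dimensions; a hedged sketch of that rule (the real function works from the outer index and reduced dimensions):

#include <vector>

// Checkerboard parity: 0 = even (red), 1 = odd (black), counting only
// dimensions included in the checkerboard mask.
int checkerboard(const std::vector<int> &coor, const std::vector<int> &mask)
{
  int cb = 0;
  for (std::size_t d = 0; d < coor.size(); d++)
    if (mask[d]) cb += coor[d];
  return cb & 0x1;
}
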
View File

@ -37,7 +37,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
//#include <zlib.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
@ -214,6 +217,25 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
// Try to force numa domain on the shm segment if we have numaif.h
#ifdef HAVE_NUMAIF_H
int status;
int flags=MPOL_MF_MOVE;
#ifdef KNL
int nodes=1; // numa domain == MCDRAM
// Find out if in SNC2,SNC4 mode ?
#else
int nodes=r; // numa domain == MPI ID
#endif
unsigned long count=1;
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)ptr );
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
ShmCommBufs[r] =ptr;
}

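move_pages(2) can only migrate pages that are already faulted in, which is why the loop writes each page (*cow_it = 1) before asking the kernel to move it. A standalone sketch of the same idiom (Linux-only; requires the numaif.h headers, and the target node is an assumption):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <numaif.h> // move_pages, MPOL_MF_MOVE

// Touch each 4 KiB page, then migrate it to the requested NUMA node.
void bind_pages_to_node(void *ptr, std::size_t size, int node)
{
  for (std::uint64_t page = 0; page < size; page += 4096) {
    void *pages = (void *)(page + (std::uint64_t)ptr);
    *(volatile std::uint64_t *)pages = 1; // fault the page in
    int status;
    long ierr = move_pages(0, 1, &pages, &node, &status, MPOL_MF_MOVE);
    if (ierr && page == 0) perror("move_pages failed");
  }
}
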
View File

@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
Lattice<vobj> Lslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -540,7 +544,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
vector_typeD rtmp = TensorRemove(tmp);
// vector_typeD rtmp = TensorRemove(tmp);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}
}}
@ -549,6 +554,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}

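The added loop completes the reduction across ranks: each node accumulated only its local slice sites, so every matrix element must be summed globally. Element-wise GlobalSum over the Nblock x Nblock matrix is equivalent to a single allreduce over its storage; a raw-MPI sketch (assumes std::complex<double> entries, as Eigen::MatrixXcd provides):

#include <mpi.h>
#include <Eigen/Dense>

// Sum a node-local complex matrix over all ranks, in place.
void global_sum(Eigen::MatrixXcd &mat, MPI_Comm comm)
{
  MPI_Allreduce(MPI_IN_PLACE, mat.data(), (int)mat.size(),
                MPI_C_DOUBLE_COMPLEX, MPI_SUM, comm);
}
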
View File

@ -98,35 +98,39 @@ class BinaryIO {
NerscChecksum(grid,scalardata,nersc_csum);
}
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
uint64_t lsites = grid->lSites();
if (fbuf.size() == 1)
{
lsites = 1;
}
#pragma omp parallel
{
uint32_t nersc_csum_thr=0;
#pragma omp parallel
{
uint32_t nersc_csum_thr = 0;
#pragma omp for
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
for(uint64_t j=0;j<size32;j++){
nersc_csum_thr=nersc_csum_thr+site_buf[j];
}
#pragma omp for
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
#pragma omp critical
#pragma omp critical
{
nersc_csum += nersc_csum_thr;
nersc_csum += nersc_csum_thr;
}
}
}
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@ -266,7 +270,7 @@ class BinaryIO {
grid->Barrier();
GridStopWatch timer;
GridStopWatch bstimer;
nersc_csum=0;
scidac_csuma=0;
scidac_csumb=0;
@ -362,18 +366,22 @@ class BinaryIO {
#else
assert(0);
#endif
} else {
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
std::ifstream fin;
fin.open(file,std::ios::binary|std::ios::in);
if ( control & BINARYIO_MASTER_APPEND ) {
fin.seekg(-sizeof(fobj),fin.end);
} else {
fin.seekg(offset+myrank*lsites*sizeof(fobj));
}
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
fin.close();
} else {
std::cout << GridLogMessage << "C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
}
else
{
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
@ -405,30 +413,78 @@ class BinaryIO {
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
std::cout << GridLogMessage << "Checking for errors" << std::endl;
if (ierr != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else
assert(0);
#endif
} else {
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
fout.seekp(0,fout.end);
} else {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
}
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
fout.close();
}
timer.Stop();
}
}
timer.Stop();
}
std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read ";
@ -442,11 +498,14 @@ class BinaryIO {
//////////////////////////////////////////////////////////////////////////////
// Safety check
//////////////////////////////////////////////////////////////////////////////
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
// if the data size is 1 we do not want to sum over the MPI ranks
if (iodata.size() != 1){
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
}
/////////////////////////////////////////////////////////////////////////////
@ -546,9 +605,9 @@ class BinaryIO {
int gsites = grid->gSites();
int lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
uint32_t scidac_csumb_tmp;
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer;

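The NerscChecksum change above is the classic per-thread partial reduction: each thread sums into a private accumulator and the partials are merged under a critical section. A minimal standalone version of the pattern (assumes OpenMP):

#include <cstdint>
#include <vector>

// Parallel additive checksum with thread-private partials.
uint32_t checksum32(const std::vector<uint32_t> &words)
{
  uint32_t csum = 0;
#pragma omp parallel
  {
    uint32_t csum_thr = 0;
#pragma omp for
    for (int64_t i = 0; i < (int64_t)words.size(); i++)
      csum_thr += words[i];
#pragma omp critical
    csum += csum_thr;
  }
  return csum;
}

With modern OpenMP this collapses to a single parallel for with reduction(+:csum); the explicit form generalises to the scidac checksums, which merge partials with XOR (cf. GlobalXOR below).
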
View File

@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },

View File

@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
{
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=2;
conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid);
@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_FourDimGrid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
/////////////////////////////////////////////////////////////////////////
// Implement the general interface. Here we use SAME mass on all slices

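The DhopTotalTime/DhopCommTime/DhopComputeTime counters use the -=/+= usecond() idiom: subtracting the entry timestamp and adding the exit timestamp leaves the elapsed interval accumulated in place, with no per-call start variable. A sketch with a stand-in timer (usecond here approximates Grid's microsecond clock):

#include <chrono>

// Stand-in for Grid's usecond(): monotonic wall clock in microseconds.
static double usecond(void)
{
  using namespace std::chrono;
  return (double)duration_cast<microseconds>(
      steady_clock::now().time_since_epoch()).count();
}

double DhopCommTime = 0.0;

void halo_exchange_timed(void)
{
  DhopCommTime -= usecond(); // subtract entry time
  // ... communication ...
  DhopCommTime += usecond(); // add exit time: net effect += elapsed
}
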
View File

@ -55,6 +55,16 @@ namespace QCD {
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -93,6 +93,8 @@ class ScalarImplTypes {
class ScalarAdjMatrixImplTypes {
public:
typedef S Simd;
typedef QCD::SU<N> Group;
template <typename vtype>
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
template <typename vtype>
@ -108,7 +110,7 @@ class ScalarImplTypes {
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
QCD::SU<N>::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
}
static inline Field projectForce(Field& P) {return P;}
@ -122,11 +124,11 @@ class ScalarImplTypes {
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
QCD::SU<N>::LieRandomize(pRNG, U);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
QCD::SU<N>::LieRandomize(pRNG, U, 0.01);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {

View File

@ -81,7 +81,7 @@ namespace Grid {
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p*p;
action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared;
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
for (int mu = 0; mu < Ndim; mu++) {
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
@ -98,7 +98,7 @@ namespace Grid {
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
} else {
action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
}
} else {
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
@ -113,7 +113,7 @@ namespace Grid {
virtual void deriv(const Field &p, Field &force) {
assert(p._grid->Nd() == Ndim);
force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p;
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);

View File

@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
std::cout << "Reading HMC\n";
std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}

View File

@ -165,7 +165,7 @@ class HMCResourceManager {
// Grids
//////////////////////////////////////////////////////////////
void AddGrid(std::string s, GridModule& M) {
void AddGrid(const std::string s, GridModule& M) {
// Check for name clashes
auto search = Grids.find(s);
if (search != Grids.end()) {
@ -174,14 +174,24 @@ class HMCResourceManager {
exit(1);
}
Grids[s] = std::move(M);
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
Grids[s].show_full_decomposition();
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
}
// Add a named grid set, 4d shortcut
void AddFourDimGrid(std::string s) {
void AddFourDimGrid(const std::string s) {
GridFourDimModule<vComplex> Mod;
AddGrid(s, Mod);
}
// Add a named grid set, 4d shortcut + tweak simd lanes
void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
GridFourDimModule<vComplex> Mod(simd_decomposition);
AddGrid(s, Mod);
}
GridCartesian* GetCartesian(std::string s = "") {
@ -253,6 +263,7 @@ class HMCResourceManager {
template<class T, class... Types>
void AddObservable(Types&&... Args){
ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
ObservablesList.back()->print_parameters();
}
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
@ -297,4 +308,4 @@ private:
}
}
#endif // HMC_RESOURCE_MANAGER_H
#endif // HMC_RESOURCE_MANAGER_H

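AddGrid is a guarded registry insert: look the name up first, fail loudly on a clash, then move the module into the map so the manager owns it. The same shape in a generic sketch:

#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Refuse duplicate names; take ownership of the module by move.
template <class Module>
void add_named(std::map<std::string, Module> &registry,
               const std::string &name, Module &&mod)
{
  if (registry.find(name) != registry.end()) {
    std::cout << "Error: resource '" << name << "' already exists\n";
    std::exit(1);
  }
  registry[name] = std::move(mod);
}
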
View File

@ -33,28 +33,29 @@ directory
namespace Grid {
// Resources
// Modules for grids
// Modules for grids
// Introduce another namespace HMCModules?
class GridModuleParameters: Serializable{
class GridModuleParameters: Serializable{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
std::string, lattice,
std::string, mpi);
std::vector<int> getLattice(){return strToVec<int>(lattice);}
std::vector<int> getMpi() {return strToVec<int>(mpi);}
std::vector<int> getLattice() const {return strToVec<int>(lattice);}
std::vector<int> getMpi() const {return strToVec<int>(mpi);}
void check(){
if (getLattice().size() != getMpi().size()) {
std::cout << GridLogError
void check() const {
if (getLattice().size() != getMpi().size() ) {
std::cout << GridLogError
<< "Error in GridModuleParameters: lattice and mpi dimensions "
"do not match"
<< std::endl;
exit(1);
}
}
}
template <class ReaderClass>
GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@ -75,51 +76,94 @@ private:
// Lower level class
class GridModule {
public:
GridCartesian* get_full() {
GridCartesian* get_full() {
std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
return grid_.get(); }
GridRedBlackCartesian* get_rb() {
GridRedBlackCartesian* get_rb() {
std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
return rbgrid_.get(); }
void set_full(GridCartesian* grid) { grid_.reset(grid); }
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
void show_full_decomposition(){ grid_->show_decomposition(); }
void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
protected:
std::unique_ptr<GridCartesian> grid_;
std::unique_ptr<GridRedBlackCartesian> rbgrid_;
};
////////////////////////////////////
// Classes for the user
////////////////////////////////////
// Note: the space time grid should be out of the QCD namespace
template< class vector_type>
class GridFourDimModule : public GridModule {
public:
GridFourDimModule() {
template <class vector_type>
class GridFourDimModule : public GridModule
{
public:
GridFourDimModule()
{
using namespace QCD;
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
GridDefaultLatt(),
GridDefaultSimd(4, vector_type::Nsimd()),
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
GridFourDimModule(GridModuleParameters Params) {
GridFourDimModule(const std::vector<int> tweak_simd)
{
using namespace QCD;
if (tweak_simd.size() != 4)
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD size different from 4"
<< std::endl;
exit(1);
}
// Checks that the product agrees with the expectation
int simd_product = 1;
for (auto &n : tweak_simd)
simd_product *= n;
std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Product: " << simd_product << std::endl;
if (simd_product == vector_type::Nsimd())
{
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(),
tweak_simd,
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD lanes must sum to "
<< vector_type::Nsimd()
<< std::endl;
}
}
GridFourDimModule(const GridModuleParameters Params)
{
using namespace QCD;
Params.check();
std::vector<int> lattice_v = Params.getLattice();
std::vector<int> mpi_v = Params.getMpi();
if (lattice_v.size() == 4) {
if (lattice_v.size() == 4)
{
set_full(SpaceTimeGrid::makeFourDimGrid(
lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
lattice_v,
GridDefaultSimd(4, vector_type::Nsimd()),
mpi_v));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
} else {
std::cout << GridLogError
<< "Error in GridFourDimModule: lattice dimension different from 4"
<< std::endl;
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: lattice dimension different from 4"
<< std::endl;
exit(1);
}
}

View File

@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
PlaquetteMod(): ObsBase(NoParameters()){}
};
template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new TopologicalCharge<Impl>());
this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
}
public:
TopologicalChargeMod(): ObsBase(NoParameters()){}
TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
TopologicalChargeMod(): ObsBase(){}
};
}// QCD temporarily here

View File

@ -33,9 +33,45 @@ directory
namespace Grid {
namespace QCD {
struct TopologySmearingParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
int, steps,
float, step_size,
int, meas_interval,
float, maxTau);
TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
template < class ReaderClass >
TopologySmearingParameters(Reader<ReaderClass>& Reader){
read(Reader, "Smearing", *this);
}
};
struct TopologyObsParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
int, interval,
bool, do_smearing,
TopologySmearingParameters, Smearing);
TopologyObsParameters(int interval = 1, bool smearing = false):
interval(interval), Smearing(smearing){}
template <class ReaderClass >
TopologyObsParameters(Reader<ReaderClass>& Reader){
read(Reader, "TopologyMeasurement", *this);
}
};
// this is only defined for a gauge theory
template <class Impl>
class TopologicalCharge : public HmcObservable<typename Impl::Field> {
TopologyObsParameters Pars;
public:
// here forces the Impl to be of gauge fields
// if not the compiler will complain
@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
TopologicalCharge(int interval = 1, bool do_smearing = false):
Pars(interval, do_smearing){}
TopologicalCharge(TopologyObsParameters P):Pars(P){
std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
}
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
Real q = WilsonLoops<Impl>::TopologicalCharge(U);
if (traj%Pars.interval == 0){
// Smearing
Field Usmear = U;
int def_prec = std::cout.precision();
if (Pars.do_smearing){
// using wilson flow by default here
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
Real T0 = WF.energyDensityPlaquette(Usmear);
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "T0 : [ " << traj << " ] "<< T0 << std::endl;
}
int def_prec = std::cout.precision();
Real q = WilsonLoops<Impl>::TopologicalCharge(Usmear);
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
std::cout.precision(def_prec);
std::cout.precision(def_prec);
}
}
};

View File

@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
if (maxTau - taus < epsilon){
epsilon = maxTau-taus;
}
std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U._grid);
GaugeField Zprime(U._grid);
GaugeField tmp(U._grid), Uprime(U._grid);
@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
// adjust integration step
taus += epsilon;
std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
}
@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
out = in;
for (unsigned int step = 1; step <= Nstep; step++) {
auto start = std::chrono::high_resolution_clock::now();
std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
evolve_step(out);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
unsigned int step = 0;
do{
step++;
std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "

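The controller epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.) is the standard step-size update for an embedded integrator whose local error estimate diff = ||U - U'|| is third order; in formulas, with tolerance tol = 10^{-4} and safety factor 0.95:

\epsilon_{\mathrm{new}} = 0.95\,\epsilon \left( \frac{\mathrm{tol}}{\| U - U' \|} \right)^{1/3}, \qquad \tau \leftarrow \tau + \epsilon,

and the final partial step is clipped to epsilon = maxTau - taus so the flow lands exactly on maxTau.
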
View File

@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
//#include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;
#ifndef GRID_QCD_GAUGE_FIX_H
#define GRID_QCD_GAUGE_FIX_H
namespace Grid {
namespace QCD {
template <class Gimpl>
class FourierAcceleratedGaugeFixer : public Gimpl {
public:
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeLinkField GaugeMat;
@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
}
};
}
}
#endif

View File

@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
for (int a = 0; a < AdjointDimension; a++) {
generator(a, Ta);
auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
pokeColour(h_out, tmp, a);
pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
}
}

View File

@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
Hdf5Type<unsigned int>::type());
}
void Hdf5Reader::push(const std::string &s)
bool Hdf5Reader::push(const std::string &s)
{
group_ = group_.openGroup(s);
path_.push_back(s);
return true;
}
void Hdf5Reader::pop(void)

View File

@ -54,7 +54,7 @@ namespace Grid
public:
Hdf5Reader(const std::string &fileName);
virtual ~Hdf5Reader(void) = default;
void push(const std::string &s);
bool push(const std::string &s);
void pop(void);
template <typename U>
void readDefault(const std::string &s, U &output);

View File

@ -1,13 +1,14 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_neon.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
Experimental - Using intrinsics - DEVELOPING!
/*
ARMv8 NEON intrinsics layer by
Nils Meyer <nils.meyer@ur.de>,
University of Regensburg, Germany
SFB/TRR55
*/
// Time-stamp: <2015-07-10 17:45:09 neo>
//----------------------------------------------------------------------
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif
#include "Grid_generic_types.h"
#include <arm_neon.h>
// ARMv8 supports double precision
namespace Grid {
namespace Optimization {
template<class vtype>
@ -46,16 +53,20 @@ namespace Optimization {
float32x4_t f;
vtype v;
};
union u128f {
float32x4_t v;
float f[4];
};
union u128d {
float64x2_t v;
double f[4];
double f[2];
};
// half precision
union u128h {
float16x8_t v;
uint16_t f[8];
};
struct Vsplat{
//Complex float
inline float32x4_t operator()(float a, float b){
@ -64,31 +75,31 @@ namespace Optimization {
}
// Real float
inline float32x4_t operator()(float a){
return vld1q_dup_f32(&a);
return vdupq_n_f32(a);
}
//Complex double
inline float32x4_t operator()(double a, double b){
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
return vld1q_f32(tmp);
inline float64x2_t operator()(double a, double b){
double tmp[2]={a,b};
return vld1q_f64(tmp);
}
//Real double
inline float32x4_t operator()(double a){
return vld1q_dup_f32(&a);
//Real double // N:tbc
inline float64x2_t operator()(double a){
return vdupq_n_f64(a);
}
//Integer
//Integer // N:tbc
inline uint32x4_t operator()(Integer a){
return vld1q_dup_u32(&a);
return vdupq_n_u32(a);
}
};
struct Vstore{
//Float
//Float
inline void operator()(float32x4_t a, float* F){
vst1q_f32(F, a);
}
//Double
inline void operator()(float32x4_t a, double* D){
vst1q_f32((float*)D, a);
inline void operator()(float64x2_t a, double* D){
vst1q_f64(D, a);
}
//Integer
inline void operator()(uint32x4_t a, Integer* I){
@ -97,54 +108,54 @@ namespace Optimization {
};
struct Vstream{
//Float
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float // N:generic
inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float));
}
//Double
inline void operator()(double * a, float32x4_t b){
//Double // N:generic
inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double));
}
};
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float
// Complex float // N:ok
inline float32x4_t operator()(Grid::ComplexF *a){
float32x4_t foo;
return foo;
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp);
}
// Complex double
inline float32x4_t operator()(Grid::ComplexD *a){
float32x4_t foo;
return foo;
// Complex double // N:ok
inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp);
}
// Real float
// Real float // N:ok
inline float32x4_t operator()(float *a){
float32x4_t foo;
return foo;
float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp);
}
// Real double
inline float32x4_t operator()(double *a){
float32x4_t foo;
return foo;
// Real double // N:ok
inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp);
}
// Integer
// Integer // N:ok
inline uint32x4_t operator()(Integer *a){
uint32x4_t foo;
return foo;
return vld1q_dup_u32(a);
}
};
// N:leaving as is
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
@ -184,26 +195,98 @@ namespace Optimization {
}
};
struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b);
}
};
struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b);
}
};
struct Div{
// Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b);
}
};
struct MultComplex{
// Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t foo;
return foo;
float32x4_t r0, r1, r2, r3, r4;
// a = ar ai Ar Ai
// b = br bi Br Bi
// collect real/imag part, negate bi and Bi
r0 = vtrn1q_f32(b, b); // br br Br Br
r1 = vnegq_f32(b); // -br -bi -Br -Bi
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
// the fun part
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
// fma(a,b,c) = a+b*c
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
//float32x4_t r5;
//r5 = vmulq_f32(r0, a);
//return vaddq_f32(r4, r5);
}
// Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float32x4_t foo;
return foo;
float64x2_t r0, r1, r2, r3, r4;
// b = br bi
// collect real/imag part, negate bi
r0 = vtrn1q_f64(b, b); // br br
r1 = vnegq_f64(b); // -br -bi
r2 = vtrn2q_f64(b, r1); // bi -bi
// the fun part
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
// fma(a,b,c) = a+b*c
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
//float64x2_t r5;
//r5 = vmulq_f64(r0, a);
//return vaddq_f64(r4, r5);
}
};
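The float path above builds the complex product from one transpose pair, a negation, a 64-bit reverse, and a single fused multiply-add. A small AArch64-only harness for checking the sequence against std::complex (illustrative, not part of the library):

#include <arm_neon.h>
#include <complex>
#include <cstdio>

int main()
{
  std::complex<float> a(1.f, 2.f), b(3.f, 4.f), A(-1.f, 0.5f), B(2.f, -3.f);
  float av[4] = {a.real(), a.imag(), A.real(), A.imag()};
  float bv[4] = {b.real(), b.imag(), B.real(), B.imag()};
  float32x4_t va = vld1q_f32(av), vb = vld1q_f32(bv);

  float32x4_t r0 = vtrn1q_f32(vb, vb);    // br br Br Br
  float32x4_t r1 = vnegq_f32(vb);         // -br -bi -Br -Bi
  float32x4_t r2 = vtrn2q_f32(vb, r1);    // bi -bi Bi -Bi
  float32x4_t r3 = vmulq_f32(r2, va);     // bi*ar -bi*ai ...
  float32x4_t r4 = vrev64q_f32(r3);       // -bi*ai bi*ar ...
  float32x4_t rr = vfmaq_f32(r4, r0, va); // ar*br-ai*bi ai*br+ar*bi ...

  float out[4]; vst1q_f32(out, rr);
  std::complex<float> ab = a * b, AB = A * B;
  std::printf("neon (%g,%g)(%g,%g) ref (%g,%g)(%g,%g)\n",
              out[0], out[1], out[2], out[3],
              ab.real(), ab.imag(), AB.real(), AB.imag());
  return 0;
}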
struct Mult{
// Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
return vaddq_f32(vmulq_f32(b,c),a);
//return vaddq_f32(vmulq_f32(b,c),a);
return vfmaq_f32(a, b, c);
}
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
return vaddq_f64(vmulq_f64(b,c),a);
//return vaddq_f64(vmulq_f64(b,c),a);
return vfmaq_f64(a, b, c);
}
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b);
@ -221,89 +304,275 @@ namespace Optimization {
struct Conj{
// Complex single
inline float32x4_t operator()(float32x4_t in){
return in;
// ar ai br bi -> ar -ai br -bi
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(in, r1); // ar -ai br -bi
}
// Complex double
//inline float32x4_t operator()(float32x4_t in){
// return 0;
//}
inline float64x2_t operator()(float64x2_t in){
float64x2_t r0, r1;
r0 = vextq_f64(in, in, 1); // ai ar
r1 = vnegq_f64(r0); // -ai -ar
return vextq_f64(r0, r1, 1); // ar -ai
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
return in;
// ar ai br bi -> ai -ar bi -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(in); // ai ar bi br
return vtrn1q_f32(r1, r0); // ai -ar bi -br
}
//Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// return in;
//}
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> b -ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1);
}
};
struct TimesI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
//need shuffle
return in;
// ar ai br bi -> -ai ar -bi br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(r1, in); // -ai ar -bi br
}
//Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// return 0;
//}
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> -b ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1);
}
};
struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB
return vextq_f32(in, in, 2);
};
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
// AB CD -> BA DC
return vrev64q_f32(in);
};
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
// AB -> BA
return vextq_f64(in, in, 1);
};
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
return in;
};
};
struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){
case 0: // AB CD -> AB CD
return tRotate<0>(in);
break;
case 1: // AB CD -> BC DA
return tRotate<1>(in);
break;
case 2: // AB CD -> CD AB
return tRotate<2>(in);
break;
case 3: // AB CD -> DA BC
return tRotate<3>(in);
break;
default: assert(0);
}
}
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
switch(n){
case 0: // AB -> AB
return tRotate<0>(in);
break;
case 1: // AB -> BA
return tRotate<1>(in);
break;
default: assert(0);
}
}
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
};
struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a);
return vcvt_high_f16_f32(h, b);
}
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
sb = vcvt_high_f32_f16(h);
// the lower half of the float16x8_t must be moved to the high lanes first
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
sa = vcvt_high_f32_f16(h1);
}
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
float32x2_t s = vcvt_f32_f64(a);
return vcvt_high_f32_f64(s, b);
}
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
b = vcvt_high_f64_f32(s);
// there is no direct conversion from lower float32x4_t to float64x2_t
float32x4_t s1 = vextq_f32(s, s, 2);
a = vcvt_high_f64_f32(s1);
}
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
float32x4_t s1 = DtoS(a, b);
float32x4_t s2 = DtoS(c, d);
return StoH(s1, s2);
}
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
float32x4_t s1, s2;
HtoS(h, s1, s2);
StoD(s1, a, b);
StoD(s2, c, d);
}
};
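// Illustrative sketch (assumption, not in the original header): a double ->
// half -> double round trip through DtoH/HtoD. demo_precision_change and the
// constants are hypothetical; the values are exactly representable in fp16,
// so the round trip is lossless.
#if 0 // standalone AArch64 illustration only
static void demo_precision_change(void) {
  float64x2_t a = vdupq_n_f64(1.5),   b = vdupq_n_f64(-2.25);
  float64x2_t c = vdupq_n_f64(0.125), d = vdupq_n_f64(3.0);
  float16x8_t h = PrecisionChange::DtoH(a, b, c, d); // pack four doubles into eight halves
  float64x2_t ra, rb, rc, rd;
  PrecisionChange::HtoD(h, ra, rb, rc, rd);          // ra==a, rb==b, rc==c, rd==d
}
#endif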
//////////////////////////////////////////////
// Exchange support
struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH
// z: CDAB
float32x4_t z = vextq_f32(in1, in1, 2);
// out1: ABEF
out1 = vextq_f32(z, in2, 2);
// z: GHEF
z = vextq_f32(in2, in2, 2);
// out2: CDGH
out2 = vextq_f32(in1, z, 2);
};
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: AECG
// in2: EFGH -> out2: BFDH
out1 = vtrn1q_f32(in1, in2);
out2 = vtrn2q_f32(in1, in2);
};
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
// double precision
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
// in1: AB -> out1: AC
// in2: CD -> out2: BD
out1 = vzip1q_f64(in1, in2);
out2 = vzip2q_f64(in1, in2);
};
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
};
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
  // editorial assumption: dispatch to the static Permute functions above
  // (an empty body here would make Gpermute below a silent no-op)
  a = (perm==0) ? Permute::Permute0(b) : (perm==1) ? Permute::Permute1(b)
    : (perm==2) ? Permute::Permute2(b) : Permute::Permute3(b);
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
return vaddvq_f32(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
// editorial sketch (assumption): across-vector add of the four 32-bit lanes
return vaddvq_u32(in);
}
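// Illustrative sketch (hypothetical inputs): expected reductions.
//   Reduce<RealF,float32x4_t>()({1,2,3,4})    == 10
//   Reduce<ComplexF,float32x4_t>()({1,2,3,4}) == (1+2i)+(3+4i) = 4+6i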
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type
@@ -312,13 +581,6 @@ namespace Grid {
inline void prefetch_HINT_T0(const char *ptr){};
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
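// Illustrative sketch: for a hypothetical SIMD wrapper y,b with NEON payload .v,
//   Gpermute(y, b, 1);  // y.v = Optimization::Permute::Permute1(b.v)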
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
@@ -326,16 +588,19 @@ namespace Grid {
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}
}

View File

@@ -53,7 +53,7 @@ directory
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX

View File

@@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
int LebesgueOrder::UseLebesgueOrder;
#ifdef KNL
std::vector<int> LebesgueOrder::Block({8,2,2,2});
#else
std::vector<int> LebesgueOrder::Block({2,2,2,2});
#endif
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
n--; // 1000 0011 --> 1000 0010
n |= n >> 1; // 1000 0010 | 0100 0001 = 1100 0011
@@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
if ( Block[0]==0) ZGraph();
else if ( Block[1]==0) NoBlocking();
else CartesianBlocking();
if (0) {
std::cout << "Thread Interleaving"<<std::endl;
ThreadInterleave();
}
}
void LebesgueOrder::ThreadInterleave(void)
{
std::vector<IndexInteger> reorder = _LebesgueReorder;
std::vector<IndexInteger> throrder;
int vol = _LebesgueReorder.size();
int threads = GridThread::GetThreads();
int blockbits=3;
int blocklen = 8;
int msk = 0x7;
for(int t=0;t<threads;t++){
for(int ss=0;ss<vol;ss++){
if ( ( ss >> blockbits) % threads == t ) {
throrder.push_back(reorder[ss]);
}
}
}
_LebesgueReorder = throrder;
}
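// Illustrative sketch (hypothetical numbers): with threads=2 and blockbits=3
// the volume is cut into 8-site blocks and thread t keeps blocks whose index
// satisfies index%2==t; e.g. vol=32 regroups the block order to 0,2,1,3,
// i.e. sites 0-7,16-23 for thread 0 followed by 8-15,24-31 for thread 1.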
void LebesgueOrder::NoBlocking(void)
{
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;

View File

@@ -70,6 +70,8 @@ namespace Grid {
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims);
void ThreadInterleave(void);
private:
std::vector<IndexInteger> _LebesgueReorder;

View File

@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
const iVector<vtype,N> * __restrict__ rhs,
const iScalar<mtype> * __restrict__ lhs){
for(int c1=0;c1<N;c1++){
mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
}
}
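// Illustrative sketch (hypothetical types): this overload right-multiplies
// each component by the scalar, preserving operand order for non-commutative
// internal types, e.g.
//   iVector<ComplexF,3> v, r;  iScalar<ComplexF> s;
//   mult(&r, &v, &s);          // r[c] = v[c] * s for c = 0,1,2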

View File

@@ -377,7 +377,7 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Decomposition\n";
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;