mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Build a list of what's on the surface
This commit is contained in:
parent
916e9e1d3e
commit
56277a11c8
7
TODO
7
TODO
@ -2,21 +2,20 @@ TODO:
|
|||||||
---------------
|
---------------
|
||||||
|
|
||||||
Peter's work list:
|
Peter's work list:
|
||||||
1)- Half-precision comms <-- started -- SIMD is prepared
|
|
||||||
2)- Precision conversion and sort out localConvert <--
|
2)- Precision conversion and sort out localConvert <--
|
||||||
|
|
||||||
3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
|
3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
|
||||||
4)- Binary I/O speed up & x-strips
|
4)- Binary I/O speed up & x-strips
|
||||||
|
|
||||||
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
|
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
|
||||||
-- Physical propagator interface
|
-- Physical propagator interface
|
||||||
-- Conserved currents
|
-- Conserved currents
|
||||||
-- GaugeFix into central location
|
-- GaugeFix into central location
|
||||||
|
|
||||||
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
|
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
|
||||||
-- HDCR resume
|
-- HDCR resume
|
||||||
|
|
||||||
Recent DONE
|
Recent DONE
|
||||||
|
-- Cut down the exterior overhead <-- DONE
|
||||||
|
-- Interior legs from SHM comms <-- DONE
|
||||||
|
-- Half-precision comms <-- DONE
|
||||||
-- Merge high precision reduction into develop
|
-- Merge high precision reduction into develop
|
||||||
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
|
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
|
||||||
-- slice* linalg routines for multiRHS, BlockCG
|
-- slice* linalg routines for multiRHS, BlockCG
|
||||||
|
@ -242,6 +242,7 @@ public:
|
|||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||||
|
|
||||||
std::vector<int> same_node;
|
std::vector<int> same_node;
|
||||||
|
std::vector<int> surface_list;
|
||||||
|
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
@ -249,11 +250,33 @@ public:
|
|||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
||||||
same_node(npoints)
|
same_node(npoints)
|
||||||
{
|
{
|
||||||
assert(npoints==8);// or 10 if do naive DWF 5d red black ?
|
surface_list.resize(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void BuildSurfaceList(int Ls,int vol4){
|
||||||
|
|
||||||
|
// find same node for SHM
|
||||||
|
// Here we know the distance is 1 for WilsonStencil
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
same_node[point] = this->SameNode(point);
|
||||||
|
std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int site = 0 ;site< vol4;site++){
|
||||||
|
int local = 1;
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
||||||
|
local = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(local == 0) {
|
||||||
|
surface_list.push_back(site);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template < class compressor>
|
template < class compressor>
|
||||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
|
@ -117,6 +117,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
|
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
|
|
||||||
|
// Build lists of exterior only nodes
|
||||||
|
int LLs = FourDimGrid._rdimensions[0];
|
||||||
|
int vol4;
|
||||||
|
vol4=FourDimGrid.oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
vol4=FourDimRedBlackGrid.oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
|
||||||
|
<<" " << StencilEven.surface_list.size()<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -406,6 +419,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
// Load imbalance alert. Should use dynamic schedule OMP for loop
|
// Load imbalance alert. Should use dynamic schedule OMP for loop
|
||||||
// Perhaps create a list of only those sites with face work, and
|
// Perhaps create a list of only those sites with face work, and
|
||||||
// load balance process the list.
|
// load balance process the list.
|
||||||
|
#if 1
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
@ -422,6 +437,27 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
|
else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
|
||||||
if ( me==0 ) DhopComputeTime2+=usecond();
|
if ( me==0 ) DhopComputeTime2+=usecond();
|
||||||
}// end parallel region
|
}// end parallel region
|
||||||
|
#else
|
||||||
|
DhopComputeTime2-=usecond();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
#pragma omp parallel for schedule(static,1)
|
||||||
|
for (int ss = 0; ss < st.surface_list.size(); ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
int sF = LLs * sU;
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#pragma omp parallel for schedule(static,1)
|
||||||
|
for (int ss = 0; ss < st.surface_list.size(); ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
int sF = LLs * sU;
|
||||||
|
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime2+=usecond();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
DhopComputeTime2-=usecond();
|
DhopComputeTime2-=usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
|
@ -473,12 +473,12 @@ namespace Optimization {
|
|||||||
#define USE_FP16
|
#define USE_FP16
|
||||||
struct PrecisionChange {
|
struct PrecisionChange {
|
||||||
static inline __m256i StoH (__m256 a,__m256 b) {
|
static inline __m256i StoH (__m256 a,__m256 b) {
|
||||||
__m256 h;
|
__m256i h;
|
||||||
#ifdef USE_FP16
|
#ifdef USE_FP16
|
||||||
__m128i ha = _mm256_cvtps_ph(a,0);
|
__m128i ha = _mm256_cvtps_ph(a,0);
|
||||||
__m128i hb = _mm256_cvtps_ph(b,0);
|
__m128i hb = _mm256_cvtps_ph(b,0);
|
||||||
h = _mm256_castps128_ps256(ha);
|
h =(__m256i) _mm256_castps128_ps256((__m128)ha);
|
||||||
h = _mm256_insertf128_ps(h,hb,1);
|
h =(__m256i) _mm256_insertf128_ps((__m256)h,(__m128)hb,1);
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
@ -486,8 +486,8 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
|
static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) {
|
||||||
#ifdef USE_FP16
|
#ifdef USE_FP16
|
||||||
sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0));
|
sa = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,0));
|
||||||
sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1));
|
sb = _mm256_cvtph_ps((__m128i)_mm256_extractf128_ps((__m256)h,1));
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
|
@ -190,8 +190,38 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
// Stencil query
|
// Stencil query
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
inline int GetNodeLocal(int osite) {
|
inline int SameNode(int point) {
|
||||||
return _entries[_npoints*osite]._node_local;
|
|
||||||
|
int dimension = _directions[point];
|
||||||
|
int displacement = _distances[point];
|
||||||
|
assert( (displacement==1) || (displacement==-1));
|
||||||
|
|
||||||
|
int pd = _grid->_processors[dimension];
|
||||||
|
int fd = _grid->_fdimensions[dimension];
|
||||||
|
int ld = _grid->_ldimensions[dimension];
|
||||||
|
int rd = _grid->_rdimensions[dimension];
|
||||||
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
|
if ( ! comm_dim ) return 1;
|
||||||
|
|
||||||
|
int nbr_proc;
|
||||||
|
if (displacement==1) nbr_proc = 1;
|
||||||
|
else nbr_proc = pd-1;
|
||||||
|
|
||||||
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
||||||
|
|
||||||
|
if ( shm==NULL ) return 0;
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
inline int GetNodeLocal(int osite,int point) {
|
||||||
|
return _entries[point+_npoints*osite]._node_local;
|
||||||
}
|
}
|
||||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
|
inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
|
||||||
ptype = _permute_type[point]; return & _entries[point+_npoints*osite];
|
ptype = _permute_type[point]; return & _entries[point+_npoints*osite];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user