Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-16 06:47:06 +01:00)

Compare commits: feature/kn...feature/mp (45 commits)
Commit SHA1s in this compare (author, date and message columns were empty in the capture):
b820076b91
09f66100d3
d7d92af09d
460d0753a1
8f8058f8a5
d97a27f483
7c3363b91e
b94478fa51
13bf0482e3
a795b5705e
392e064513
b6a65059a2
ea25a4d9ac
c190221fd3
0fcd2e7188
910b8dd6a1
75ebd3a0d1
09fd5c43a7
f22317748f
6a9eae6b6b
fad96cf250
f331809c27
2c54a53d0a
306160ad9a
20a091c3ed
202078eb1b
a762b1fb71
5b5925b8e5
b58adc6a4b
f9d5e95d72
4f8e636a43
9b39f35ae6
5fe2b85cbd
c7cccaaa69
cbcfea466f
4955672fc3
39f1c880b8
8c043da5b7
3cbe974eb4
7af9b87318
811ca45473
bc1a4d40ba
c8079e6621
8b0d171c9a
1f293b76b4
@@ -153,9 +153,10 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
   std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
   err = ref-result;
   std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
+  assert (norm2(err)< 1.0e-5 );
   Dw.Report();
 }

@@ -192,7 +193,7 @@ int main (int argc, char ** argv)

   std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
   sDw.Report();

   if(0){
@@ -208,8 +209,7 @@ int main (int argc, char ** argv)

   std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;

-  RealF sum=0;
+  RealD sum=0;
   for(int x=0;x<latt4[0];x++){
   for(int y=0;y<latt4[1];y++){
   for(int z=0;z<latt4[2];z++){
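The RealF to RealD change above is the usual fix for long reductions: once a single-precision accumulator grows, per-site addends below half its ulp are silently dropped. A minimal standalone sketch of the effect (illustrative only, not Grid code):

    #include <cstdio>
    // Accumulate 1e8 tiny terms: the float sum stalls once the addend falls
    // below half an ulp of the running total; the double sum reaches 1.0.
    int main(void) {
      float  fsum = 0.0f;
      double dsum = 0.0;
      for (long i = 0; i < 100000000L; i++) {
        fsum += 1.0e-8f;
        dsum += 1.0e-8;
      }
      std::printf("float %g  double %g\n", (double)fsum, dsum);
      return 0;
    }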
@@ -227,12 +227,12 @@ int main (int argc, char ** argv)
   }
   }}}}}
   std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+  assert (sum< 1.0e-5 );


   if (1) {

   LatticeFermion sr_eo(sFGrid);
-  LatticeFermion serr(sFGrid);

   LatticeFermion ssrc_e (sFrbGrid);
   LatticeFermion ssrc_o (sFrbGrid);
@@ -244,8 +244,6 @@ int main (int argc, char ** argv)

   setCheckerboard(sr_eo,ssrc_o);
   setCheckerboard(sr_eo,ssrc_e);
-  serr = sr_eo-ssrc;
-  std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;

   sr_e = zero;
   sr_o = zero;
@@ -263,7 +261,7 @@ int main (int argc, char ** argv)
   double flops=(1344.0*volume*ncall)/2;

   std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
-  std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
+  std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
   sDw.Report();

   sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -273,9 +271,18 @@ int main (int argc, char ** argv)
   pickCheckerboard(Even,ssrc_e,sresult);
   pickCheckerboard(Odd ,ssrc_o,sresult);
   ssrc_e = ssrc_e - sr_e;
+  RealD error = norm2(ssrc_e);

   std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
   ssrc_o = ssrc_o - sr_o;

+  error+= norm2(ssrc_o);
   std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
+  if(error>1.0e-5) {
+    setCheckerboard(ssrc,ssrc_o);
+    setCheckerboard(ssrc,ssrc_e);
+    std::cout<< ssrc << std::endl;
+  }
   }


@@ -307,7 +314,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
   err = ref-result;
   std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-5);
   LatticeFermion src_e (FrbGrid);
   LatticeFermion src_o (FrbGrid);
   LatticeFermion r_e (FrbGrid);
@@ -334,7 +341,7 @@ int main (int argc, char ** argv)
   double flops=(1344.0*volume*ncall)/2;

   std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
-  std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
+  std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
   Dw.Report();
   }
   Dw.DhopEO(src_o,r_e,DaggerNo);
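For the flop-rate lines these hunks touch: `double flops=(1344.0*volume*ncall)/2` uses the 1344 flop/site the benchmark assigns to the Wilson dslash, halved because DhopEO acts on one checkerboard, and since t1-t0 is in microseconds, flops/(t1-t0) is flop/us, i.e. Mflop/s directly; the extra /NP gives the per-rank figure of the renamed message. A small sketch of the unit bookkeeping with hypothetical numbers:

    #include <cstdio>
    // All inputs here are hypothetical; only the arithmetic matters.
    int main(void) {
      double volume = 16.0*16*16*16*16;       // e.g. Ls * L^4 sites
      double ncall  = 1000;                   // benchmark iterations
      double t0 = 0.0, t1 = 5.0e6;            // elapsed time in microseconds
      double NP = 4;                          // number of MPI ranks
      double flops = (1344.0*volume*ncall)/2; // one checkerboard
      std::printf("mflop/s = %f\n", flops/(t1-t0));  // flop/usec == Mflop/s
      std::printf("mflop/s per rank = %f\n", flops/(t1-t0)/NP);
      return 0;
    }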
@@ -350,11 +357,14 @@ int main (int argc, char ** argv)

   err = r_eo-result;
   std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-5);

   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
+  assert(norm2(src_e)<1.0e-5);
+  assert(norm2(src_o)<1.0e-5);


 }
@@ -244,6 +244,9 @@ case ${ac_COMMS} in
      mpi)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        ;;
+     mpi3)
+       AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
+       ;;
      shmem)
        AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
        ;;
@@ -253,6 +256,7 @@ case ${ac_COMMS} in
 esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])

 ############### RNG selection
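These configure.ac hunks add an mpi3 communications target alongside mpi and shmem; judging from the `ac_COMMS` variable this would be selected with something like `--enable-comms=mpi3` (an assumption, the AC_ARG_ENABLE is not in this diff). The AC_DEFINE becomes a preprocessor macro that picks the backend at compile time, roughly in this shape (an illustrative sketch, not Grid's actual source):

    #include <cstdio>
    // Sketch: exactly one GRID_COMMS_* macro is defined per build by configure.
    static const char *comms_backend(void) {
    #if defined(GRID_COMMS_MPI3)
      return "mpi3";
    #elif defined(GRID_COMMS_MPI)
      return "mpi";
    #elif defined(GRID_COMMS_SHMEM)
      return "shmem";
    #else
      return "none";
    #endif
    }
    int main(void) { std::printf("comms: %s\n", comms_backend()); return 0; }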
@@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <mm_malloc.h>
 #endif

-#ifdef GRID_COMMS_SHMEM
-extern "C" {
-#include <mpp/shmem.h>
-extern void * shmem_align(size_t, size_t);
-extern void shmem_free(void *);
-}
-#endif

 namespace Grid {
@@ -65,53 +57,21 @@ public:
   typedef _Tp value_type;

   template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };

   alignedAllocator() throw() { }

   alignedAllocator(const alignedAllocator&) throw() { }

   template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }

   ~alignedAllocator() throw() { }

   pointer address(reference __x) const { return &__x; }
-  // const_pointer address(const_reference __x) const { return &__x; }

   size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }

   pointer allocate(size_type __n, const void* _p= 0)
   {
-#ifdef GRID_COMMS_SHMEM
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
-
-#define PARANOID_SYMMETRIC_HEAP
-#ifdef PARANOID_SYMMETRIC_HEAP
-    static void * bcast;
-    static long psync[_SHMEM_REDUCE_SYNC_SIZE];
-
-    bcast = (void *) ptr;
-    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
-
-    if ( bcast != ptr ) {
-      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
-      BACKTRACEFILE();
-      exit(0);
-    }
-
-    assert( bcast == (void *) ptr);
-
-#endif
-#else
-
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif

-#endif
     _Tp tmp;
 #ifdef GRID_NUMA
 #pragma omp parallel for schedule(static)
@@ -123,27 +83,109 @@ public:
   }

   void deallocate(pointer __p, size_type) {
-#ifdef GRID_COMMS_SHMEM
-    shmem_free((void *)__p);
-#else
 #ifdef HAVE_MM_MALLOC_H
     _mm_free((void *)__p);
 #else
     free((void *)__p);
 #endif
-#endif
   }
   void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };

   void destroy(pointer __p) { };
 };
+template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }

-template<typename _Tp> inline bool
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-
-template<typename _Tp> inline bool
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+//////////////////////////////////////////////////////////////////////////////////////////
+// MPI3 : comms must use shm region
+// SHMEM: comms must use symmetric heap
+//////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_COMMS_SHMEM
+extern "C" {
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void shmem_free(void *);
+}
+#define PARANOID_SYMMETRIC_HEAP
+#endif
+
+template<typename _Tp>
+class commAllocator {
+public:
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
+  commAllocator() throw() { }
+  commAllocator(const commAllocator&) throw() { }
+  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
+  ~commAllocator() throw() { }
+  pointer address(reference __x) const { return &__x; }
+  size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+#ifdef GRID_COMMS_SHMEM
+  pointer allocate(size_type __n, const void* _p= 0)
+  {
+#ifdef CRAY
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#else
+    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+#endif
+#ifdef PARANOID_SYMMETRIC_HEAP
+    static void * bcast;
+    static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+    bcast = (void *) ptr;
+    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
+
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      // BACKTRACEFILE();
+      exit(0);
+    }
+    assert( bcast == (void *) ptr);
+#endif
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) {
+    shmem_free((void *)__p);
+  }
+#else
+  pointer allocate(size_type __n, const void* _p= 0)
+  {
+#ifdef HAVE_MM_MALLOC_H
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#else
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#endif
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) {
+#ifdef HAVE_MM_MALLOC_H
+    _mm_free((void *)__p);
+#else
+    free((void *)__p);
+#endif
+  }
+#endif
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
+template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Template typedefs
+////////////////////////////////////////////////////////////////////////////////
+template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;
+template<class T> using commVector = std::vector<T,commAllocator<T> >;
+template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;

 }; // namespace Grid
 #endif
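The new commAllocator mirrors alignedAllocator but routes storage through the comms-capable heap (the symmetric heap under SHMEM), and the commVector template typedef puts any std::vector on it. A self-contained sketch of the same mechanism, with posix_memalign standing in for shmem_align; all names here are illustrative, not Grid's:

    #include <cstddef>
    #include <cstdlib>
    #include <new>
    #include <vector>
    // Minimal C++11 allocator: std::allocator_traits fills in the rest.
    template<typename T>
    struct CommAllocatorSketch {
      typedef T value_type;
      CommAllocatorSketch() = default;
      template<typename U> CommAllocatorSketch(const CommAllocatorSketch<U>&) {}
      T* allocate(std::size_t n) {
        void *p = nullptr;  // stand-in for shmem_align / a shared region
        if (posix_memalign(&p, 128, n*sizeof(T))) throw std::bad_alloc();
        return static_cast<T*>(p);
      }
      void deallocate(T* p, std::size_t) { std::free(p); }
    };
    // Stateless, so all instances compare equal (required for containers).
    template<typename T, typename U>
    bool operator==(const CommAllocatorSketch<T>&, const CommAllocatorSketch<U>&){ return true; }
    template<typename T, typename U>
    bool operator!=(const CommAllocatorSketch<T>&, const CommAllocatorSketch<U>&){ return false; }
    // Mirrors the "template typedef" in the hunk above.
    template<class T> using commVectorSketch = std::vector<T, CommAllocatorSketch<T> >;

    int main(void) { commVectorSketch<double> buf(1024, 0.0); return (int)buf.size(); }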
@@ -38,6 +38,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif

+#ifdef GRID_COMMS_MPI3
+#include <Grid/cshift/Cshift_mpi.h>
+#endif
+
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif

lib/FFT.h (27 lines changed)

@@ -200,18 +200,14 @@ namespace Grid {
       sign,FFTW_ESTIMATE);
   }

-  double add,mul,fma;
-  FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-  flops_call = add+mul+2.0*fma;
-
-  GridStopWatch timer;
-
+  std::vector<int> lcoor(Nd), gcoor(Nd);

   // Barrel shift and collect global pencil
   for(int p=0;p<processors[dim];p++) {

     for(int idx=0;idx<sgrid->lSites();idx++) {

-      std::vector<int> lcoor(Nd);
       sgrid->LocalIndexToLocalCoor(idx,lcoor);

       sobj s;
@@ -228,14 +224,11 @@ namespace Grid {

     // Loop over orthog coords
     int NN=pencil_g.lSites();
-    GridStopWatch Timer;
-    Timer.Start();
+    GridStopWatch timer;
+    timer.Start();

 PARALLEL_FOR_LOOP
     for(int idx=0;idx<NN;idx++) {

-      std::vector<int> lcoor(Nd);
       pencil_g.LocalIndexToLocalCoor(idx,lcoor);

       if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
@@ -245,15 +238,17 @@ PARALLEL_FOR_LOOP
       }
     }

-    Timer.Stop();
-    usec += Timer.useconds();
-    flops+= flops_call*NN;
+    timer.Stop();

+    double add,mul,fma;
+    FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+    flops_call = add+mul+2.0*fma;
+    usec += timer.useconds();
+    flops+= flops_call*NN;
     int pc = processor_coor[dim];
     for(int idx=0;idx<sgrid->lSites();idx++) {
-      std::vector<int> lcoor(Nd);
       sgrid->LocalIndexToLocalCoor(idx,lcoor);
-      std::vector<int> gcoor = lcoor;
+      gcoor = lcoor;
       // extract the result
       sobj s;
       gcoor[dim] = lcoor[dim]+L*pc;
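The lib/FFT.h hunks mostly hoist the per-site `std::vector<int> lcoor(Nd)` scratch out of the site loops, trading one heap allocation per site for a single reusable one, and defer the fftw_flops query until after the plan has run. A sketch of the hoisting pattern (function and variable names are illustrative); one caveat worth noting is that inside an OpenMP loop such as PARALLEL_FOR_LOOP the scratch must be thread-private:

    #include <vector>
    void per_site_alloc(int nsites, int Nd) {
      for (int idx = 0; idx < nsites; idx++) {
        std::vector<int> lcoor(Nd);  // heap round-trip on every iteration
        // ... decode idx into lcoor, use it ...
      }
    }
    void hoisted_alloc(int nsites, int Nd) {
      std::vector<int> lcoor(Nd), gcoor(Nd);  // one allocation, reused
      for (int idx = 0; idx < nsites; idx++) {
        // ... decode idx into lcoor, derive gcoor, use them ...
      }
    }
    int main() { per_site_alloc(8, 4); hoisted_alloc(8, 4); }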
lib/Init.cc (22 lines changed)
@@ -171,14 +171,17 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 /////////////////////////////////////////////////////////
 //
 /////////////////////////////////////////////////////////
+static int Grid_is_initialised = 0;
+

 void Grid_init(int *argc,char ***argv)
 {
+  GridLogger::StopWatch.Start();

   CartesianCommunicator::Init(argc,argv);

   // Parse command line args.

-  GridLogger::StopWatch.Start();

   std::string arg;
   std::vector<std::string> logstreams;
   std::string defaultLog("Error,Warning,Message,Performance");
@@ -216,11 +219,14 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }

   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);
   }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
+    GridLogTimestamp(1);
+  }

   GridParseLayout(*argv,*argc,
                   Grid_default_latt,
                   Grid_default_mpi);
@@ -246,15 +252,15 @@ void Grid_init(int *argc,char ***argv)
   std::cout <<std::endl;
   std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
   std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
-  std::cout <<COL_RED << "__|__| | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
-  std::cout <<COL_RED << "__|__ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ | | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
+  std::cout <<COL_RED << "__|_ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
   std::cout <<COL_RED << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" RRRR "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_PURPLE<<" _|__"<<std::endl;
   std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_PURPLE<<" _|__"<<std::endl;
   std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_PURPLE<<" _|__"<<std::endl;
   std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G GG "<<COL_RED<<" RRRR "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_GREEN <<" _|__"<<std::endl;
   std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_GREEN <<" _|__"<<std::endl;
   std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" R R "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_GREEN <<" _|__"<<std::endl;
-  std::cout <<COL_BLUE << "__|__ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
   std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
   std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
   std::cout <<COL_BLUE << " | | | | "<< "| | | "<<COL_GREEN <<" | | |"<< " | | | | "<<std::endl;
@@ -274,12 +280,14 @@ void Grid_init(int *argc,char ***argv)
   std::cout << "GNU General Public License for more details."<<std::endl;
   std::cout << COL_BACKGROUND <<std::endl;
   std::cout << std::endl;

+  Grid_is_initialised = 1;
 }


 void Grid_finalize(void)
 {
-#ifdef GRID_COMMS_MPI
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
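The new `--timestamp` option follows the same GridCmdOptionExists idiom as `--lebesgue` and `--cacheblocking`: a linear scan of the argv range. An illustrative reimplementation of that idiom (not Grid's source):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    // Scan [begin,end) for an exact flag match, as GridCmdOptionExists does.
    static bool cmdOptionExists(char **begin, char **end, const std::string &option) {
      return std::find(begin, end, option) != end;
    }
    int main(int argc, char **argv) {
      if (cmdOptionExists(argv, argv + argc, "--timestamp")) {
        std::printf("timestamps on\n");  // Grid calls GridLogTimestamp(1) here
      }
      return 0;
    }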
@@ -33,6 +33,7 @@ namespace Grid {

   void Grid_init(int *argc,char ***argv);
   void Grid_finalize(void);

   // internal, controled with --handle
   void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
   void Grid_debug_handler_init(void);
@@ -44,6 +45,7 @@ namespace Grid {
   const std::vector<int> &GridDefaultMpi(void);
   const int &GridThreads(void) ;
   void GridSetThreads(int t) ;
+  void GridLogTimestamp(int);

   // Common parsing chores
   std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
@@ -34,8 +34,13 @@ directory
 namespace Grid {

 GridStopWatch Logger::StopWatch;
+int Logger::timestamp;
 std::ostream Logger::devnull(0);

+void GridLogTimestamp(int on){
+  Logger::Timestamp(on);
+}
+
 Colours GridLogColours(0);
 GridLogger GridLogError(1, "Error", GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
@@ -73,7 +78,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#ifdef GRID_COMMS_MPI
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
lib/Log.h (23 lines changed)
@@ -39,8 +39,9 @@

 namespace Grid {

+//////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
-int Rank(void); // used for early stage debug before library init
+//////////////////////////////////////////////////////////////////////////////////////////////////


 class Colours{
@@ -55,7 +56,6 @@ public:

   void Active(bool activate){
     is_active=activate;
-
     if (is_active){
       colour["BLACK"] ="\033[30m";
       colour["RED"] ="\033[31m";
@@ -77,10 +77,7 @@ public:
       colour["WHITE"] ="";
       colour["NORMAL"]="";
     }
-
-
   };
-
 };


@@ -88,6 +85,7 @@ class Logger {
 protected:
   Colours &Painter;
   int active;
+  static int timestamp;
   std::string name, topName;
   std::string COLOUR;

@@ -99,8 +97,7 @@ public:
   std::string evidence() {return Painter.colour["YELLOW"];}
   std::string colour() {return Painter.colour[COLOUR];}

-  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
-    : active(on),
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on),
       name(nm),
       topName(topNm),
       Painter(col_class),
@@ -108,16 +105,20 @@ public:

   void Active(int on) {active = on;};
   int isActive(void) {return active;};
+  static void Timestamp(int on) {timestamp = on;};

   friend std::ostream& operator<< (std::ostream& stream, Logger& log){

     if ( log.active ) {
+      stream << log.background()<< log.topName << log.background()<< " : ";
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
+      if ( log.timestamp ) {
       StopWatch.Stop();
       GridTime now = StopWatch.Elapsed();
       StopWatch.Start();
-      stream << log.background()<< log.topName << log.background()<< " : ";
-      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
-      stream << log.evidence()<< now << log.background() << " : " << log.colour();
+      stream << log.evidence()<< now << log.background() << " : " ;
+      }
+      stream << log.colour();
       return stream;
     } else {
       return devnull;
@@ -149,7 +150,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];

 #define BACKTRACEFILE() {\
   char string[20]; \
-  std::sprintf(string,"backtrace.%d",Rank()); \
+  std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
   std::FILE * fp = std::fopen(string,"w"); \
   BACKTRACEFP(fp)\
   std::fclose(fp); \
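The Log.h/Log.cc changes thread a static `timestamp` flag through Logger: `GridLogTimestamp(1)` (wired to the `--timestamp` option) makes every active stream prefix the elapsed StopWatch time. A minimal standalone sketch of the same idea using std::chrono directly (illustrative, not Grid's Logger):

    #include <chrono>
    #include <iostream>
    class MiniLog {
      static bool timestamp;
      static std::chrono::steady_clock::time_point start;
    public:
      static void Timestamp(bool on) { timestamp = on; }
      friend std::ostream &operator<<(std::ostream &os, const MiniLog &) {
        os << "Grid : Message : ";
        if (timestamp) {  // optional elapsed-time prefix
          auto us = std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::steady_clock::now() - start).count();
          os << us << " us : ";
        }
        return os;
      }
    };
    bool MiniLog::timestamp = false;
    std::chrono::steady_clock::time_point MiniLog::start = std::chrono::steady_clock::now();

    int main() {
      MiniLog log;
      std::cout << log << "without timestamp" << std::endl;
      MiniLog::Timestamp(true);   // what --timestamp switches on
      std::cout << log << "with timestamp" << std::endl;
    }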
@@ -1,14 +1,22 @@
 extra_sources=
 if BUILD_COMMS_MPI
   extra_sources+=communicator/Communicator_mpi.cc
+  extra_sources+=communicator/Communicator_base.cc
+endif
+
+if BUILD_COMMS_MPI3
+  extra_sources+=communicator/Communicator_mpi3.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif

 if BUILD_COMMS_SHMEM
   extra_sources+=communicator/Communicator_shmem.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif

 if BUILD_COMMS_NONE
   extra_sources+=communicator/Communicator_none.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif

 #
lib/Stencil.h (338 lines changed)
@@ -70,9 +70,56 @@

 namespace Grid {

+inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+                                               int off,std::vector<std::pair<int,int> > & table)
+{
+  table.resize(0);
+  int rd = grid->_rdimensions[dimension];
+
+  if ( !grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+  int so= plane*grid->_ostride[dimension]; // base offset for start of plane
+  int e1=grid->_slice_nblock[dimension];
+  int e2=grid->_slice_block[dimension];
+
+  int stride=grid->_slice_stride[dimension];
+  if ( cbmask == 0x3 ) {
+    table.resize(e1*e2);
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o  = n*stride;
+        int bo = n*e2;
+        table[bo+b]=std::pair<int,int>(bo+b,o+b);
+      }
+    }
+  } else {
+    int bo=0;
+    table.resize(e1*e2/2);
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o  = n*stride;
+        int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
+        if ( ocb &cbmask ) {
+          table[bo]=std::pair<int,int>(bo,o+b); bo++;
+        }
+      }
+    }
+  }
+}
+
+template<class vobj,class cobj,class compressor> void
+Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
+{
+PARALLEL_FOR_LOOP
+  for(int i=0;i<table.size();i++){
+    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+  }
+}
+
 struct StencilEntry {
-  uint32_t _offset;
-  uint32_t _byte_offset;
+  uint64_t _offset;
+  uint64_t _byte_offset;
   uint16_t _is_local;
   uint16_t _permute;
   uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
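Gather_plane_simple_table_compute and Gather_plane_simple_table split the face gather into a one-off index computation and a reusable tight copy loop; with face_table cached after the first HaloGather, later exchanges skip the coordinate arithmetic entirely. A simplified, self-contained sketch of the two-pass scheme (plain copies instead of the compressor, and invented geometry numbers):

    #include <cstddef>
    #include <cstdio>
    #include <utility>
    #include <vector>
    // Pass 1: compute (buffer index, lattice offset) pairs for a face, once.
    static void compute_table(int e1, int e2, int stride,
                              std::vector<std::pair<int,int> > &table) {
      table.resize(e1*e2);
      for (int n = 0; n < e1; n++)
        for (int b = 0; b < e2; b++)
          table[n*e2+b] = std::make_pair(n*e2+b, n*stride+b);
    }
    // Pass 2: the hot loop, reused every halo exchange (Grid parallelises it).
    template<class T>
    static void apply_table(const std::vector<std::pair<int,int> > &table,
                            const T *lattice, T *buffer, int off, int so) {
      for (std::size_t i = 0; i < table.size(); i++)
        buffer[off + table[i].first] = lattice[so + table[i].second];
    }
    int main() {
      std::vector<double> lattice(64);
      for (int i = 0; i < 64; i++) lattice[i] = i;
      std::vector<double> buffer(16);
      std::vector<std::pair<int,int> > table;
      compute_table(4, 4, 8, table);   // hypothetical face geometry
      apply_table(table, lattice.data(), buffer.data(), 0, 0);
      std::printf("buffer[5]=%g\n", buffer[5]);
    }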
@@ -82,6 +129,7 @@
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 public:

+  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
   typedef uint32_t StencilInteger;
   typedef typename cobj::vector_type vector_type;
   typedef typename cobj::scalar_type scalar_type;
@@ -97,86 +145,60 @@
     Integer to_rank;
     Integer from_rank;
     Integer bytes;
-    volatile Integer done;
   };

   std::vector<Packet> Packets;

-#define SEND_IMMEDIATE
-#define SERIAL_SENDS
+  int face_table_computed;
+  std::vector<std::vector<std::pair<int,int> > > face_table ;

   void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
-#ifdef SEND_IMMEDIATE
-    commtime-=usecond();
-    _grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
-    commtime+=usecond();
-#endif
     Packet p;
     p.send_buf = xmit;
     p.recv_buf = rcv;
     p.to_rank  = to;
     p.from_rank= from;
     p.bytes    = bytes;
-    p.done     = 0;
     comms_bytes+=2.0*bytes;
     Packets.push_back(p);

   }

-#ifdef SERIAL_SENDS
-  void Communicate(void ) {
+  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
+  {
+    reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
-#ifndef SEND_IMMEDIATE
-      _grid->SendToRecvFrom(
+      _grid->StencilSendToRecvFromBegin(reqs[i],
                             Packets[i].send_buf,
                             Packets[i].to_rank,
                             Packets[i].recv_buf,
                             Packets[i].from_rank,
                             Packets[i].bytes);
-#endif
-      Packets[i].done = 1;
+      /*
+      }else{
+      _grid->SendToRecvFromBegin(reqs[i],
+                            Packets[i].send_buf,
+                            Packets[i].to_rank,
+                            Packets[i].recv_buf,
+                            Packets[i].from_rank,
+                            Packets[i].bytes);
+      }
+      */
     }
     commtime+=usecond();
   }
-#else
-  void Communicate(void ) {
-    typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
-    std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
+  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
+  {
     commtime-=usecond();
-    const int concurrency=2;
-    for(int i=0;i<Packets.size();i+=concurrency){
-      for(int ii=0;ii<concurrency;ii++){
-        int j = i+ii;
-        if ( j<Packets.size() ) {
-#ifndef SEND_IMMEDIATE
-          _grid->SendToRecvFromBegin(reqs[j],
-                                     Packets[j].send_buf,
-                                     Packets[j].to_rank,
-                                     Packets[j].recv_buf,
-                                     Packets[j].from_rank,
-                                     Packets[j].bytes);
-#endif
-        }
-      }
-      for(int ii=0;ii<concurrency;ii++){
-        int j = i+ii;
-        if ( j<Packets.size() ) {
-#ifndef SEND_IMMEDIATE
-          _grid->SendToRecvFromComplete(reqs[i]);
-#endif
-        }
-      }
-      for(int ii=0;ii<concurrency;ii++){
-        int j = i+ii;
-        if ( j<Packets.size() ) {
-          Packets[j].done = 1;
-        }
-      }
+    for(int i=0;i<Packets.size();i++){
+      // if( ShmDirectCopy )
+      _grid->StencilSendToRecvFromComplete(reqs[i]);
+      // else
+      // _grid->SendToRecvFromComplete(reqs[i]);
     }
     commtime+=usecond();
   }
-#endif

   ///////////////////////////////////////////
   // Simd merge queue for asynch comms
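CommunicateBegin/CommunicateComplete replace the single blocking Communicate and the per-packet `volatile done` flag that CommsMerge used to spin on: requests are posted for every packet up front and then completed as a batch, which is what the StencilSendToRecvFromBegin/Complete wrappers presumably do over MPI. A plain-MPI sketch of the same Begin/Complete split (a ring exchange under the assumption of one neighbour pair per rank, not Grid's wrappers):

    #include <cstddef>
    #include <mpi.h>
    #include <vector>
    struct Pkt { void *send_buf, *recv_buf; int to, from, bytes; };

    static void communicate_begin(std::vector<Pkt> &pkts, std::vector<MPI_Request> &reqs) {
      for (std::size_t i = 0; i < pkts.size(); i++) {
        MPI_Request r;
        MPI_Irecv(pkts[i].recv_buf, pkts[i].bytes, MPI_BYTE, pkts[i].from, 0, MPI_COMM_WORLD, &r);
        reqs.push_back(r);
        MPI_Isend(pkts[i].send_buf, pkts[i].bytes, MPI_BYTE, pkts[i].to, 0, MPI_COMM_WORLD, &r);
        reqs.push_back(r);
      }
    }
    static void communicate_complete(std::vector<MPI_Request> &reqs) {
      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
    }

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int rank, np;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &np);
      std::vector<char> sbuf(64, (char)rank), rbuf(64);
      Pkt p = { sbuf.data(), rbuf.data(), (rank+1)%np, (rank+np-1)%np, 64 };
      std::vector<Pkt> pkts(1, p);
      std::vector<MPI_Request> reqs;
      communicate_begin(pkts, reqs);   // overlap window: work goes here
      communicate_complete(reqs);
      MPI_Finalize();
    }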
@@ -196,36 +218,19 @@
       m.rpointers= rpointers;
       m.buffer_size = buffer_size;
       m.packet_id   = packet_id;
-#ifdef SEND_IMMEDIATE
-      mergetime-=usecond();
-PARALLEL_FOR_LOOP
-      for(int o=0;o<m.buffer_size;o++){
-        merge1(m.mpointer[o],m.rpointers,o);
-      }
-      mergetime+=usecond();
-#else
       Mergers.push_back(m);
-#endif

   }

   void CommsMerge(void ) {
-    //PARALLEL_NESTED_LOOP2
     for(int i=0;i<Mergers.size();i++){

-      spintime-=usecond();
-      int packet_id = Mergers[i].packet_id;
-      while(! Packets[packet_id].done ); // spin for completion
-      spintime+=usecond();
-
-#ifndef SEND_IMMEDIATE
       mergetime-=usecond();
 PARALLEL_FOR_LOOP
       for(int o=0;o<Mergers[i].buffer_size;o++){
         merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
       }
       mergetime+=usecond();
-#endif

     }
   }
@@ -248,45 +253,56 @@
   // Flat vector, change layout for cache friendly.
   Vector<StencilEntry> _entries;

-  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }

   void PrecomputeByteOffsets(void){
     for(int i=0;i<_entries.size();i++){
       if( _entries[i]._is_local ) {
         _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
       } else {
-        _entries[i]._byte_offset =(uint64_t)&comm_buf[0]+ _entries[i]._offset*sizeof(cobj);
+        _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
       }
     }
   };

-  inline uint64_t Touch(int ent) {
-    // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
-  }
+  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
   inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-    //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
-    uint64_t cbase = (uint64_t) 0;
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
     local = _entries[ent]._is_local;
     perm  = _entries[ent]._permute;
     if (perm)  ptype = _permute_type[point];
-    if (local) return  base + _entries[ent]._byte_offset;
-    else       return cbase + _entries[ent]._byte_offset;
+    if (local) {
+      return  base + _entries[ent]._byte_offset;
+    } else {
+      return cbase + _entries[ent]._byte_offset;
+    }
   }
   inline uint64_t GetPFInfo(int ent,uint64_t base) {
-    uint64_t cbase = (uint64_t) 0;
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
     int local = _entries[ent]._is_local;
     if (local) return  base + _entries[ent]._byte_offset;
     else       return cbase + _entries[ent]._byte_offset;
   }

-  // Comms buffers
-  std::vector<Vector<scalar_object> > u_simd_send_buf;
-  std::vector<Vector<scalar_object> > u_simd_recv_buf;
-  Vector<cobj> u_send_buf;
-  Vector<cobj> comm_buf;
+  ///////////////////////////////////////////////////////////
+  // Unified Comms buffers for all directions
+  ///////////////////////////////////////////////////////////
+  // Vectors that live on the symmetric heap in case of SHMEM
+  // std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
+  // std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
+  // commVector<cobj> u_send_buf_hide;
+  // commVector<cobj> u_recv_buf_hide;
+
+  // These are used; either SHM objects or refs to the above symmetric heap vectors
+  // depending on comms target
+  cobj* u_recv_buf_p;
+  cobj* u_send_buf_p;
+  std::vector<scalar_object *> u_simd_send_buf;
+  std::vector<scalar_object *> u_simd_recv_buf;

   int u_comm_offset;
   int _unified_buffer_size;

+  cobj *CommBuf(void) { return u_recv_buf_p; }

   /////////////////////////////////////////
   // Timing info; ugly; possibly temporary
   /////////////////////////////////////////
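The offset rework stores only relative byte offsets in StencilEntry (now uint64_t) and adds the base at lookup, using u_recv_buf_p for remote entries; entries no longer bake in the absolute address of comm_buf, so the receive buffer can be reallocated, including into a shared-memory region, without rebuilding the tables. Illustrative types only, not Grid's:

    #include <cstdint>
    #include <cstdio>
    struct EntrySketch { uint64_t byte_offset; uint16_t is_local; };
    // The base address is supplied per call, so entries stay relocatable.
    static uint64_t get_address(const EntrySketch &e, uint64_t base, uint64_t cbase) {
      return e.is_local ? base + e.byte_offset : cbase + e.byte_offset;
    }
    int main() {
      double field[8] = {0}, recv_buf[8] = {0};
      EntrySketch local_e  = { 3*sizeof(double), 1 };
      EntrySketch remote_e = { 2*sizeof(double), 0 };
      uint64_t base  = (uint64_t)&field[0];
      uint64_t cbase = (uint64_t)&recv_buf[0];  // plays the role of u_recv_buf_p
      std::printf("local  -> %p\n", (void*)get_address(local_e,  base, cbase));
      std::printf("remote -> %p\n", (void*)get_address(remote_e, base, cbase));
    }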
@@ -302,6 +318,8 @@
   double gathermtime;
   double splicetime;
   double nosplicetime;
+  double t_data;
+  double t_table;
   double calls;

   void ZeroCounters(void) {
@@ -314,6 +332,8 @@
     gathermtime = 0.;
     splicetime = 0.;
     nosplicetime = 0.;
+    t_data = 0.0;
+    t_table= 0.0;
     comms_bytes = 0.;
     calls = 0.;
   };
@@ -323,16 +343,21 @@
   std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
   if ( calls > 0. ) {
     std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
-    PRINTIT(jointime);
-    PRINTIT(gathertime);
-    PRINTIT(commtime);
     PRINTIT(halogtime);
-    PRINTIT(mergetime);
-    PRINTIT(spintime);
-    PRINTIT(comms_bytes);
+    PRINTIT(gathertime);
     PRINTIT(gathermtime);
+    PRINTIT(mergetime);
+    if(comms_bytes>1.0){
+      PRINTIT(comms_bytes);
+      PRINTIT(commtime);
+      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
+    }
+    PRINTIT(jointime);
+    PRINTIT(spintime);
     PRINTIT(splicetime);
     PRINTIT(nosplicetime);
+    PRINTIT(t_table);
+    PRINTIT(t_data);
   }
 };
 #endif
@@ -344,6 +369,7 @@
      const std::vector<int> &distances)
   :  _permute_type(npoints), _comm_buf_size(npoints)
 {
+  face_table_computed=0;
   _npoints = npoints;
   _grid    = grid;
   _directions = directions;
@@ -358,7 +384,6 @@
     int i = ii; // reverse direction to get SIMD comms done first
     int point = i;

-
     int dimension    = directions[i];
     int displacement = distances[i];
     int shift = displacement;
@@ -405,18 +430,25 @@
       }
     }
   }
-  u_send_buf.resize(_unified_buffer_size);
-  comm_buf.resize(_unified_buffer_size);
-
-  PrecomputeByteOffsets();

+  /////////////////////////////////////////////////////////////////////////////////
+  // Try to allocate for receiving in a shared memory region, fall back to buffer
+  /////////////////////////////////////////////////////////////////////////////////
   const int Nsimd = grid->Nsimd();

+  _grid->ShmBufferFreeAll();
+
   u_simd_send_buf.resize(Nsimd);
   u_simd_recv_buf.resize(Nsimd);

+  u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
+  u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
   for(int l=0;l<Nsimd;l++){
-    u_simd_send_buf[l].resize(_unified_buffer_size);
-    u_simd_recv_buf[l].resize(_unified_buffer_size);
+    u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+    u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
   }

+  PrecomputeByteOffsets();
 }

 void Local     (int point, int dimension,int shiftpm,int cbmask)
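ShmBufferFreeAll followed by successive ShmBufferMalloc calls suggests a bump-style carve-up of one shared region: reset, then hand out aligned slices for the send, receive and per-lane SIMD buffers. A sketch of that allocation style (illustrative, not Grid's CartesianCommunicator code):

    #include <cassert>
    #include <cstddef>
    #include <vector>
    // Carve successive 128-byte-aligned slices out of one big region, so all
    // comms buffers live in the same (conceptually shared) segment.
    class BumpRegion {
      std::vector<unsigned char> region;  // stand-in for an mmap'd/shm segment
      std::size_t top;
    public:
      explicit BumpRegion(std::size_t bytes) : region(bytes), top(0) {}
      void *Malloc(std::size_t bytes) {
        std::size_t aligned = (bytes + 127) & ~std::size_t(127);
        assert(top + aligned <= region.size());
        void *p = &region[top];
        top += aligned;
        return p;
      }
      void FreeAll() { top = 0; }  // cheap wholesale reset
    };
    int main() {
      BumpRegion shm(1 << 20);
      void *send = shm.Malloc(4096);
      void *recv = shm.Malloc(4096);
      (void)send; (void)recv;
      shm.FreeAll();
    }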
@@ -640,38 +672,22 @@
       }
     }

-  template<class compressor>
-  void HaloExchange(const Lattice<vobj> &source,compressor &compress)
+  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
   {
+    std::vector<std::vector<CommsRequest_t> > reqs;
     calls++;
     Mergers.resize(0);
     Packets.resize(0);
+    _grid->StencilBarrier();
     HaloGather(source,compress);
-    this->Communicate();
+    this->CommunicateBegin(reqs);
+    _grid->StencilBarrier();
+    this->CommunicateComplete(reqs);
+    _grid->StencilBarrier();
     CommsMerge(); // spins
   }
-#if 0
-  // Overlapping comms and compute typically slows down compute and is useless
-  // unless memory bandwidth greatly exceeds network
-  template<class compressor>
-  std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
-    Mergers.resize(0);
-    Packets.resize(0);
-    HaloGather(source,compress);
-    return std::thread([&] { this->Communicate(); });
-  }
-  void HaloExchangeComplete(std::thread &thr)
-  {
-    CommsMerge(); // spins
-    jointime-=usecond();
-    thr.join();
-    jointime+=usecond();
-  }
-#endif
-  template<class compressor>
-  void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
+  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
   {
     int dimension    = _directions[point];
     int displacement = _distances[point];
@@ -699,23 +715,23 @@
     if ( sshift[0] == sshift[1] ) {
       if (splice_dim) {
         splicetime-=usecond();
-        GatherSimd(source,dimension,shift,0x3,compress);
+        GatherSimd(source,dimension,shift,0x3,compress,face_idx);
         splicetime+=usecond();
       } else {
         nosplicetime-=usecond();
-        Gather(source,dimension,shift,0x3,compress);
+        Gather(source,dimension,shift,0x3,compress,face_idx);
         nosplicetime+=usecond();
       }
     } else {
       if(splice_dim){
         splicetime-=usecond();
-        GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
-        GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
+        GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
+        GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
         splicetime+=usecond();
       } else {
         nosplicetime-=usecond();
-        Gather(source,dimension,shift,0x1,compress);
-        Gather(source,dimension,shift,0x2,compress);
+        Gather(source,dimension,shift,0x1,compress,face_idx);
+        Gather(source,dimension,shift,0x2,compress,face_idx);
         nosplicetime+=usecond();
       }
     }
@@ -729,21 +745,22 @@
     assert(source._grid==_grid);
     halogtime-=usecond();

-    assert (comm_buf.size() == _unified_buffer_size );
     u_comm_offset=0;

     // Gather all comms buffers
+    int face_idx=0;
     for(int point = 0 ; point < _npoints; point++) {
       compress.Point(point);
-      HaloGatherDir(source,compress,point);
+      HaloGatherDir(source,compress,point,face_idx);
     }
+    face_table_computed=1;

     assert(u_comm_offset==_unified_buffer_size);
     halogtime+=usecond();
   }

   template<class compressor>
-  void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
+  void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
   {
     typedef typename cobj::vector_type vector_type;
     typedef typename cobj::scalar_type scalar_type;
@ -780,31 +797,54 @@
|
|||||||
int bytes = words * sizeof(cobj);
|
int bytes = words * sizeof(cobj);
|
||||||
|
|
||||||
gathertime-=usecond();
|
gathertime-=usecond();
|
||||||
Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
|
int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
gathertime+=usecond();
|
if ( !face_table_computed ) {
|
||||||
|
t_table-=usecond();
|
||||||
|
face_table.resize(face_idx+1);
|
||||||
|
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
|
||||||
|
face_table[face_idx]);
|
||||||
|
t_table+=usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int rank = _grid->_processor;
|
int rank = _grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
assert (xmit_to_rank != _grid->ThisRank());
|
assert (xmit_to_rank != _grid->ThisRank());
|
||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
/////////////////////////////////////////////////////////
|
||||||
AddPacket((void *)&u_send_buf[u_comm_offset],
|
// try the direct copy if possible
|
||||||
(void *) &comm_buf[u_comm_offset],
|
/////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
|
||||||
|
if ( send_buf==NULL ) {
|
||||||
|
send_buf = u_send_buf_p;
|
||||||
|
}
|
||||||
|
// std::cout << " send_bufs "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl;
|
||||||
|
t_data-=usecond();
|
||||||
|
assert(u_send_buf_p!=NULL);
|
||||||
|
assert(send_buf!=NULL);
|
||||||
|
Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
|
||||||
|
t_data+=usecond();
|
||||||
|
|
||||||
|
AddPacket((void *)&send_buf[u_comm_offset],
|
||||||
|
(void *)&u_recv_buf_p[u_comm_offset],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
|
gathertime+=usecond();
|
||||||
u_comm_offset+=words;
|
u_comm_offset+=words;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
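The rewritten Gather above asks ShmBufferTranslate for the peer's view of the receive buffer and gathers straight into it when the peer shares the node, falling back to the local send buffer (and hence a genuine MPI transfer) only when the translation returns NULL. The decision in isolation, with a stub standing in for the real translation (illustrative names):

// Direct-copy-or-fallback choice made in Gather above.
#include <cstddef>

// Stub standing in for CartesianCommunicator::ShmBufferTranslate:
// returns the peer's view of local_p, or NULL when the peer is off-node.
void *shm_translate(int peer, void *local_p) {
  (void)peer; (void)local_p;
  return NULL;               // off-node in this stand-alone sketch
}

template <class T>
T *choose_send_buffer(int peer, T *recv_buf, T *local_send_buf) {
  // On-node peer: gather straight into its receive slab, so the later
  // send/recv degenerates to a no-op. Off-node: use the MPI path.
  T *direct = (T *)shm_translate(peer, (void *)recv_buf);
  return direct ? direct : local_send_buf;
}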
template<class compressor>
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{
const int Nsimd = _grid->Nsimd();

@ -883,10 +923,6 @@
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];

void *vrp = (void *)rp;
void *vsp = (void *)sp;


if(nbr_proc){

int recv_from_rank;
@ -894,9 +930,17 @@

_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
// if ((ShmDirectCopy==0)||(shm==NULL)) {
if (shm==NULL) {
shm = rp;
}

rpointers[i] = rp;
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
// assuming above pointer flip
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);

rpointers[i] = shm;

} else {

@ -905,7 +949,7 @@
}
}

AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);

u_comm_offset +=buffer_size;
}
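GatherSimd applies the same idea on the receive side: rpointers[i] is flipped to the sender's shared-memory copy when one is visible, so the later merge reads the neighbour's data in place and the AddPacket copy only matters off-node. A minimal sketch of that pointer flip (translate is a stand-in for ShmBufferTranslate; names are illustrative):

// Read-in-place variant of the shared-memory shortcut, as in GatherSimd above.
#include <cstddef>

// translate(peer, p) should return the peer's view of pointer p, or NULL
// when the peer is off-node.
template <class T>
const T *choose_merge_source(void *(*translate)(int, void *),
                             int peer, T *recv_slab, T *peer_send_slab) {
  T *shm = (T *)translate(peer, (void *)peer_send_slab);
  return shm ? shm : recv_slab;   // off-node: MPI delivers into recv_slab
}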

@ -127,6 +127,22 @@ class GridThread {
ThreadBarrier();
};

static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}


};

}
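The GridThread::bcopy added above splits a single copy across the OpenMP team; each thread obtains its slice from GetWorkBarrier and copies only that range. A self-contained sketch of the same chunking with the work division written out explicitly (compile with OpenMP enabled; names are illustrative):

// Splitting one copy across an OpenMP team, as GridThread::bcopy does above.
#include <omp.h>
#include <cstddef>
#include <cstring>

void parallel_copy(const void *src, void *dst, size_t len) {
#pragma omp parallel
  {
    int nthr     = omp_get_num_threads();
    int me       = omp_get_thread_num();
    size_t chunk = (len + nthr - 1) / nthr;             // ceil(len/nthr)
    size_t off   = me * chunk;
    size_t mine  = 0;
    if (off < len) mine = (len - off < chunk) ? (len - off) : chunk;
    // Each thread copies only its own slice of the buffer.
    memcpy((char *)dst + off, (const char *)src + off, mine);
  }
}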

@ -77,7 +77,7 @@ public:
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoard(std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;

@ -49,7 +49,7 @@ public:
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(std::vector<int> site){
virtual int CheckerBoard(std::vector<int> &site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift,int dim){

@ -49,7 +49,7 @@ public:
if( dim==_checker_dim) return 1;
else return 0;
}
virtual int CheckerBoard(std::vector<int> site){
virtual int CheckerBoard(std::vector<int> &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){

132 lib/communicator/Communicator_base.cc Normal file
@ -0,0 +1,132 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/communicator/Communicator_none.cc

Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
namespace Grid {

///////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void * CartesianCommunicator::ShmCommBuf;

/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
std::cout <<"Shm alloc "<<ptr<<std::endl;
assert(heap_bytes < MAX_MPI_SHM_BYTES);
return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}

/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };

////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){ return WorldRank; };
int CartesianCommunicator::Ranks (void) { return WorldSize; };
int CartesianCommunicator::Nodes (void) { return GroupSize; };
int CartesianCommunicator::Cores (void) { return ShmSize; };
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
int CartesianCommunicator::CoreRank (void) { return ShmRank; };

void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}

#ifndef GRID_COMMS_MPI3

void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes)
{
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};

commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;

void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }

void *CartesianCommunicator::ShmBuffer(int rank) {
return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
}

#endif

}
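ShmBufferMalloc and ShmBufferFreeAll in the new file form a bump allocator: allocation just advances heap_top, and the only free resets the whole arena, which suits scratch buffers whose lifetime is one stencil exchange. The idiom in isolation (illustrative, not Grid's class):

// Bump-allocator idiom behind ShmBufferMalloc/ShmBufferFreeAll above.
#include <cassert>
#include <cstddef>
#include <cstdint>

struct BumpArena {
  uint8_t *base;
  size_t   capacity;
  size_t   used = 0;

  void *alloc(size_t bytes) {
    assert(used + bytes <= capacity);   // mirrors the MAX_MPI_SHM_BYTES check
    void *p = base + used;
    used += bytes;
    return p;
  }
  void free_all() { used = 0; }         // one reset frees every allocation
};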
@ -34,77 +34,137 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif

namespace Grid {

class CartesianCommunicator {
public:

// Communicator should know nothing of the physics grid, only processor grid.
// 65536 ranks per node adequate for now
// 128MB shared memory for comms enough for 48^4 local vol comms
// Give external control (command line override?) of this

static const int MAXLOG2RANKSPERNODE = 16;
static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;

// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processor lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;

#ifdef GRID_COMMS_MPI
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Comm communicator;
static MPI_Comm communicator_world;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif

////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;

std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;

static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;

std::vector<int> LexicographicToWorldRank;

static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;

////////////////////////////////////////////////
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv);

// Constructor
////////////////////////////////////////////////
// Constructor of any given grid
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &pdimensions_in);

// Wraps MPI_Cart routines
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);

/////////////////////////////////
// Grid information queries
// Grid information and queries
/////////////////////////////////
int IsBoss(void) { return _processor==0; };
static int ShmRank;
int BossRank(void) { return 0; };
static int ShmSize;
int ThisRank(void) { return _processor; };
static int GroupSize;
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
static int GroupRank;
const std::vector<int> & ProcessorGrid(void) { return _processors; };
static int WorldRank;
int ProcessorCount(void) { return _Nprocessors; };
static int WorldSize;
static int Slave;

int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
static int Ranks (void);
static int Nodes (void);
static int Cores (void);
static int NodeRank (void);
static int CoreRank (void);

////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);

////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);

void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);

void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c);
void GlobalSum(ComplexF &c)
void GlobalSumVector(ComplexF *c,int N);
{
void GlobalSum(ComplexD &c);
GlobalSumVector((float *)&c,2);
void GlobalSumVector(ComplexD *c,int N);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}

void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}

template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
@ -112,6 +172,7 @@ class CartesianCommunicator {
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}

////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
@ -133,8 +194,19 @@ class CartesianCommunicator {
void *recv,
int recv_from_rank,
int bytes);

void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);

void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilBarrier(void);

////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
@ -144,13 +216,12 @@ class CartesianCommunicator {
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);

template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};

static void BroadcastWorld(int root,void* data, int bytes);

};
}

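The complex GlobalSum overloads declared here (and now defined in Communicator_base.cc above) reduce complex sums to real ones by viewing N complex numbers as 2N contiguous reals; std::complex<T> is layout-compatible with T[2], so the cast is well defined. The trick on its own, with a no-op stub in place of the MPI all-reduce:

// Complex reductions as real reductions, as in the GlobalSum overloads above.
#include <complex>

// Stand-in for the real all-reduce (MPI_Allreduce over MPI_DOUBLE in Grid).
void global_sum_vector(double *d, int N) { (void)d; (void)N; /* no-op on one rank */ }

inline void global_sum(std::complex<double> &c) {
  global_sum_vector(reinterpret_cast<double *>(&c), 2);     // one complex = two reals
}
inline void global_sum_vector(std::complex<double> *c, int N) {
  global_sum_vector(reinterpret_cast<double *>(c), 2 * N);  // N complex = 2N reals
}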

@ -30,6 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

namespace Grid {


///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;

// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
@ -37,12 +43,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( !flag ) {
MPI_Init(argc,argv);
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
int Rank(void) {
MPI_Comm_size(communicator_world,&WorldSize);
int pe;
ShmRank=0;
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
ShmSize=1;
return pe;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors;
_processor_coor.resize(_ndimension);

MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

@ -67,7 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)

assert(Size==_Nprocessors);
}

void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -168,7 +176,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);

assert(ierr==0);
}

@ -187,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
MPI_COMM_WORLD);
communicator_world);
assert(ierr==0);
}

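Init now duplicates MPI_COMM_WORLD into communicator_world, and every Cart create and world broadcast hangs off the duplicate, so Grid cannot collide with another library using the predefined communicator. The startup sequence in isolation (a sketch, not the full Grid Init):

// Startup sequence mirrored from CartesianCommunicator::Init above.
#include <mpi.h>

static MPI_Comm communicator_world;
static int WorldRank, WorldSize;

void comms_init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag);            // coexist with libraries that already did Init
  if (!flag) MPI_Init(argc, argv);

  // Private duplicate: later communicator surgery cannot collide with
  // other users of MPI_COMM_WORLD.
  MPI_Comm_dup(MPI_COMM_WORLD, &communicator_world);
  MPI_Comm_rank(communicator_world, &WorldRank);
  MPI_Comm_size(communicator_world, &WorldSize);
}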

574 lib/communicator/Communicator_mpi3.cc Normal file
@ -0,0 +1,574 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/communicator/Communicator_mpi.cc

Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>

namespace Grid {


///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;

MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;

std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;

void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}

void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}

MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);

/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;

/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator_world, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);

std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;

MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);

///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}

std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];

std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;

///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);

///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}

///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);

//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////

ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
// KNL hack -- force to numa-domain 1 in flat
#if 0
//#include <numaif.h>
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
if (WorldRank == 0){
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< ShmSize << " with ranks-per-node "<<std::endl;

std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;

for(int g=0;g<GroupSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
}

std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
for(int g=0;g<ShmSize;g++){
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}

}

for(int g=0;g<GroupSize;g++){
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
for(int r=0;r<ShmSize;r++){
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
MPI_Barrier(communicator_world);
}
}

assert(ShmSetup==0); ShmSetup=1;
}


////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;

assert(std::abs(shift) <_processors[dim]);

coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
source = LexicographicToWorldRank[source];

coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
dest = LexicographicToWorldRank[dest];
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
Lexicographic::CoorFromIndex(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;

communicator=communicator_world;

_ndimension = processors.size();

////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = -1;
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
if ( (0x1<<i) == ShmSize ) {
log2size = i;
break;
}
}
assert(log2size != -1);

////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int dim = 0;

std::vector<int> WorldDims = processors;

ShmDims.resize(_ndimension,1);
GroupDims.resize(_ndimension);

ShmCoor.resize(_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);

for(int l2=0;l2<log2size;l2++){
while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%_ndimension;
}

////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<_ndimension;d++){
GroupDims[d] = WorldDims[d]/ShmDims[d];
}

////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
assert(WorldSize==_Nprocessors);

////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
//
////////////////////////////////////////////////////////////////
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
for(int d=0;d<_ndimension;d++){
WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
}
_processor_coor = WorldCoor;

int lexico;
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico]=WorldRank;
_processor = lexico;

///////////////////////////////////////////////////////////////////
// global sum Lexico to World mapping
///////////////////////////////////////////////////////////////////
ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);

};

void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}


// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
#if 0
this->StencilBarrier();

MPI_Request xrq;
MPI_Request rrq;

static int sequence;

int ierr;
int tag;
int check;

assert(dest != _processor);
assert(from != _processor);

int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];

sequence++;

char *from_ptr = (char *)ShmCommBufs[ShmRank];

int small = (bytes<MAX_MPI_SHM_BYTES);

typedef uint64_t T;
int words = bytes/sizeof(T);

assert(((size_t)bytes &(sizeof(T)-1))==0);
assert(gme == ShmRank);

if ( small && (gdest !=MPI_UNDEFINED) ) {

char *to_ptr = (char *)ShmCommBufs[gdest];

assert(gme != gdest);

T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}

bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence));
} else {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}

this->StencilBarrier();

if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr;
T *op = (T *)recv;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
assert(tag==from);
} else {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}

this->StencilBarrier();

#else
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);

assert(ierr==0);

list.push_back(xrq);
list.push_back(rrq);
#endif
}

void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;

int ierr;

assert(dest != _processor);
assert(from != _processor);

int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];

assert(gme == ShmRank);

if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}

if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}

}


void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
SendToRecvFromComplete(list);
}

void CartesianCommunicator::StencilBarrier(void)
{
MPI_Win_sync (ShmWindow);
MPI_Barrier  (ShmComm);
MPI_Win_sync (ShmWindow);
}

void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}

void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}

void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}

}
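The MPI-3 implementation above rests on MPI_Comm_split_type, MPI_Win_allocate_shared and MPI_Win_shared_query: node-local ranks each contribute a slab to one shared window, then every rank resolves every peer's base address so halos can be copied through shared memory directly. A minimal free-standing program exercising that pattern (illustrative, independent of Grid):

// Minimal sketch of the MPI-3 shared-window pattern used above.
// Compile with an MPI-3 capable mpicxx; names here are illustrative.
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  // Split world into node-local groups that can share memory.
  MPI_Comm shm_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &shm_comm);
  int shm_rank, shm_size;
  MPI_Comm_rank(shm_comm, &shm_rank);
  MPI_Comm_size(shm_comm, &shm_size);

  // Each rank contributes one slab to a node-wide shared window.
  const MPI_Aint bytes = 1 << 20;
  void *my_base;
  MPI_Win win;
  MPI_Win_allocate_shared(bytes, 1, MPI_INFO_NULL, shm_comm, &my_base, &win);

  // Resolve every peer's base address; offsets then translate directly,
  // which is what ShmBufferTranslate does above.
  std::vector<void *> bases(shm_size);
  for (int r = 0; r < shm_size; r++) {
    MPI_Aint sz; int disp;
    MPI_Win_shared_query(win, r, &sz, &disp, &bases[r]);
  }

  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
  // ... intra-node halo exchange via bases[peer] would go here ...
  MPI_Win_unlock_all(win);

  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}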
@ -28,12 +28,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h"
namespace Grid {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////

void CartesianCommunicator::Init(int *argc, char *** arv)
{
WorldRank = 0;
WorldSize = 1;
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
}

int Rank(void ){ return 0; };

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
@ -89,30 +99,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(0);
}

void CartesianCommunicator::Barrier(void)
void CartesianCommunicator::Barrier(void){}
{
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
{
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
}


void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;
dest=0;
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
return 0;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}


}

@ -39,17 +39,22 @@ namespace Grid {
BACKTRACEFILE(); \
}\
}
int Rank(void) {
return shmem_my_pe();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////

typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;


static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;


void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
@ -60,8 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0;
}
WorldSize = shmem_n_pes();
WorldRank = shmem_my_pe();
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
shmem_barrier_all();
ShmInitGeneric();
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
@ -230,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,

if ( _processor == sender ) {

printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
// Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local);

printf("Sender receive %d posted\n",sender,receiver);

// Advance our send count
seq = ++(SendSeq->seq_local);

@ -244,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence();

printf("Sender sent payload %d\n",seq);
//Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence();
printf("Sender ringing door bell %d\n",seq);
}
if ( _processor == receiver ) {

printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
// Post a receive
seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);

printf("Receiver Opening letter box %d\n",seq);


// Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local);

printf("Receiver Got the mail %d\n",seq);
}
}

|
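With the debug printfs gone, what remains is a two-counter rendezvous: each side counts its own progress in seq_local and learns the peer's progress through puts into seq_remote; the put of a counter is the doorbell. The sketch below restates the sender half under the assumption that the HandShake records and the receive box live in symmetric memory; remote_box is an illustrative name. shmem_putmem and shmem_fence are standard OpenSHMEM, and the fence guarantees the payload put is delivered to the target before the doorbell put that follows it.

#include <shmem.h>
#include <stdint.h>
#include <stddef.h>

typedef struct HandShake_t { uint64_t seq_local; uint64_t seq_remote; } HandShake;

// Sender half of the rendezvous (sketch). `Send` is the sender's view,
// `RecvOnPeer` is the symmetric address of the receiver's HandShake record.
void doorbell_send(void *payload, void *remote_box, size_t bytes,
                   int receiver, HandShake *Send, HandShake *RecvOnPeer)
{
  while (Send->seq_remote == Send->seq_local) ; // receiver has not posted yet
  uint64_t seq = ++(Send->seq_local);           // advance our send count
  shmem_putmem(remote_box, payload, bytes, receiver);                 // payload
  shmem_fence();                                                      // order it...
  shmem_putmem(&RecvOnPeer->seq_remote, &seq, sizeof(seq), receiver); // ...before the doorbell
}

A production version would likely spin with shmem_wait_until and declare the counters volatile; the raw load loop above simply mirrors the source.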
@ -45,7 +45,7 @@ public:
  // Gather for when there is no need to SIMD split with compression
  ///////////////////////////////////////////////////////////////////
  template<class vobj,class cobj,class compressor> void
- Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
+ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
  {
    int rd = rhs._grid->_rdimensions[dimension];

@ -114,6 +114,7 @@ PARALLEL_NESTED_LOOP2
        int o = n*n1;
        int offset = b+n*n2;
        cobj temp =compress(rhs._odata[so+o+b]);
        extract<cobj>(temp,pointers,offset);
      }
@ -121,6 +122,7 @@ PARALLEL_NESTED_LOOP2
    } else {
      assert(0); //Fixme think this is buggy
      for(int n=0;n<e1;n++){
        for(int b=0;b<e2;b++){
          int o=n*rhs._grid->_slice_stride[dimension];
@ -139,7 +141,7 @@ PARALLEL_NESTED_LOOP2
  //////////////////////////////////////////////////////
  // Gather for when there is no need to SIMD split
  //////////////////////////////////////////////////////
- template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
+ template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
  {
    SimpleCompressor<vobj> dontcompress;
    Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@ -157,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
  //////////////////////////////////////////////////////
  // Scatter for when there is no need to SIMD split
  //////////////////////////////////////////////////////
- template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
+ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
  {
    int rd = rhs._grid->_rdimensions[dimension];
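All the gather/scatter entry points now take the same commVector<T> alias instead of spelling out std::vector<T,alignedAllocator<T> > at every signature, so the storage behind communication buffers is chosen in exactly one place. A plausible shape for the alias, assuming it keeps the aligned allocator for now; the exact definition is not part of this diff:

// Sketch: one alias decides how comms buffers are allocated everywhere.
// Swapping alignedAllocator for, say, a pinning/registering allocator is
// then a one-line change rather than a signature sweep like this one.
template<class T> using commVector = std::vector<T, alignedAllocator<T> >;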
@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);

  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
- std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
- std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
+ commVector<vobj> send_buf(buffer_size);
+ commVector<vobj> recv_buf(buffer_size);

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);

@ -191,8 +191,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  int words = sizeof(vobj)/sizeof(vector_type);

- std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
- std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+ std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+ std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );

  int bytes = buffer_size*sizeof(scalar_object);
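In the SIMD variant each of the Nsimd lanes gets its own scalar buffer, because a vectorised shift must ship whole lanes between ranks; only the inner per-lane payloads cross the wire, which is why only they move to commVector while the outer std::vector stays host-side bookkeeping. A self-contained sketch of the shape, with illustrative numbers and a stand-in alias:

#include <vector>

template<class T> using commVector = std::vector<T>;  // stand-in for the real alias
struct ScalarSite { double d[12]; };                  // one lane's site worth of data

int main() {
  const int Nsimd = 8;                  // lanes per vector word (example value)
  int buffer_size = 1024;               // _slice_nblock * _slice_block
  std::vector<commVector<ScalarSite> > send_buf_extract(
      Nsimd, commVector<ScalarSite>(buffer_size));
  int bytes = buffer_size * sizeof(ScalarSite);   // per-lane message size
  (void)bytes; (void)send_buf_extract;
}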
@ -65,9 +65,6 @@ public:

  class LatticeExpressionBase {};

- template<class T> using Vector = std::vector<T,alignedAllocator<T> >;                 // Aligned allocator??
- template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;  // Aligned allocator??

  template <typename Op, typename T1>
  class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
  public:
@ -164,15 +164,17 @@ PARALLEL_FOR_LOOP
  assert( l.checkerboard== l._grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

+ static const int words=sizeof(vobj)/sizeof(vector_type);
  int odx,idx;
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);

- std::vector<sobj> buf(Nsimd);
-
- extract(l._odata[odx],buf);
-
- s = buf[idx];
+ scalar_type * vp = (scalar_type *)&l._odata[odx];
+ scalar_type * pt = (scalar_type *)&s;
+
+ for(int w=0;w<words;w++){
+   pt[w] = vp[idx+w*Nsimd];
+ }

  return;
};
@ -190,18 +192,17 @@ PARALLEL_FOR_LOOP
  assert( l.checkerboard== l._grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

+ static const int words=sizeof(vobj)/sizeof(vector_type);
  int odx,idx;
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);

- std::vector<sobj> buf(Nsimd);
-
- // extract-modify-merge cycle is easiest way and this is not perf critical
- extract(l._odata[odx],buf);
-
- buf[idx] = s;
-
- merge(l._odata[odx],buf);
+ scalar_type * vp = (scalar_type *)&l._odata[odx];
+ scalar_type * pt = (scalar_type *)&s;
+
+ for(int w=0;w<words;w++){
+   vp[idx+w*Nsimd] = pt[w];
+ }

  return;
};
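The old extract/merge route materialised all Nsimd scalar lanes just to touch one; the new loop addresses lane idx directly, using the layout in which scalar word w of lane i sits at flat offset i + w*Nsimd inside the vector object. A self-contained model of that addressing, with toy types and sizes:

#include <array>
#include <cstdio>

int main() {
  const int N = 4, W = 3;              // lanes, words per site object
  std::array<float, N * W> vobj{};     // stand-in for the vectorised object
  for (int w = 0; w < W; w++)          // fill: lane i holds 100*i + w
    for (int i = 0; i < N; i++) vobj[w * N + i] = 100.0f * i + w;

  int idx = 2;                         // "peek" lane 2: a strided gather
  float s[W];
  for (int w = 0; w < W; w++) s[w] = vobj[idx + w * N];
  std::printf("%g %g %g\n", s[0], s[1], s[2]);   // prints 200 201 202
}

Poke is the same loop with the assignment reversed, exactly as in the hunk above.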
@ -297,8 +297,9 @@ namespace Grid {

  int l_idx=generator_idx(o_idx,i_idx);

- std::vector<int> site_seeds(4);
- for(int i=0;i<4;i++){
+ const int num_rand_seed=16;
+ std::vector<int> site_seeds(num_rand_seed);
+ for(int i=0;i<site_seeds.size();i++){
    site_seeds[i]= ui(pseeder);
  }
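Going from 4 to 16 seed words per site gives each site engine a wider seed, and naming the constant removes the magic number from the loop bound. As an illustration of consuming such a word vector (not necessarily the seeding path Grid itself uses), std::seed_seq mixes the words into an engine's full state:

#include <random>
#include <vector>

std::mt19937 make_site_engine(std::mt19937 &pseeder) {
  std::uniform_int_distribution<int> ui;
  const int num_rand_seed = 16;                 // was 4 before this change
  std::vector<int> site_seeds(num_rand_seed);
  for (std::size_t i = 0; i < site_seeds.size(); i++) site_seeds[i] = ui(pseeder);
  std::seed_seq sseq(site_seeds.begin(), site_seeds.end());
  return std::mt19937(sseq);                    // per-site engine
}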
@ -33,7 +33,6 @@ directory
#define  GRID_QCD_FERMION_OPERATOR_IMPL_H

namespace Grid {
namespace QCD {

@ -108,13 +107,14 @@ namespace Grid {
  INHERIT_GIMPL_TYPES(Base) \
  INHERIT_FIMPL_TYPES(Base)

- ///////
+ /////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index
- ///////
+ /////////////////////////////////////////////////////////////////////////////
  template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
- class WilsonImpl
-     : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+ class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
  public:

  static const int Dimension = Representation::Dimension;
  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;

@ -124,7 +124,6 @@ namespace Grid {
  const bool LsVectorised=false;
  typedef _Coeff_t Coeff_t;

  INHERIT_GIMPL_TYPES(Gimpl);

  template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;

@ -158,8 +157,7 @@ namespace Grid {
  }

  template <class ref>
- inline void loadLinkElement(Simd &reg,
-                             ref &memory) {
+ inline void loadLinkElement(Simd &reg, ref &memory) {
    reg = memory;
  }

@ -202,9 +200,10 @@ namespace Grid {
    }
  };

- ///////
+ ////////////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index, 5d redblack
- ///////
+ ////////////////////////////////////////////////////////////////////////////////////

  template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
  class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
  public:

@ -227,12 +226,9 @@ namespace Grid {
  typedef Lattice<SiteSpinor> FermionField;

  // Make the doubled gauge field a *scalar*
- typedef iImplDoubledGaugeField<typename Simd::scalar_type>
-     SiteDoubledGaugeField;  // This is a scalar
- typedef iImplGaugeField<typename Simd::scalar_type>
-     SiteScalarGaugeField;  // scalar
- typedef iImplGaugeLink<typename Simd::scalar_type>
-     SiteScalarGaugeLink;  // scalar
+ typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
+ typedef iImplGaugeField<typename Simd::scalar_type>        SiteScalarGaugeField;  // scalar
+ typedef iImplGaugeLink<typename Simd::scalar_type>         SiteScalarGaugeLink;   // scalar

  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;

@ -250,6 +246,7 @@ namespace Grid {
  inline void loadLinkElement(Simd &reg, ref &memory) {
    vsplat(reg, memory);
  }

  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
                       StencilImpl &St) {
@ -262,8 +259,8 @@ namespace Grid {
    mult(&phi(), &UU(), &chi());
  }

- inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
-                         const GaugeField &Umu) {
+ inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
+ {
    SiteScalarGaugeField ScalarUmu;
    SiteDoubledGaugeField ScalarUds;

@ -289,13 +286,13 @@ namespace Grid {
      }
    }

- inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
-                           FermionField &A, int mu) {
+ inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
+ {
    assert(0);
  }

- inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
-                           FermionField &Atilde, int mu) {
+ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu)
+ {
    assert(0);
  }
};

@ -305,9 +302,9 @@ namespace Grid {
  ////////////////////////////////////////////////////////////////////////////////////////

  template <class S, int Nrepresentation,class _Coeff_t = RealD>
- class GparityWilsonImpl
-     : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
  public:

  static const int Dimension = Nrepresentation;

  const bool LsVectorised=false;

@ -317,15 +314,9 @@ namespace Grid {

  INHERIT_GIMPL_TYPES(Gimpl);

- template <typename vtype>
- using iImplSpinor =
-     iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
- template <typename vtype>
- using iImplHalfSpinor =
-     iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
- template <typename vtype>
- using iImplDoubledGaugeField =
-     iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;

  typedef iImplSpinor<Simd> SiteSpinor;
  typedef iImplHalfSpinor<Simd> SiteHalfSpinor;

@ -341,7 +332,6 @@ namespace Grid {

  ImplParams Params;

  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};

  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };

@ -351,6 +341,7 @@ namespace Grid {
  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
                       StencilImpl &St) {

  typedef SiteHalfSpinor vobj;
  typedef typename SiteHalfSpinor::scalar_object sobj;

@ -419,7 +410,6 @@ namespace Grid {

  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds._grid,GaugeGrid);
    conformable(Umu._grid,GaugeGrid);

@ -429,7 +419,6 @@ namespace Grid {

    Lattice<iScalar<vInteger> > coor(GaugeGrid);

    for(int mu=0;mu<Nd;mu++){

      LatticeCoordinate(coor,mu);

@ -443,7 +432,6 @@ namespace Grid {
      Uconj = where(coor==neglink,-Uconj,Uconj);
    }

PARALLEL_FOR_LOOP
    for(auto ss=U.begin();ss<U.end();ss++){
      Uds[ss](0)(mu) = U[ss]();

@ -477,8 +465,8 @@ namespace Grid {
  }

- inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
-                           FermionField &A, int mu) {
+ inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
    // DhopDir provides U or Uconj depending on coor/flavour.
    GaugeLinkField link(mat._grid);
    // use lorentz for flavour as hack.

@ -491,8 +479,8 @@ namespace Grid {
    return;
  }

- inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
-                           FermionField &Atilde, int mu) {
+ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls = Btilde._grid->_fdimensions[0];

    GaugeLinkField tmp(mat._grid);

@ -508,13 +496,13 @@ namespace Grid {
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }

};

typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;  // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double

typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR;  // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF;  // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD;  // Double

@ -538,6 +526,7 @@ namespace Grid {
typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
- }
- }
+ }}

#endif
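WilsonImpl, DomainWallVec5dImpl and GparityWilsonImpl are policy classes: the fermion operators are written once against the typedefs and hooks an Impl provides (SiteSpinor, FermionField, multLink, DoubleStore, the force insertions), and each Impl decides representation, boundary conditions and layout. A stripped-down model of the pattern, with simplified, illustrative names:

#include <vector>
#include <complex>
#include <cstddef>

struct ToyPeriodicImpl {
  typedef std::vector<std::complex<double> > FermionField;  // stand-in type
  void multLink(std::complex<double> &phi, const std::complex<double> &U,
                const std::complex<double> &chi) { phi = U * chi; }
};

template <class Impl>
class FermionOp : public Impl {      // the operator inherits the policy's hooks
 public:
  typedef typename Impl::FermionField FermionField;
  void Dhop(const FermionField &in, FermionField &out) {
    for (std::size_t s = 0; s < in.size(); s++)
      this->multLink(out[s], std::complex<double>(1, 0), in[s]);  // unit links
  }
};

// FermionOp<ToyPeriodicImpl> op;  // swap the Impl to change physics, not code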
@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  ////////////////////////
PARALLEL_FOR_LOOP
  for (int sss = 0; sss < B._grid->oSites(); sss++) {
-   Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu,
+   Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
                             gamma);
  }

@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,

PARALLEL_FOR_LOOP
  for (int sss = 0; sss < in._grid->oSites(); sss++) {
-   Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out,
+   Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
                             dirdisp, gamma);
  }
};

@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
  if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
-     Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
+     Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                   out);
    }
  } else {
PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
-     Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
+     Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                out);
    }
  }
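Every call site changes from the public data member st.comm_buf to the accessor st.CommBuf(). The gain is encapsulation: the stencil can change how the halo buffer is stored without another sweep like this one, and an accessor can hand back a raw pointer, which is what the kernel signature change later in this compare expects. A sketch of the accessor shape, with illustrative member and type names:

#include <vector>

struct HalfSpinorSite { float v[24]; };        // stand-in for SiteHalfSpinor

class StencilSketch {
  std::vector<HalfSpinorSite> comm_buf;        // current backing store
 public:
  // Callers depend only on "a pointer to halo data", not on the container;
  // a later version could return pinned or device memory from here.
  HalfSpinorSite *CommBuf(void) { return comm_buf.data(); }
};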
@ -185,18 +185,14 @@ void WilsonFermion5D<Impl>::Report(void)
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls << std::endl;
-   std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
-             << " us" << std::endl;
-   std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : "
-             << DhopCommTime / DhopCalls << " us" << std::endl;
-   std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : "
-             << DhopComputeTime << " us" << std::endl;
-   std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : "
-             << DhopComputeTime / DhopCalls << " us" << std::endl;
+   std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
+   std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " << DhopCommTime / DhopCalls << " us" << std::endl;
+   std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " << DhopComputeTime << " us" << std::endl;
+   std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;

-   RealD mflops = 1344*volume*DhopCalls/DhopComputeTime;
+   RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-   std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
+   std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }

@ -210,12 +206,9 @@ void WilsonFermion5D<Impl>::Report(void)
  std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl;
  std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;

  RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
  }

  if (DerivCalls > 0 || DhopCalls > 0){
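Two accounting fixes land here: DhopCalls is now incremented by the public entry points (once per half-checkerboard call, twice for a full Dhop, as the hunks just below show), and the rate formula divides by two because each counted call only works half the sites. A toy check of the corrected formula, with made-up numbers:

#include <cstdio>

int main() {
  double volume = 16.0*16*16*32;                     // toy 4d site count
  double DhopCalls = 1000, DhopComputeTime = 2.0e6;  // toy values, time in us
  double mflops = 1344*volume*DhopCalls/DhopComputeTime/2;  // 2 for red black
  std::printf("Average mflops/s per call : %g\n", mflops);  // ~44040 here
  return 0;
}

Flops per microsecond is the same number as Mflop/s, which is why no unit factor appears in the expression.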
@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
|
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
|||||||
assert(sF < B._grid->oSites());
|
assert(sF < B._grid->oSites());
|
||||||
assert(sU < U._grid->oSites());
|
assert(sU < U._grid->oSites());
|
||||||
|
|
||||||
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu,
|
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
|
||||||
gamma);
|
|
||||||
|
|
||||||
////////////////////////////
|
////////////////////////////
|
||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
@ -396,7 +388,6 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
DhopCalls++;
|
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
@ -413,8 +404,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
int sU = ss;
|
int sU = ss;
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
|
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
|
||||||
out);
|
|
||||||
}
|
}
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
} else if (stat.is_init() ) {
|
} else if (stat.is_init() ) {
|
||||||
@ -428,11 +418,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
int mythread = omp_get_thread_num();
|
int mythread = omp_get_thread_num();
|
||||||
stat.enter(mythread);
|
stat.enter(mythread);
|
||||||
#pragma omp for nowait
|
#pragma omp for nowait
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++)
|
for(int ss=0;ss<U._grid->oSites();ss++) {
|
||||||
{
|
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=LLs*sU;
|
int sF=LLs*sU;
|
||||||
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||||
}
|
}
|
||||||
stat.exit(mythread);
|
stat.exit(mythread);
|
||||||
}
|
}
|
||||||
@ -443,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
int sU = ss;
|
int sU = ss;
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
|
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||||
out);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
DhopComputeTime+=usecond();
|
||||||
@ -454,6 +442,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls++;
|
||||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||||
conformable(in._grid,out._grid); // drops the cb check
|
conformable(in._grid,out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -465,6 +454,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls++;
|
||||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||||
conformable(in._grid,out._grid); // drops the cb check
|
conformable(in._grid,out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -476,6 +466,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=2;
|
||||||
conformable(in._grid,FermionGrid()); // verifies full grid
|
conformable(in._grid,FermionGrid()); // verifies full grid
|
||||||
conformable(in._grid,out._grid);
|
conformable(in._grid,out._grid);
|
||||||
|
|
||||||
|
@ -34,7 +34,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Stat.h>

namespace Grid {
namespace QCD {

  ////////////////////////////////////////////////////////////////////////////////

@ -182,7 +181,7 @@ namespace Grid {
  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

  };
- }
- }
+ }}

#endif
@ -43,9 +43,8 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
////////////////////////////////////////////

template <class Impl>
- void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
-     int sU, const FermionField &in, FermionField &out) {
+ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                                                      SiteHalfSpinor *buf, int sF,
+                                                      int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;

@ -220,9 +219,8 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(

// Need controls to do interior, exterior, or both
template <class Impl>
- void WilsonKernels<Impl>::DiracOptGenericDhopSite(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
-     int sU, const FermionField &in, FermionField &out) {
+ void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                                                   SiteHalfSpinor *buf, int sF,
+                                                   int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;

@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
};

template <class Impl>
- void WilsonKernels<Impl>::DiracOptDhopDir(
-     StencilImpl &st, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
-     int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
+ void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
+                                            int sU, const FermionField &in, FermionField &out, int dir, int gamma) {

  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteSpinor result;
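All the kernel signatures move from a std::vector<SiteHalfSpinor,...>& to a bare SiteHalfSpinor*. The kernels only ever index the halo buffer, so a raw pointer both matches what the asm variants want and frees the stencil to back the buffer with any contiguous storage. A minimal model, with an illustrative site type:

#include <vector>

struct HalfSpinorLike { float v[24]; };  // illustrative stand-in

// The kernel body only indexes buf; it has no business knowing the container.
void dhop_site(const HalfSpinorLike *buf, int offset) {
  // chi = buf[offset]; ... is all any kernel does with the halo buffer
  (void)buf; (void)offset;
}

int main() {
  std::vector<HalfSpinorLike> comm_buf(128);  // today's backing store
  dhop_site(comm_buf.data(), 0);              // same kernel serves a vector...
  static HalfSpinorLike raw[128];
  dhop_site(raw, 0);                          // ...or any other contiguous buffer
}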
@ -32,7 +32,6 @@ directory
#define  GRID_QCD_DHOP_H

namespace Grid {
namespace QCD {

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@ -56,16 +55,11 @@ namespace Grid {

  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
- DiracOptDhopSite(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, int Ls, int Ns, const FermionField &in,
-     FermionField &out) {
+ DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                  int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
#ifdef AVX512
    if (AsmOpt) {
-     WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
-                                              in, out);
+     WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
#else
    {
@ -73,11 +67,9 @@ namespace Grid {
      for (int site = 0; site < Ns; site++) {
        for (int s = 0; s < Ls; s++) {
          if (HandOpt)
-           WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
-                                                     in, out);
+           WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
          else
-           WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
-                                                        in, out);
+           WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
          sF++;
        }
        sU++;
@ -87,15 +79,12 @@ namespace Grid {

  template <bool EnableBool = true>
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
- DiracOptDhopSite(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, int Ls, int Ns, const FermionField &in,
-     FermionField &out) {
+ DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                  int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-       WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
-                                                    out);
+       WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
        sF++;
      }
      sU++;
@ -103,17 +92,12 @@ namespace Grid {
  }

  template <bool EnableBool = true>
- typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
-                         void>::type
- DiracOptDhopSiteDag(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, int Ls, int Ns, const FermionField &in,
-     FermionField &out) {
+ typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
+ DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                     int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
#ifdef AVX512
    if (AsmOpt) {
-     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
-                                                 Ns, in, out);
+     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
#else
    {
@ -121,11 +105,9 @@ namespace Grid {
      for (int site = 0; site < Ns; site++) {
        for (int s = 0; s < Ls; s++) {
          if (HandOpt)
-           WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
-                                                        in, out);
+           WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
          else
-           WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
-                                                           sU, in, out);
+           WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
          sF++;
        }
        sU++;
@ -134,73 +116,48 @@ namespace Grid {
  }

  template <bool EnableBool = true>
- typename std::enable_if<
-     (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
-     void>::type
- DiracOptDhopSiteDag(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, int Ls, int Ns, const FermionField &in,
-     FermionField &out) {
+ typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
+ DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
+                     int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-       WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
-                                                       in, out);
+       WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
        sF++;
      }
      sU++;
    }
  }

- void DiracOptDhopDir(
-     StencilImpl &st, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
-     int gamma);
+ void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
+                      int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);

 private:
  // Specialised variants
- void DiracOptGenericDhopSite(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, const FermionField &in, FermionField &out);
+ void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                              int sF, int sU, const FermionField &in, FermionField &out);

- void DiracOptGenericDhopSiteDag(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, const FermionField &in, FermionField &out);
+ void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                                 int sF, int sU, const FermionField &in, FermionField &out);

- void DiracOptAsmDhopSite(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, int Ls, int Ns, const FermionField &in,
-     FermionField &out);
+ void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                          int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);

- void DiracOptAsmDhopSiteDag(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, int Ls, int Ns, const FermionField &in,
-     FermionField &out);
+ void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                             int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);

- void DiracOptHandDhopSite(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, const FermionField &in, FermionField &out);
+ void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                           int sF, int sU, const FermionField &in, FermionField &out);

- void DiracOptHandDhopSiteDag(
-     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-     int sF, int sU, const FermionField &in, FermionField &out);
+ void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+                              int sF, int sU, const FermionField &in, FermionField &out);

 public:

  WilsonKernels(const ImplParams &p = ImplParams());

};

- }
- }
+ }}

#endif
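The dispatch in this header is SFINAE on the Impl: when Impl::Dimension == 3 and Nc == 3 the overload that can route to asm or hand-optimised kernels is selected, anything else compiles only the generic fallback. The dummy EnableBool parameter keeps the enable_if condition dependent on a parameter of the function template itself, which is what lets two mutually exclusive overloads coexist in one class. A minimal model:

#include <type_traits>
#include <cstdio>

template <int Dimension>
struct Kernels {
  template <bool EnableBool = true>
  static typename std::enable_if<Dimension == 3 && EnableBool, void>::type
  DhopSite() { std::printf("optimised Nc=3 path\n"); }

  template <bool EnableBool = true>
  static typename std::enable_if<Dimension != 3 && EnableBool, void>::type
  DhopSite() { std::printf("generic path\n"); }
};

int main() {
  Kernels<3>::DhopSite();   // picks the optimised overload
  Kernels<5>::DhopSite();   // picks the generic overload
}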
@ -38,26 +38,22 @@ namespace Grid {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
- template<class Impl>
- void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                                std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-                                                int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
- {
-   assert(0);
- }
- template<class Impl>
- void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-                                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
- {
-   assert(0);
- }
+ template<class Impl> void
+ WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+                                           int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+ {
+   assert(0);
+ }
+
+ template<class Impl> void
+ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+                                              int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+ {
+   assert(0);
+ }

#if defined(AVX512)

///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////

@ -84,16 +80,14 @@ namespace Grid {
#define FX(A) WILSONASM_ ##A

#undef KERNEL_DAG
- template<>
- void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                                      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+ template<> void
+ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
                                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

#define KERNEL_DAG
- template<>
- void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                                         std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+ template<> void
+ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
                                                     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

@ -109,31 +103,26 @@ namespace Grid {
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)

#undef KERNEL_DAG
- template<>
- void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                                               std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+ template<> void
+ WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
                                                           int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

#define KERNEL_DAG
- template<>
- void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                                                  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+ template<> void
+ WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
                                                              int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

#endif

#define INSTANTIATE_ASM(A)\
- template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                                     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
+ template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                                      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
- template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                                        std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
+ template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                                         int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\

INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD);
INSTANTIATE_ASM(ZWilsonImplF);
@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF);
INSTANTIATE_ASM(DomainWallVec5dImplD);
INSTANTIATE_ASM(ZDomainWallVec5dImplF);
INSTANTIATE_ASM(ZDomainWallVec5dImplD);
- }
- }
+ }}
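INSTANTIATE_ASM exists because these definitions live in a .cc file: templates only get code generated for the Impls explicitly instantiated in that translation unit, and each new Impl has to be added to the list by hand. A minimal model of the explicit-instantiation pattern, with illustrative names:

// Sketch of the pattern behind INSTANTIATE_ASM.
template <class Impl>
struct Kernels {
  static void AsmDhopSite() { /* ... definition lives in this .cc ... */ }
};

#define INSTANTIATE_K(A) template void Kernels<A>::AsmDhopSite();

struct WilsonLikeF {};  // illustrative Impl tags
struct WilsonLikeD {};
INSTANTIATE_K(WilsonLikeF)   // emits Kernels<WilsonLikeF>::AsmDhopSite here
INSTANTIATE_K(WilsonLikeD)   // forgetting one shows up as a link error, not a compile error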
@ -311,9 +311,8 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> void
|
||||||
void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -555,8 +554,7 @@ namespace QCD {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
// std::cout << "Hand op Dhop "<<std::endl;
|
// std::cout << "Hand op Dhop "<<std::endl;
|
||||||
@ -798,37 +796,34 @@ namespace QCD {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Specialise Gparity to simple implementation
|
// Specialise Gparity to simple implementation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
template<>
|
template<> void
|
||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
SiteHalfSpinor *buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<> void
|
||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
SiteHalfSpinor *buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<> void
|
||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<> void
|
||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
@@ -840,11 +835,9 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 // Need Nc=3 though //

 #define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-                                                     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
                                                      int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-                                                        std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
                                                         int ss,int sU,const FermionField &in, FermionField &out);

 INSTANTIATE_THEM(WilsonImplF);
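Throughout this file the halo-exchange buffer argument changes from a reference to one concrete std::vector type to a raw SiteHalfSpinor pointer, decoupling the kernel signatures from the allocator that owns the storage. A sketch of what this means at a call site; buffer_size and the surrounding setup are assumptions, only the signature shapes come from the diff:

  // Illustrative call-site only: the kernel no longer cares how the buffer
  // was allocated.
  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf(buffer_size);
  // old interface baked the vector/allocator type into the signature:
  //   WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,comm_buf,sF,sU,in,out);
  // new interface takes the underlying storage, so any aligned allocation works:
  WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,&comm_buf[0],sF,sU,in,out);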
@@ -42,6 +42,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid{
 namespace Optimization {

+  union u512f {
+    __m512 v;
+    float f[16];
+  };
+
+  union u512d {
+    __m512d v;
+    double f[8];
+  };
+
   struct Vsplat{
     //Complex float
     inline __m512 operator()(float a, float b){
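The two unions added at the top of the Optimization namespace give the reductions further down scalar access to the individual lanes of an AVX-512 register. A standalone illustration of the idiom, assuming an AVX-512 capable compiler (e.g. -mavx512f); GCC and Clang accept this union punning as an extension, though strict ISO C++ leaves it undefined:

  #include <immintrin.h>
  #include <cstdio>

  union u512f {
    __m512 v;      // vector view of the register
    float  f[16];  // scalar view of the same 64 bytes
  };

  int main(void) {
    u512f conv;
    conv.v = _mm512_set1_ps(2.5f);   // splat 2.5f into all sixteen lanes
    std::printf("lane0=%g lane15=%g\n", conv.f[0], conv.f[15]);
    return 0;
  }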
@@ -360,6 +370,72 @@ namespace Optimization {
   //////////////////////////////////////////////
   // Some Template specialization

+  // Hack for CLANG until _mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
+  // (an unresolved HEAD/develop merge conflict here is resolved to the develop-side
+  //  __INTEL_COMPILER guard, which is what the matching #else/#endif below expect)
+#ifndef __INTEL_COMPILER
+#warning "Slow reduction due to incomplete reduce intrinsics"
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
+    __m512 v1,v2;
+    v1 = Optimization::Permute::Permute0(in); // avx 512; eight complex singles
+    v1 = _mm512_add_ps(v1,in);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = _mm512_add_ps(v1,v2);
+    v2 = Optimization::Permute::Permute2(v1);
+    v1 = _mm512_add_ps(v1,v2);
+    u512f conv; conv.v = v1;
+    return Grid::ComplexF(conv.f[0],conv.f[1]);
+  }
+
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
+    __m512 v1,v2;
+    v1 = Optimization::Permute::Permute0(in); // avx 512; sixteen singles
+    v1 = _mm512_add_ps(v1,in);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = _mm512_add_ps(v1,v2);
+    v2 = Optimization::Permute::Permute2(v1);
+    v1 = _mm512_add_ps(v1,v2);
+    v2 = Optimization::Permute::Permute3(v1);
+    v1 = _mm512_add_ps(v1,v2);
+    u512f conv; conv.v = v1;
+    return conv.f[0];
+  }
+
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
+    __m512d v1,v2;
+    v1 = Optimization::Permute::Permute0(in); // avx 512; four complex doubles
+    v1 = _mm512_add_pd(v1,in);
+    v2 = Optimization::Permute::Permute1(v1); // was Permute1(in)/add(v1,in): the second stage must fold the stage-one partial sums
+    v1 = _mm512_add_pd(v1,v2);
+    u512d conv; conv.v = v1;
+    return Grid::ComplexD(conv.f[0],conv.f[1]);
+  }
+
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
+    __m512d v1,v2;
+    v1 = Optimization::Permute::Permute0(in); // avx 512; eight doubles
+    v1 = _mm512_add_pd(v1,in);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = _mm512_add_pd(v1,v2);
+    v2 = Optimization::Permute::Permute2(v1);
+    v1 = _mm512_add_pd(v1,v2);
+    u512d conv; conv.v = v1;
+    return conv.f[0];
+  }
+#else
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
@@ -371,7 +447,6 @@ namespace Optimization {
     return _mm512_reduce_add_ps(in);
   }
-

   //Complex double Reduce
   template<>
   inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
@@ -391,6 +466,7 @@ namespace Optimization {
     printf("Reduce : Missing integer implementation -> FIX\n");
     assert(0);
   }
+#endif


 }
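Each Reduce specialisation above is a log2(N) butterfly: permute the register so elements one stride apart line up, add, then halve the stride, until lane 0 holds the full sum (a complex reduction stops one stage early so lanes 0 and 1 keep the real and imaginary parts). The same ladder written over a plain array, runnable anywhere:

  #include <cstdio>

  // Tree-reduce 16 floats exactly as the permute/add ladder does: stride 8,
  // then 4, 2, 1; after the last stage v[0] holds the full sum.
  float tree_reduce16(float v[16]) {
    for (int stride = 8; stride >= 1; stride /= 2)
      for (int i = 0; i < stride; i++)
        v[i] += v[i + stride];
    return v[0];
  }

  int main(void) {
    float v[16];
    for (int i = 0; i < 16; i++) v[i] = 1.0f + i;   // 1..16, sum = 136
    std::printf("sum = %g\n", tree_reduce16(v));
    return 0;
  }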
@@ -250,8 +250,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   }
 }

-template<class vobj> inline
-void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
+template<class vobj> inline void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 {
   typedef typename vobj::scalar_type  scalar_type ;
   typedef typename vobj::vector_type  vector_type ;
@@ -269,8 +268,7 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   }}
 }

-template<class vobj> inline
-void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
+template<class vobj> inline void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 {
   typedef typename vobj::scalar_type  scalar_type ;
   typedef typename vobj::vector_type  vector_type ;
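merge1 and merge2 interleave one scalar object per SIMD lane into a single vectorised object; the change here only pulls their declarations onto one line. For reference, the word/lane interleave they implement, shown as a plain-array analogy rather than Grid's actual template code:

  #include <cstdio>

  // Merge Nsimd scalar objects of Nelem words each into a vectorised layout:
  // word w of lane l lands at out[w*Nsimd + l].
  void merge_lanes(float *out, float **scalars, int Nelem, int Nsimd) {
    for (int w = 0; w < Nelem; w++)
      for (int l = 0; l < Nsimd; l++)
        out[w*Nsimd + l] = scalars[l][w];
  }

  int main(void) {
    float a[2] = {1,2}, b[2] = {3,4};   // two "scalar objects", two words each
    float *in[2] = {a,b};
    float out[4];
    merge_lanes(out,in,2,2);
    for (int i = 0; i < 4; i++) std::printf("%g ",out[i]);  // prints: 1 3 2 4
    std::printf("\n");
    return 0;
  }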
@@ -116,7 +116,7 @@ int main (int argc, char ** argv)
     else if (SE->_is_local)
       Check._odata[i] = Foo._odata[SE->_offset];
     else
-      Check._odata[i] = myStencil.comm_buf[SE->_offset];
+      Check._odata[i] = myStencil.CommBuf()[SE->_offset];
   }

   Real nrmC = norm2(Check);
@@ -207,7 +207,7 @@ int main (int argc, char ** argv)
     else if (SE->_is_local)
       OCheck._odata[i] = EFoo._odata[SE->_offset];
     else
-      OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
+      OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
   }
   for(int i=0;i<ECheck._grid->oSites();i++){
     int permute_type;
@@ -220,7 +220,7 @@ int main (int argc, char ** argv)
     else if (SE->_is_local)
       ECheck._odata[i] = OFoo._odata[SE->_offset];
     else
-      ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
+      ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
   }

   setCheckerboard(Check,ECheck);
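The test now reaches the stencil's communication buffer through the CommBuf() accessor rather than the public comm_buf member, so the stencil class is free to change how that storage is held. A sketch of the accessor pattern being relied on; the class body is illustrative, not Grid's actual stencil:

  #include <vector>
  // Callers index the halo buffer only through CommBuf(), so the owning class
  // may move the storage (vector, pinned allocation, ...) without breaking them.
  template<class SiteHalfSpinor, class Alloc>
  class StencilSketch {                               // illustrative stand-in
    std::vector<SiteHalfSpinor, Alloc> u_comm_buf;    // hypothetical member name
  public:
    SiteHalfSpinor *CommBuf(void) { return &u_comm_buf[0]; }
  };
  // usage, exactly as in the updated test:
  //   Check._odata[i] = myStencil.CommBuf()[SE->_offset];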