mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Reduce now going through MPI.

Peter Boyle 2015-04-14 22:40:40 +01:00
parent f1876b7e95
commit ab9a764bb1
12 changed files with 123 additions and 74 deletions

View File

@@ -42,13 +42,31 @@ class CartesianCommunicator {
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(float &);
void GlobalSumVector(float *,int N);
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(double &);
void GlobalSumVector(double *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
template<class obj> void GlobalSumObj(obj &o){
void GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
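
The new declarations funnel every reduction through the two real GlobalSumVector primitives: a ComplexF is summed as two floats, a ComplexD as two doubles, and the generic template (cut off at the end of this hunk) decomposes an arbitrary object into sizeof(obj)/sizeof(scalar_type) scalar words. A minimal stand-alone sketch of that word-wise pattern, with no-op single-rank stand-ins for the MPI-backed primitives and a hypothetical completion of the truncated template body:

#include <complex>
#include <iostream>

// Single-rank stand-ins for the MPI-backed primitives declared above,
// so this sketch compiles and runs without MPI; they are intentionally no-ops.
void GlobalSumVector(float  *f,int N) { (void)f; (void)N; }
void GlobalSumVector(double *d,int N) { (void)d; (void)N; }

// Complex sums reuse the real vector sums by viewing each complex number
// as two contiguous reals, exactly as the inline bodies in the header do.
void GlobalSum(std::complex<float>  &c) { GlobalSumVector(reinterpret_cast<float  *>(&c),2); }
void GlobalSum(std::complex<double> &c) { GlobalSumVector(reinterpret_cast<double *>(&c),2); }

// Hypothetical completion of the truncated generic template: decompose an
// arbitrary object into words copies of its scalar_type and reduce those.
template<class obj> void GlobalSumGeneric(obj &o){
  typedef typename obj::scalar_type scalar_type;
  int words = sizeof(obj)/sizeof(scalar_type);
  GlobalSumVector(reinterpret_cast<scalar_type *>(&o),words);
}

int main(){
  std::complex<double> z(1.0,2.0);
  GlobalSum(z);                      // unchanged on a single rank
  std::cout << z << std::endl;
  return 0;
}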

View File

@@ -407,18 +407,83 @@ public:
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline RealD norm2(const Lattice<vobj> &arg){
typedef typename vobj::scalar_type scalar;
decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
for(int ss=0;ss<arg._grid->oSites(); ss++){
vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
}
nrm = Reduce(TensorRemove(vnrm));
arg._grid->GlobalSum(nrm);
return real(nrm);
}
template<class vobj>
inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
{
typedef typename vobj::scalar_type scalar;
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
for(int ss=0;ss<left._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
nrm = Reduce(TensorRemove(vnrm));
right._grid->GlobalSum(nrm);
return nrm;
}
/////////////////////////////////////////////////////
// Non site reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
}
#endif
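
Both norm2 and innerProduct above follow the same three-stage reduction: accumulate a vectorised partial sum over the local outer sites, collapse the SIMD lanes with Reduce(TensorRemove(...)), then call GlobalSum so every rank holds the lattice-wide result. A toy, single-process sketch of that pipeline; vComplex, latticeInnerProduct and the no-op GlobalSum here are illustrative stand-ins rather than Grid types:

#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

// Toy 4-lane "SIMD" complex vector standing in for Grid's vComplex types.
struct vComplex { std::complex<double> lane[4]; };

// Lane-wise conj(l)*r, the same contraction the vComplexD overload performs.
vComplex innerProduct(const vComplex &l,const vComplex &r){
  vComplex out;
  for(int i=0;i<4;i++) out.lane[i] = std::conj(l.lane[i])*r.lane[i];
  return out;
}

// Stage 2: collapse the SIMD lanes to one scalar (Reduce(TensorRemove(...))).
std::complex<double> Reduce(const vComplex &v){
  std::complex<double> s = 0;
  for(int i=0;i<4;i++) s += v.lane[i];
  return s;
}

// Stage 3: rank-wide sum; a no-op here, MPI_Allreduce in the real code.
void GlobalSum(std::complex<double> &) {}

std::complex<double> latticeInnerProduct(const std::vector<vComplex> &left,
                                         const std::vector<vComplex> &right){
  vComplex vnrm{};                                  // per-lane partial sums
  for(std::size_t ss=0; ss<left.size(); ss++){      // stage 1: loop over outer sites
    vComplex t = innerProduct(left[ss],right[ss]);
    for(int i=0;i<4;i++) vnrm.lane[i] += t.lane[i];
  }
  std::complex<double> nrm = Reduce(vnrm);          // stage 2: sum the SIMD lanes
  GlobalSum(nrm);                                   // stage 3: sum across ranks
  return nrm;
}

int main(){
  std::vector<vComplex> v(8), w(8);
  for(std::size_t ss=0; ss<v.size(); ss++)
    for(int i=0;i<4;i++){ v[ss].lane[i]={1,1}; w[ss].lane[i]={1,0}; }
  std::cout << latticeInnerProduct(v,w) << std::endl;  // 32 lanes of conj(1+i)*1 = (32,-32)
  return 0;
}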

View File

@@ -57,41 +57,6 @@ namespace QCD {
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vColourVector> LatticeColourVector;
// localNorm2,
template<class tt>
inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
{
LatticeComplex ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=trace(adj(rhs)*rhs);
}
return ret;
}
// localInnerProduct
template<class tt>
inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
{
LatticeComplex ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// FIXME for debug; deprecate this
inline void LatticeCoordinate(LatticeInteger &l,int mu){
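
What disappears here is only duplication: the QCD-specific localNorm2 computed trace(adj(x)*x) per site, while the generic replacement computes innerProduct(x,x) = sum_ij conj(x_ij)*x_ij, and the two coincide for vectors and matrices alike, so LatticeColourVector and friends can use the generic templates directly. A small self-contained check of that identity on a toy 2x2 complex matrix (illustrative types, not Grid's):

#include <array>
#include <complex>
#include <iostream>

typedef std::complex<double> Cplx;
typedef std::array<std::array<Cplx,2>,2> Mat;

// trace(adj(x)*x): the contraction the removed QCD localNorm2 performed.
Cplx traceAdjTimes(const Mat &x){
  Cplx t = 0;
  for(int i=0;i<2;i++)
    for(int k=0;k<2;k++)
      t += std::conj(x[k][i])*x[k][i];   // (adj(x)*x)_{ii}, summed over i
  return t;
}

// innerProduct(x,x): the contraction the generic replacement performs.
Cplx innerProduct(const Mat &x,const Mat &y){
  Cplx t = 0;
  for(int i=0;i<2;i++)
    for(int j=0;j<2;j++)
      t += std::conj(x[i][j])*y[i][j];
  return t;
}

int main(){
  Mat m;
  m[0][0]={1,2}; m[0][1]={3,-1};
  m[1][0]={0,1}; m[1][1]={2,2};
  std::cout << traceAdjTimes(m) << " == " << innerProduct(m,m) << std::endl;  // both (24,0)
  return 0;
}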

View File

@@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
}
void CartesianCommunicator::GlobalSum(float &f){
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSum(double &d)
{
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
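
The substantive fix in this file is the aliasing rule: MPI_Allreduce must not be given the same pointer for its send and receive buffers unless the send side is MPI_IN_PLACE, so the old (&f,&f,...) calls were non-conforming and some MPI implementations reject them at runtime. A minimal stand-alone illustration of the corrected in-place reduction (plain MPI, independent of Grid):

#include <mpi.h>
#include <cstdio>

// Minimal illustration of the in-place reduction used above: every rank
// contributes its value and all ranks receive the sum, reusing one buffer.
int main(int argc, char **argv){
  MPI_Init(&argc,&argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  double f = 1.0 + rank;                 // per-rank contribution
  // The receive buffer doubles as the send buffer, so MPI_IN_PLACE is
  // required; passing (&f,&f,...) as before violates the aliasing rules.
  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);

  printf("rank %d sees global sum %f\n",rank,f);
  MPI_Finalize();
  return 0;
}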

View File

@@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
{
int rd = rhs._grid->_rdimensions[dimension];
printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
@@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
buffer[bo++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];

View File

@@ -903,38 +903,38 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
///////////////////////////////////////////////////////////////////////////////////////
// localInnerProduct Scalar x Scalar -> Scalar
// localInnerProduct Vector x Vector -> Scalar
// localInnerProduct Matrix x Matrix -> Scalar
// innerProduct Scalar x Scalar -> Scalar
// innerProduct Vector x Vector -> Scalar
// innerProduct Matrix x Matrix -> Scalar
///////////////////////////////////////////////////////////////////////////////////////
template<class l,class r,int N> inline
auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret=zero;
for(int c1=0;c1<N;c1++){
ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
}
template<class l,class r,int N> inline
auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret=zero;
iScalar<ret_t> tmp;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> inline
auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
{
typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = localInnerProduct(lhs._internal,rhs._internal);
ret._internal = innerProduct(lhs._internal,rhs._internal);
return ret;
}
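
After the rename, the tensor-level inner products form one recursion: innerProduct on an iVector or iMatrix sums the component-wise inner products and bottoms out in the scalar overloads conj(l)*r and l*r defined in the next file. A stripped-down sketch of that recursion using std::array and std::complex in place of Grid's tensor classes:

#include <array>
#include <complex>
#include <cstddef>
#include <iostream>

// Scalar base case, mirroring the ComplexD overload: conj(l)*r.
inline std::complex<double> innerProduct(const std::complex<double> &l,
                                         const std::complex<double> &r){
  return std::conj(l)*r;
}

// Vector case, mirroring the iVector overload: sum the component-wise
// inner products, so the recursion terminates in the scalar overload.
template<class T,std::size_t N>
auto innerProduct(const std::array<T,N> &l,const std::array<T,N> &r)
  -> decltype(innerProduct(l[0],r[0]))
{
  decltype(innerProduct(l[0],r[0])) ret{};
  for(std::size_t c=0;c<N;c++) ret += innerProduct(l[c],r[c]);
  return ret;
}

int main(){
  std::array<std::complex<double>,3> v = {{ {1,0},{0,1},{2,-1} }};
  std::cout << innerProduct(v,v) << std::endl;   // |1|^2+|i|^2+|2-i|^2 = (7,0)
  return 0;
}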

View File

@@ -33,10 +33,10 @@ namespace Grid {
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid

View File

@@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
};
inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
typedef vComplexD vDComplex;

View File

@@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
};
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
{
return conj(l)*r;
}

View File

@@ -240,7 +240,7 @@ namespace Grid {
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
};
inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);}
inline vRealD outerProduct(const vRealD &l, const vRealD& r)

View File

@@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){
public:
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}
inline vRealF outerProduct(const vRealF &l, const vRealF& r)

TODO
View File

@@ -1,15 +1,14 @@
AUDITS:
* FIXME audit
* const audit
* Replace vset with a call to merge.;
* care in Gmerge,Gextract over vset .
* Const audit
* extract / merge extra implementation removal
FUNCTIONALITY:
* Conditional execution, where etc... -----DONE, simple test
* Integer relational support -----DONE
@@ -22,6 +21,11 @@
* Stencil operator support -----Initial thoughts, trial implementation DONE.
-----some simple tests that Stencil matches Cshift.
-----do all permute in comms phase, so that copy permute
-----cases move into a buffer.
* CovariantShift support -----Use a class to store gauge field? (parallel transport?)
* Subset support, slice sums etc... -----Only need slice sum?
-----Generic cartesian subslicing?