Reduce now going through MPI.
This commit is contained in:
parent f1876b7e95
commit ab9a764bb1
@@ -42,13 +42,31 @@ class CartesianCommunicator {
   ////////////////////////////////////////////////////////////
   // Reduction
   ////////////////////////////////////////////////////////////
-  void GlobalSum(float &);
-  void GlobalSumVector(float *,int N);
+  void GlobalSum(RealF &);
+  void GlobalSumVector(RealF *,int N);
 
-  void GlobalSum(double &);
-  void GlobalSumVector(double *,int N);
+  void GlobalSum(RealD &);
+  void GlobalSumVector(RealD *,int N);
 
-  template<class obj> void GlobalSumObj(obj &o){
+  void GlobalSum(ComplexF &c)
+  {
+    GlobalSumVector((float *)&c,2);
+  }
+  void GlobalSumVector(ComplexF *c,int N)
+  {
+    GlobalSumVector((float *)c,2*N);
+  }
+
+  void GlobalSum(ComplexD &c)
+  {
+    GlobalSumVector((double *)&c,2);
+  }
+  void GlobalSumVector(ComplexD *c,int N)
+  {
+    GlobalSumVector((double *)c,2*N);
+  }
+
+  template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
     int words = sizeof(obj)/sizeof(scalar_type);
 
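The generic GlobalSum template is truncated by the hunk above. A minimal sketch of how it could complete, assuming it simply reinterprets the object as an array of its scalar_type and forwards to the typed vector overloads (the forwarding call is an assumption, not text from this commit):

// Sketch only (assumption): flatten the tensor object into scalar_type words
// and reuse the typed GlobalSumVector overloads declared above.
template<class obj> void GlobalSum(obj &o){
  typedef typename obj::scalar_type scalar_type; // e.g. ComplexF or RealD
  int words = sizeof(obj)/sizeof(scalar_type);   // number of scalars in the object
  GlobalSumVector((scalar_type *)&o,words);      // element-wise sum over all ranks
}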
@@ -407,18 +407,83 @@ public:
     return ret;
   }
 
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Reduction operations
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class vobj>
   inline RealD norm2(const Lattice<vobj> &arg){
 
     typedef typename vobj::scalar_type scalar;
-    decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
+    decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
 
     scalar nrm;
+    //FIXME make this loop parallelisable
     for(int ss=0;ss<arg._grid->oSites(); ss++){
-      vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
+      vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
     }
     nrm = Reduce(TensorRemove(vnrm));
+    arg._grid->GlobalSum(nrm);
     return real(nrm);
   }
 
+  template<class vobj>
+  inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
+  {
+    typedef typename vobj::scalar_type scalar;
+    decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
+
+    scalar nrm;
+    //FIXME make this loop parallelisable
+    for(int ss=0;ss<left._grid->oSites(); ss++){
+      vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
+    }
+    nrm = Reduce(TensorRemove(vnrm));
+    right._grid->GlobalSum(nrm);
+    return nrm;
+  }
+
+  /////////////////////////////////////////////////////
+  // Non site reduced routines
+  /////////////////////////////////////////////////////
+
+  // localNorm2,
+  template<class vobj>
+  inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+  {
+    Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=innerProduct(rhs,rhs);
+    }
+    return ret;
+  }
+
+  // localInnerProduct
+  template<class vobj>
+  inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
+  -> Lattice<typename vobj::tensor_reduced>
+  {
+    Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
+  }
+
+  // outerProduct Scalar x Scalar -> Scalar
+  //              Vector x Vector -> Matrix
+  template<class ll,class rr>
+  inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+  {
+    Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
+  }
+
 }
 #endif
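A hedged usage sketch of the routines added above; the grid pointer and field type are assumptions for illustration, while norm2, innerProduct, localInnerProduct and the GlobalSum reduction come from the hunk itself:

// Assume a GridCartesian *grid and two fields on it (field type is hypothetical).
Lattice<vColourVector> x(grid), y(grid);
RealD nrm = norm2(x);               // site-summed |x|^2, then GlobalSum over all MPI ranks
auto   ip = innerProduct(x,y);      // globally reduced <x,y>, via right._grid->GlobalSum(nrm)
auto  loc = localInnerProduct(x,y); // per-site inner product only, no MPI reduction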
Grid_QCD.h (35 changed lines)
@@ -57,41 +57,6 @@ namespace QCD {
     typedef Lattice<vSpinVector> LatticeSpinVector;
     typedef Lattice<vColourVector> LatticeColourVector;
 
-    // localNorm2,
-    template<class tt>
-    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
-    {
-      LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
-        ret._odata[ss]=trace(adj(rhs)*rhs);
-      }
-      return ret;
-    }
-    // localInnerProduct
-    template<class tt>
-    inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
-    {
-      LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
-        ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
-      }
-      return ret;
-    }
-
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
-    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-      Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-#pragma omp parallel for
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
-        ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-      }
-      return ret;
-    }
-
     // FIXME for debug; deprecate this
     inline void LatticeCoordinate(LatticeInteger &l,int mu){
 
@@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
 }
 
 void CartesianCommunicator::GlobalSum(float &f){
-  MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
-  MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-  MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
-  MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 }
 
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   MPI_Cart_shift(communicator,dim,shift,&source,&dest);
 
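For reference on the change above: MPI does not allow the same buffer to be passed as both the send and receive argument of MPI_Allreduce, so an in-place reduction must name MPI_IN_PLACE as the send buffer. A minimal standalone illustration of the pattern, independent of Grid:

#include <mpi.h>
#include <cstdio>

int main(int argc,char **argv){
  MPI_Init(&argc,&argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  double partial = 1.0*rank;   // rank-local contribution
  // In-place all-reduce: every rank ends up with the global sum in 'partial'.
  MPI_Allreduce(MPI_IN_PLACE,&partial,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
  printf("rank %d sees global sum %f\n",rank,partial);
  MPI_Finalize();
  return 0;
}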
@@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 {
   int rd = rhs._grid->_rdimensions[dimension];
 
-  printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
   if ( !rhs._grid->CheckerBoarded(dimension) ) {
 
     int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
@@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 #pragma omp parallel for collapse(2)
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-        printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
         buffer[bo++]=rhs._odata[so+o+b];
       }
       o +=rhs._grid->_slice_stride[dimension];
 
@@ -903,38 +903,38 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
 
 
 ///////////////////////////////////////////////////////////////////////////////////////
-// localInnerProduct Scalar x Scalar -> Scalar
-// localInnerProduct Vector x Vector -> Scalar
-// localInnerProduct Matrix x Matrix -> Scalar
+// innerProduct Scalar x Scalar -> Scalar
+// innerProduct Vector x Vector -> Scalar
+// innerProduct Matrix x Matrix -> Scalar
 ///////////////////////////////////////////////////////////////////////////////////////
 template<class l,class r,int N> inline
-auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
+auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
 {
-  typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+  typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
   iScalar<ret_t> ret=zero;
   for(int c1=0;c1<N;c1++){
-    ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
+    ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
   }
   return ret;
 }
 template<class l,class r,int N> inline
-auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
+auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
 {
-  typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+  typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
   iScalar<ret_t> ret=zero;
   iScalar<ret_t> tmp;
   for(int c1=0;c1<N;c1++){
     for(int c2=0;c2<N;c2++){
-      ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+      ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
     }}
   return ret;
 }
 template<class l,class r> inline
-auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
+auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
 {
-  typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
+  typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
   iScalar<ret_t> ret;
-  ret._internal = localInnerProduct(lhs._internal,rhs._internal);
+  ret._internal = innerProduct(lhs._internal,rhs._internal);
   return ret;
 }
 
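A hedged illustration of how the renamed innerProduct recurses through a tensor nest; the example types are assumptions, while the recursion pattern is what the hunk above defines:

// Assume the scalar overload innerProduct(ComplexF,ComplexF) = conj(l)*r, added elsewhere in this commit.
iVector<iScalar<ComplexF>,3> v,w;
// innerProduct(iVector) sums component inner products, each going through
// innerProduct(iScalar) down to the ComplexF overload:
//   result = iScalar( sum_c conj(v[c]) * w[c] )
auto s = innerProduct(v,w);   // type: iScalar<iScalar<ComplexF>>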
@@ -33,10 +33,10 @@ namespace Grid {
 
   inline RealF adj(const RealF & r){ return r; }
   inline RealF conj(const RealF & r){ return r; }
-  inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
-  inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
-  inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
-  inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
+  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
+  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
+  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
 
   ////////////////////////////////////////////////////////////////////////////////
   //Provide support functions for basic real and complex data types required by Grid
 
@@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 };
 
 
-inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
+inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
 
 
 typedef vComplexD vDComplex;
 
@@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 
 };
 
-inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)
+inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
 {
   return conj(l)*r;
 }
 
@@ -240,7 +240,7 @@ namespace Grid {
   static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
 };
 
-inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
+inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
 inline void zeroit(vRealD &z){ vzero(z);}
 
 inline vRealD outerProduct(const vRealD &l, const vRealD& r)
 
@@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){
 public:
   static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
 };
-inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
+inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
 inline void zeroit(vRealF &z){ vzero(z);}
 
 inline vRealF outerProduct(const vRealF &l, const vRealF& r)
 
TODO (14 changed lines)
@@ -1,15 +1,14 @@
 
+AUDITS:
 * FIXME audit
+* const audit
 * Replace vset with a call to merge.;
 
 * care in Gmerge,Gextract over vset .
 
-* Const audit
-
 * extract / merge extra implementation removal
 
+FUNCTIONALITY:
 
 * Conditional execution, where etc...          -----DONE, simple test
 
 * Integer relational support                   -----DONE
@@ -22,6 +21,11 @@
 
 * Stencil operator support                     -----Initial thoughts, trial implementation DONE.
                                                 -----some simple tests that Stencil matches Cshift.
+                                                -----do all permute in comms phase, so that copy permute
+                                                -----cases move into a buffer.
+
+* CovariantShift support                       -----Use a class to store gauge field? (parallel transport?)
+
 
 * Subset support, slice sums etc...            -----Only need slice sum?
                                                 -----Generic cartesian subslicing?