mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Reduce now going through MPI.
This commit is contained in:
parent
2ae42c40a8
commit
94f9e781f4
@ -42,13 +42,31 @@ class CartesianCommunicator {
|
||||
////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
////////////////////////////////////////////////////////////
|
||||
void GlobalSum(float &);
|
||||
void GlobalSumVector(float *,int N);
|
||||
void GlobalSum(RealF &);
|
||||
void GlobalSumVector(RealF *,int N);
|
||||
|
||||
void GlobalSum(double &);
|
||||
void GlobalSumVector(double *,int N);
|
||||
void GlobalSum(RealD &);
|
||||
void GlobalSumVector(RealD *,int N);
|
||||
|
||||
void GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumVector((float *)&c,2);
|
||||
}
|
||||
void GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
|
||||
void GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumVector((double *)&c,2);
|
||||
}
|
||||
void GlobalSumVector(ComplexD *c,int N)
|
||||
{
|
||||
GlobalSumVector((double *)c,2*N);
|
||||
}
|
||||
|
||||
template<class obj> void GlobalSumObj(obj &o){
|
||||
template<class obj> void GlobalSum(obj &o){
|
||||
|
||||
typedef typename obj::scalar_type scalar_type;
|
||||
int words = sizeof(obj)/sizeof(scalar_type);
|
||||
|
@ -407,18 +407,83 @@ public:
|
||||
return ret;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction operations
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj>
|
||||
inline RealD norm2(const Lattice<vobj> &arg){
|
||||
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
|
||||
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
|
||||
|
||||
scalar nrm;
|
||||
//FIXME make this loop parallelisable
|
||||
for(int ss=0;ss<arg._grid->oSites(); ss++){
|
||||
vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
|
||||
vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
|
||||
}
|
||||
nrm = Reduce(TensorRemove(vnrm));
|
||||
arg._grid->GlobalSum(nrm);
|
||||
return real(nrm);
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
|
||||
{
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
|
||||
|
||||
scalar nrm;
|
||||
//FIXME make this loop parallelisable
|
||||
for(int ss=0;ss<left._grid->oSites(); ss++){
|
||||
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
|
||||
}
|
||||
nrm = Reduce(TensorRemove(vnrm));
|
||||
right._grid->GlobalSum(nrm);
|
||||
return nrm;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Non site reduced routines
|
||||
/////////////////////////////////////////////////////
|
||||
|
||||
// localNorm2,
|
||||
template<class vobj>
|
||||
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
||||
{
|
||||
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||
ret._odata[ss]=innerProduct(rhs,rhs);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// localInnerProduct
|
||||
template<class vobj>
|
||||
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
|
||||
-> Lattice<typename vobj::tensor_reduced>
|
||||
{
|
||||
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// outerProduct Scalar x Scalar -> Scalar
|
||||
// Vector x Vector -> Matrix
|
||||
template<class ll,class rr>
|
||||
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
|
||||
{
|
||||
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
35
Grid_QCD.h
35
Grid_QCD.h
@ -57,41 +57,6 @@ namespace QCD {
|
||||
typedef Lattice<vSpinVector> LatticeSpinVector;
|
||||
typedef Lattice<vColourVector> LatticeColourVector;
|
||||
|
||||
// localNorm2,
|
||||
template<class tt>
|
||||
inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
|
||||
{
|
||||
LatticeComplex ret(rhs._grid);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||
ret._odata[ss]=trace(adj(rhs)*rhs);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// localInnerProduct
|
||||
template<class tt>
|
||||
inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
|
||||
{
|
||||
LatticeComplex ret(rhs._grid);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||
ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// outerProduct Scalar x Scalar -> Scalar
|
||||
// Vector x Vector -> Matrix
|
||||
template<class ll,class rr>
|
||||
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
|
||||
{
|
||||
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
|
||||
#pragma omp parallel for
|
||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// FIXME for debug; deprecate this
|
||||
inline void LatticeCoordinate(LatticeInteger &l,int mu){
|
||||
|
@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
||||
}
|
||||
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||
|
@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
|
||||
buffer[bo++]=rhs._odata[so+o+b];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
|
@ -903,38 +903,38 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// localInnerProduct Scalar x Scalar -> Scalar
|
||||
// localInnerProduct Vector x Vector -> Scalar
|
||||
// localInnerProduct Matrix x Matrix -> Scalar
|
||||
// innerProduct Scalar x Scalar -> Scalar
|
||||
// innerProduct Vector x Vector -> Scalar
|
||||
// innerProduct Matrix x Matrix -> Scalar
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class l,class r,int N> inline
|
||||
auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
|
||||
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
|
||||
{
|
||||
typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
|
||||
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
|
||||
iScalar<ret_t> ret=zero;
|
||||
for(int c1=0;c1<N;c1++){
|
||||
ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
|
||||
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
template<class l,class r,int N> inline
|
||||
auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
|
||||
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
|
||||
{
|
||||
typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
|
||||
typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
|
||||
iScalar<ret_t> ret=zero;
|
||||
iScalar<ret_t> tmp;
|
||||
for(int c1=0;c1<N;c1++){
|
||||
for(int c2=0;c2<N;c2++){
|
||||
ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
|
||||
ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
|
||||
}}
|
||||
return ret;
|
||||
}
|
||||
template<class l,class r> inline
|
||||
auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
|
||||
auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
|
||||
{
|
||||
typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
|
||||
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
|
||||
iScalar<ret_t> ret;
|
||||
ret._internal = localInnerProduct(lhs._internal,rhs._internal);
|
||||
ret._internal = innerProduct(lhs._internal,rhs._internal);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -33,10 +33,10 @@ namespace Grid {
|
||||
|
||||
inline RealF adj(const RealF & r){ return r; }
|
||||
inline RealF conj(const RealF & r){ return r; }
|
||||
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||
inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
|
||||
inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
|
||||
inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
|
||||
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
|
||||
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
|
||||
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
|
||||
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//Provide support functions for basic real and complex data types required by Grid
|
||||
|
@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
};
|
||||
|
||||
|
||||
inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
|
||||
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
|
||||
|
||||
|
||||
typedef vComplexD vDComplex;
|
||||
|
@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
|
||||
|
||||
};
|
||||
|
||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)
|
||||
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
|
||||
{
|
||||
return conj(l)*r;
|
||||
}
|
||||
|
@ -240,7 +240,7 @@ namespace Grid {
|
||||
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
|
||||
};
|
||||
|
||||
inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
|
||||
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
|
||||
inline void zeroit(vRealD &z){ vzero(z);}
|
||||
|
||||
inline vRealD outerProduct(const vRealD &l, const vRealD& r)
|
||||
|
@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){
|
||||
public:
|
||||
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
|
||||
};
|
||||
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
|
||||
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
|
||||
inline void zeroit(vRealF &z){ vzero(z);}
|
||||
|
||||
inline vRealF outerProduct(const vRealF &l, const vRealF& r)
|
||||
|
14
TODO
14
TODO
@ -1,15 +1,14 @@
|
||||
|
||||
AUDITS:
|
||||
* FIXME audit
|
||||
|
||||
* const audit
|
||||
* Replace vset with a call to merge.;
|
||||
|
||||
* care in Gmerge,Gextract over vset .
|
||||
|
||||
* Const audit
|
||||
|
||||
* extract / merge extra implementation removal
|
||||
|
||||
|
||||
FUNCTIONALITY:
|
||||
|
||||
* Conditional execution, where etc... -----DONE, simple test
|
||||
|
||||
* Integer relational support -----DONE
|
||||
@ -22,6 +21,11 @@
|
||||
|
||||
* Stencil operator support -----Initial thoughts, trial implementation DONE.
|
||||
-----some simple tests that Stencil matches Cshift.
|
||||
-----do all permute in comms phase, so that copy permute
|
||||
-----cases move into a buffer.
|
||||
|
||||
* CovariantShift support -----Use a class to store gauge field? (parallel transport?)
|
||||
|
||||
|
||||
* Subset support, slice sums etc... -----Only need slice sum?
|
||||
-----Generic cartesian subslicing?
|
||||
|
Loading…
Reference in New Issue
Block a user