1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-15 14:27:06 +01:00

Some fix for the GenericHMCrunner

This commit is contained in:
Guido Cossu
2016-10-10 09:43:05 +01:00
parent 6eb873dd96
commit 26b9740d53
5 changed files with 80 additions and 60 deletions

View File

@ -38,51 +38,56 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm);
template <class vobj>
inline RealD norm2(const Lattice<vobj> &arg) {
ComplexD nrm = innerProduct(arg, arg);
return std::real(nrm);
}
template <class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,
const Lattice<vobj> &right) {
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
scalar_type nrm;
GridBase *grid = left._grid;
std::vector<vector_type, alignedAllocator<vector_type> > sumarray(
grid->SumArraySize());
for (int i = 0; i < grid->SumArraySize(); i++) {
sumarray[i] = zero;
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
scalar_type nrm;
PARALLEL_FOR_LOOP
for (int thr = 0; thr < grid->SumArraySize(); thr++) {
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(), thr, mywork, myoff);
GridBase *grid = left._grid;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
PARALLEL_FOR_LOOP
for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
}
vector_type vvnrm; vvnrm=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
decltype(innerProduct(left._odata[0], right._odata[0])) vnrm =
zero; // private to thread; sub summation
for (int ss = myoff; ss < mywork + myoff; ss++) {
vnrm = vnrm + innerProduct(left._odata[ss], right._odata[ss]);
}
sumarray[thr] = TensorRemove(vnrm);
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
vector_type vvnrm;
vvnrm = zero; // sum across threads
for (int i = 0; i < grid->SumArraySize(); i++) {
vvnrm = vvnrm + sumarray[i];
}
nrm = Reduce(vvnrm); // sum across simd
right._grid->GlobalSum(nrm);
return nrm;
}
template <class Op, class T1>
inline auto sum(const LatticeUnaryExpression<Op, T1> &expr) ->
typename decltype(
expr.first.func(eval(0, std::get<0>(expr.second))))::scalar_object {
return sum(closure(expr));
}
template<class Op,class T1,class T2>
@ -96,9 +101,9 @@ PARALLEL_FOR_LOOP
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
@ -111,24 +116,24 @@ PARALLEL_FOR_LOOP
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
sumarray[i]=zero;
}
PARALLEL_FOR_LOOP
for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;