mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Synchronize blocking infrastructure with GPT
This commit is contained in:
parent
6b64727161
commit
e9b295f967
@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Move following 100 LOC to lattice/Lattice_basis.h
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
template<class Field>
|
|
||||||
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
|
|
||||||
{
|
|
||||||
// If assume basis[j] are already orthonormal,
|
|
||||||
// can take all inner products in parallel saving 2x bandwidth
|
|
||||||
// Save 3x bandwidth on the second line of loop.
|
|
||||||
// perhaps 2.5x speed up.
|
|
||||||
// 2x overall in Multigrid Lanczos
|
|
||||||
for(int j=0; j<k; ++j){
|
|
||||||
auto ip = innerProduct(basis[j],w);
|
|
||||||
w = w - ip*basis[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|
||||||
{
|
|
||||||
typedef decltype(basis[0].View()) View;
|
|
||||||
auto tmp_v = basis[0].View();
|
|
||||||
Vector<View> basis_v(basis.size(),tmp_v);
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
|
||||||
basis_v[k] = basis[k].View();
|
|
||||||
}
|
|
||||||
#if 0
|
|
||||||
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
|
|
||||||
thread_region
|
|
||||||
{
|
|
||||||
vobj* B = Bt.data() + Nm * thread_num();
|
|
||||||
|
|
||||||
thread_for_in_region(ss, grid->oSites(),{
|
|
||||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
|
||||||
|
|
||||||
for(int j=j0; j<j1; ++j){
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
B[j] +=Qt(j,k) * basis_v[k][ss];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int j=j0; j<j1; ++j){
|
|
||||||
basis_v[j][ss] = B[j];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
|
|
||||||
int nrot = j1-j0;
|
|
||||||
|
|
||||||
|
|
||||||
uint64_t oSites =grid->oSites();
|
|
||||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
|
||||||
|
|
||||||
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
|
|
||||||
|
|
||||||
Vector <vobj> Bt(siteBlock * nrot);
|
|
||||||
auto Bp=&Bt[0];
|
|
||||||
|
|
||||||
// GPU readable copy of Eigen matrix
|
|
||||||
Vector<double> Qt_jv(Nm*Nm);
|
|
||||||
double *Qt_p = & Qt_jv[0];
|
|
||||||
for(int k=0;k<Nm;++k){
|
|
||||||
for(int j=0;j<Nm;++j){
|
|
||||||
Qt_p[j*Nm+k]=Qt(j,k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Block the loop to keep storage footprint down
|
|
||||||
vobj zz=Zero();
|
|
||||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
|
||||||
|
|
||||||
// remaining work in this block
|
|
||||||
int ssites=MIN(siteBlock,oSites-s);
|
|
||||||
|
|
||||||
// zero out the accumulators
|
|
||||||
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
|
|
||||||
auto z=coalescedRead(zz);
|
|
||||||
coalescedWrite(Bp[ss],z);
|
|
||||||
});
|
|
||||||
|
|
||||||
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
|
|
||||||
|
|
||||||
int j =sj%nrot;
|
|
||||||
int jj =j0+j;
|
|
||||||
int ss =sj/nrot;
|
|
||||||
int sss=ss+s;
|
|
||||||
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
auto tmp = coalescedRead(Bp[ss*nrot+j]);
|
|
||||||
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
|
|
||||||
int j =sj%nrot;
|
|
||||||
int jj =j0+j;
|
|
||||||
int ss =sj/nrot;
|
|
||||||
int sss=ss+s;
|
|
||||||
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract a single rotated vector
|
|
||||||
template<class Field>
|
|
||||||
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
|
||||||
{
|
|
||||||
typedef decltype(basis[0].View()) View;
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
|
||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
|
||||||
auto result_v=result.View();
|
|
||||||
Vector<View> basis_v(basis.size(),result_v);
|
|
||||||
for(int k=0;k<basis.size();k++){
|
|
||||||
basis_v[k] = basis[k].View();
|
|
||||||
}
|
|
||||||
vobj zz=Zero();
|
|
||||||
Vector<double> Qt_jv(Nm);
|
|
||||||
double * Qt_j = & Qt_jv[0];
|
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
|
||||||
auto B=coalescedRead(zz);
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
|
|
||||||
}
|
|
||||||
coalescedWrite(result_v[ss], B);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
|
|
||||||
{
|
|
||||||
int vlen = idx.size();
|
|
||||||
|
|
||||||
assert(vlen>=1);
|
|
||||||
assert(vlen<=sort_vals.size());
|
|
||||||
assert(vlen<=_v.size());
|
|
||||||
|
|
||||||
for (size_t i=0;i<vlen;i++) {
|
|
||||||
|
|
||||||
if (idx[i] != i) {
|
|
||||||
|
|
||||||
//////////////////////////////////////
|
|
||||||
// idx[i] is a table of desired sources giving a permutation.
|
|
||||||
// Swap v[i] with v[idx[i]].
|
|
||||||
// Find j>i for which _vnew[j] = _vold[i],
|
|
||||||
// track the move idx[j] => idx[i]
|
|
||||||
// track the move idx[i] => i
|
|
||||||
//////////////////////////////////////
|
|
||||||
size_t j;
|
|
||||||
for (j=i;j<idx.size();j++)
|
|
||||||
if (idx[j]==i)
|
|
||||||
break;
|
|
||||||
|
|
||||||
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
|
|
||||||
|
|
||||||
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
|
|
||||||
std::swap(sort_vals[i],sort_vals[idx[i]]);
|
|
||||||
|
|
||||||
idx[j] = idx[i];
|
|
||||||
idx[i] = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
|
|
||||||
{
|
|
||||||
std::vector<int> idx(sort_vals.size());
|
|
||||||
std::iota(idx.begin(), idx.end(), 0);
|
|
||||||
|
|
||||||
// sort indexes based on comparing values in v
|
|
||||||
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
|
|
||||||
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
|
|
||||||
});
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
|
|
||||||
{
|
|
||||||
std::vector<int> idx = basisSortGetIndex(sort_vals);
|
|
||||||
if (reverse)
|
|
||||||
std::reverse(idx.begin(), idx.end());
|
|
||||||
|
|
||||||
basisReorderInPlace(_v,sort_vals,idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// PAB: faster to compute the inner products first then fuse loops.
|
|
||||||
// If performance critical can improve.
|
|
||||||
template<class Field>
|
|
||||||
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
|
|
||||||
result = Zero();
|
|
||||||
assert(_v.size()==eval.size());
|
|
||||||
int N = (int)_v.size();
|
|
||||||
for (int i=0;i<N;i++) {
|
|
||||||
Field& tmp = _v[i];
|
|
||||||
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
// Implicitly restarted lanczos
|
// Implicitly restarted lanczos
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
|
@ -43,4 +43,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/lattice/Lattice_rng.h>
|
#include <Grid/lattice/Lattice_rng.h>
|
||||||
#include <Grid/lattice/Lattice_unary.h>
|
#include <Grid/lattice/Lattice_unary.h>
|
||||||
#include <Grid/lattice/Lattice_transfer.h>
|
#include <Grid/lattice/Lattice_transfer.h>
|
||||||
|
#include <Grid/lattice/Lattice_basis.h>
|
||||||
|
@ -116,7 +116,6 @@ public:
|
|||||||
int target;
|
int target;
|
||||||
cudaGetDevice(&target);
|
cudaGetDevice(&target);
|
||||||
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
|
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
|
||||||
//std::cout<< GridLogMessage << "To Device " << target << std::endl;
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
@ -125,7 +124,6 @@ public:
|
|||||||
#ifdef GRID_NVCC
|
#ifdef GRID_NVCC
|
||||||
#ifndef __CUDA_ARCH__ // only on host
|
#ifndef __CUDA_ARCH__ // only on host
|
||||||
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
|
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
|
||||||
//std::cout<< GridLogMessage << "To Host" << std::endl;
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
@ -425,7 +423,6 @@ public:
|
|||||||
// copy constructor
|
// copy constructor
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
Lattice(const Lattice& r){
|
Lattice(const Lattice& r){
|
||||||
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
|
|
||||||
this->_grid = r.Grid();
|
this->_grid = r.Grid();
|
||||||
resize(this->_grid->oSites());
|
resize(this->_grid->oSites());
|
||||||
*this = r;
|
*this = r;
|
||||||
@ -448,7 +445,6 @@ public:
|
|||||||
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
||||||
conformable(*this,r);
|
conformable(*this,r);
|
||||||
this->checkerboard = r.Checkerboard();
|
this->checkerboard = r.Checkerboard();
|
||||||
//std::cout << GridLogMessage << "Copy other" << std::endl;
|
|
||||||
auto me = AcceleratorView(ViewWrite);
|
auto me = AcceleratorView(ViewWrite);
|
||||||
auto him= r.AcceleratorView(ViewRead);
|
auto him= r.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
@ -463,7 +459,6 @@ public:
|
|||||||
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
||||||
this->checkerboard = r.Checkerboard();
|
this->checkerboard = r.Checkerboard();
|
||||||
conformable(*this,r);
|
conformable(*this,r);
|
||||||
//std::cout << GridLogMessage << "Copy same" << std::endl;
|
|
||||||
auto me = AcceleratorView(ViewWrite);
|
auto me = AcceleratorView(ViewWrite);
|
||||||
auto him= r.AcceleratorView(ViewRead);
|
auto him= r.AcceleratorView(ViewRead);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -63,6 +64,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
auto half_v = half.View();
|
auto half_v = half.View();
|
||||||
@ -81,25 +83,130 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex,int nbasis>
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Flexible Type Conversion for internal promotion to double as well as graceful
|
||||||
|
// treatment of scalar-compatible types
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
||||||
|
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
|
||||||
|
}
|
||||||
|
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
|
||||||
|
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
|
||||||
|
}
|
||||||
|
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
|
||||||
|
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
|
||||||
|
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
|
||||||
|
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
|
||||||
|
|
||||||
|
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
|
||||||
|
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
|
||||||
|
convertType(out,in._internal);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2>
|
||||||
|
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
|
||||||
|
convertType(out._internal,in);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
|
||||||
|
for (int i=0;i<N;i++)
|
||||||
|
for (int j=0;j<N;j++)
|
||||||
|
convertType(out._internal[i][j],in._internal[i][j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
|
||||||
|
for (int i=0;i<N;i++)
|
||||||
|
convertType(out._internal[i],in._internal[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
|
||||||
|
accelerator_inline void convertType(T & out, const T & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2>
|
||||||
|
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
||||||
|
auto out_v = out.AcceleratorView(ViewWrite);
|
||||||
|
auto in_v = in.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
|
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
|
||||||
|
convertType(out_v[ss],in_v(ss));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// precision-promoted local inner product
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class vobj>
|
||||||
|
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
|
||||||
|
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
|
||||||
|
{
|
||||||
|
auto lhs_v = lhs.AcceleratorView(ViewRead);
|
||||||
|
auto rhs_v = rhs.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
|
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
|
||||||
|
Lattice<iScalar<t_inner>> ret(lhs.Grid());
|
||||||
|
auto ret_v = ret.AcceleratorView(ViewWrite);
|
||||||
|
|
||||||
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
|
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
|
||||||
|
});
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// block routines
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||||
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
const Lattice<vobj> &fineData,
|
const Lattice<vobj> &fineData,
|
||||||
const std::vector<Lattice<vobj> > &Basis)
|
const VLattice &Basis)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineData.Grid();
|
GridBase * fine = fineData.Grid();
|
||||||
GridBase * coarse= coarseData.Grid();
|
GridBase * coarse= coarseData.Grid();
|
||||||
|
|
||||||
Lattice<CComplex> ip(coarse);
|
Lattice<iScalar<CComplex>> ip(coarse);
|
||||||
|
Lattice<vobj> fineDataRed = fineData;
|
||||||
|
|
||||||
// auto fineData_ = fineData.View();
|
// auto fineData_ = fineData.View();
|
||||||
auto coarseData_ = coarseData.View();
|
auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
|
||||||
auto ip_ = ip.View();
|
auto ip_ = ip.AcceleratorView(ViewReadWrite);
|
||||||
for(int v=0;v<nbasis;v++) {
|
for(int v=0;v<nbasis;v++) {
|
||||||
blockInnerProduct(ip,Basis[v],fineData);
|
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
|
||||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||||
coalescedWrite(coarseData_[sc](v),ip_(sc));
|
convertType(coarseData_[sc](v),ip_[sc]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// improve numerical stability of projection
|
||||||
|
// |fine> = |fine> - <basis|fine> |basis>
|
||||||
|
ip=-ip;
|
||||||
|
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,11 +273,11 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class vobj2,class CComplex>
|
||||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||||
const Lattice<CComplex> &coarseA,
|
const Lattice<CComplex> &coarseA,
|
||||||
const Lattice<vobj> &fineX,
|
const Lattice<vobj2> &fineX,
|
||||||
const Lattice<vobj> &fineY)
|
const Lattice<vobj> &fineY)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineZ.Grid();
|
GridBase * fine = fineZ.Grid();
|
||||||
GridBase * coarse= coarseA.Grid();
|
GridBase * coarse= coarseA.Grid();
|
||||||
@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
conformable(fineX,fineZ);
|
conformable(fineX,fineZ);
|
||||||
|
|
||||||
int _ndimension = coarse->_ndimension;
|
int _ndimension = coarse->_ndimension;
|
||||||
|
|
||||||
Coordinate block_r (_ndimension);
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
// FIXME merge with subdivide checking routine as this is redundant
|
// FIXME merge with subdivide checking routine as this is redundant
|
||||||
@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto fineZ_ = fineZ.View();
|
auto fineZ_ = fineZ.AcceleratorView(ViewWrite);
|
||||||
auto fineX_ = fineX.View();
|
auto fineX_ = fineX.AcceleratorView(ViewRead);
|
||||||
auto fineY_ = fineY.View();
|
auto fineY_ = fineY.AcceleratorView(ViewRead);
|
||||||
auto coarseA_= coarseA.View();
|
auto coarseA_= coarseA.AcceleratorView(ViewRead);
|
||||||
|
|
||||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||||
|
|
||||||
int sc;
|
|
||||||
Coordinate coor_c(_ndimension);
|
|
||||||
Coordinate coor_f(_ndimension);
|
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
int sc;
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
Coordinate coor_c(_ndimension);
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
// z = A x + y
|
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||||
coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
});
|
// z = A x + y
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
typename vobj2::tensor_reduced::scalar_object cA;
|
||||||
|
typename vobj::scalar_object cAx;
|
||||||
|
#else
|
||||||
|
typename vobj2::tensor_reduced cA;
|
||||||
|
vobj cAx;
|
||||||
|
#endif
|
||||||
|
convertType(cA,TensorRemove(coarseA_(sc)));
|
||||||
|
auto prod = cA*fineX_(sf);
|
||||||
|
convertType(cAx,prod);
|
||||||
|
coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
|
inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
|
||||||
|
const Lattice<vobj> &fineX,
|
||||||
|
const Lattice<vobj> &fineY)
|
||||||
|
{
|
||||||
|
typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
|
||||||
|
|
||||||
|
GridBase *coarse(CoarseInner.Grid());
|
||||||
|
GridBase *fine (fineX.Grid());
|
||||||
|
|
||||||
|
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
||||||
|
Lattice<dotp> coarse_inner(coarse);
|
||||||
|
|
||||||
|
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
|
||||||
|
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
|
||||||
|
|
||||||
|
// Precision promotion
|
||||||
|
fine_inner = localInnerProductD(fineX,fineY);
|
||||||
|
blockSum(coarse_inner,fine_inner);
|
||||||
|
accelerator_for(ss, coarse->oSites(), 1, {
|
||||||
|
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj,class CComplex> // deprecate
|
||||||
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
||||||
const Lattice<vobj> &fineX,
|
const Lattice<vobj> &fineX,
|
||||||
const Lattice<vobj> &fineY)
|
const Lattice<vobj> &fineY)
|
||||||
@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
Lattice<dotp> coarse_inner(coarse);
|
Lattice<dotp> coarse_inner(coarse);
|
||||||
|
|
||||||
// Precision promotion?
|
// Precision promotion?
|
||||||
auto CoarseInner_ = CoarseInner.View();
|
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
|
||||||
auto coarse_inner_ = coarse_inner.View();
|
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
|
||||||
|
|
||||||
fine_inner = localInnerProduct(fineX,fineY);
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
blockSum(coarse_inner,fine_inner);
|
blockSum(coarse_inner,fine_inner);
|
||||||
@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
CoarseInner_[ss] = coarse_inner_[ss];
|
CoarseInner_[ss] = coarse_inner_[ss];
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
||||||
{
|
{
|
||||||
@ -248,7 +392,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
|||||||
// useful in multigrid project;
|
// useful in multigrid project;
|
||||||
// Generic name : Coarsen?
|
// Generic name : Coarsen?
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineData.Grid();
|
GridBase * fine = fineData.Grid();
|
||||||
GridBase * coarse= coarseData.Grid();
|
GridBase * coarse= coarseData.Grid();
|
||||||
@ -256,42 +400,41 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
subdivides(coarse,fine); // require they map
|
subdivides(coarse,fine); // require they map
|
||||||
|
|
||||||
int _ndimension = coarse->_ndimension;
|
int _ndimension = coarse->_ndimension;
|
||||||
|
|
||||||
Coordinate block_r (_ndimension);
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
}
|
}
|
||||||
int blockVol = fine->oSites()/coarse->oSites();
|
int blockVol = fine->oSites()/coarse->oSites();
|
||||||
|
|
||||||
// Turn this around to loop threaded over sc and interior loop
|
auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
|
||||||
// over sf would thread better
|
auto fineData_ = fineData.AcceleratorView(ViewRead);
|
||||||
auto coarseData_ = coarseData.View();
|
|
||||||
auto fineData_ = fineData.View();
|
|
||||||
|
|
||||||
accelerator_for(sc,coarse->oSites(),1,{
|
accelerator_for(sc,coarse->oSites(),1,{
|
||||||
|
|
||||||
// One thread per sub block
|
// One thread per sub block
|
||||||
Coordinate coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
||||||
coarseData_[sc]=Zero();
|
coarseData_[sc]=Zero();
|
||||||
|
|
||||||
for(int sb=0;sb<blockVol;sb++){
|
for(int sb=0;sb<blockVol;sb++){
|
||||||
|
|
||||||
int sf;
|
|
||||||
Coordinate coor_b(_ndimension);
|
|
||||||
Coordinate coor_f(_ndimension);
|
|
||||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
|
||||||
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
|
||||||
|
|
||||||
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
int sf;
|
||||||
}
|
Coordinate coor_b(_ndimension);
|
||||||
|
Coordinate coor_f(_ndimension);
|
||||||
|
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
||||||
|
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
||||||
|
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
||||||
|
|
||||||
});
|
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
|
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
|
||||||
{
|
{
|
||||||
@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class CComplex,class VLattice>
|
||||||
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
|
inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
|
||||||
{
|
{
|
||||||
GridBase *coarse = ip.Grid();
|
GridBase *coarse = ip.Grid();
|
||||||
GridBase *fine = Basis[0].Grid();
|
GridBase *fine = Basis[0].Grid();
|
||||||
@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
|
|||||||
int nbasis = Basis.size() ;
|
int nbasis = Basis.size() ;
|
||||||
|
|
||||||
// checks
|
// checks
|
||||||
subdivides(coarse,fine);
|
subdivides(coarse,fine);
|
||||||
for(int i=0;i<nbasis;i++){
|
for(int i=0;i<nbasis;i++){
|
||||||
conformable(Basis[i].Grid(),fine);
|
conformable(Basis[i].Grid(),fine);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int v=0;v<nbasis;v++) {
|
for(int v=0;v<nbasis;v++) {
|
||||||
for(int u=0;u<v;u++) {
|
for(int u=0;u<v;u++) {
|
||||||
//Inner product & remove component
|
//Inner product & remove component
|
||||||
blockInnerProduct(ip,Basis[u],Basis[v]);
|
blockInnerProductD(ip,Basis[u],Basis[v]);
|
||||||
ip = -ip;
|
ip = -ip;
|
||||||
blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
|
blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
|
||||||
}
|
}
|
||||||
blockNormalise(ip,Basis[v]);
|
blockNormalise(ip,Basis[v]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj,class CComplex>
|
||||||
|
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
|
||||||
|
{
|
||||||
|
blockOrthonormalize(ip,Basis);
|
||||||
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
// TODO: CPU optimized version here
|
||||||
template<class vobj,class CComplex,int nbasis>
|
template<class vobj,class CComplex,int nbasis>
|
||||||
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
Lattice<vobj> &fineData,
|
Lattice<vobj> &fineData,
|
||||||
@ -383,24 +533,18 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
template<class vobj,class CComplex,int nbasis>
|
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||||
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
Lattice<vobj> &fineData,
|
Lattice<vobj> &fineData,
|
||||||
const std::vector<Lattice<vobj> > &Basis)
|
const VLattice &Basis)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineData.Grid();
|
GridBase * fine = fineData.Grid();
|
||||||
GridBase * coarse= coarseData.Grid();
|
GridBase * coarse= coarseData.Grid();
|
||||||
|
|
||||||
fineData=Zero();
|
fineData=Zero();
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
||||||
Lattice<CComplex> cip(coarse);
|
auto ip_ = ip.AcceleratorView(ViewRead);
|
||||||
auto cip_ = cip.View();
|
blockZAXPY(fineData,ip,Basis[i],fineData);
|
||||||
auto ip_ = ip.View();
|
|
||||||
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
|
||||||
coalescedWrite(cip_[sc], ip_(sc)());
|
|
||||||
});
|
|
||||||
blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -470,8 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
Coordinate rdt = Tg->_rdimensions;
|
Coordinate rdt = Tg->_rdimensions;
|
||||||
Coordinate ist = Tg->_istride;
|
Coordinate ist = Tg->_istride;
|
||||||
Coordinate ost = Tg->_ostride;
|
Coordinate ost = Tg->_ostride;
|
||||||
auto t_v = To.View();
|
auto t_v = To.AcceleratorView(ViewWrite);
|
||||||
auto f_v = From.View();
|
auto f_v = From.AcceleratorView(ViewRead);
|
||||||
accelerator_for(idx,Fg->lSites(),1,{
|
accelerator_for(idx,Fg->lSites(),1,{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate Fcoor(nd);
|
Coordinate Fcoor(nd);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user