mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Coordinate handling GPU friendly + some GPU merge/extract improvements
This commit is contained in:
parent
ff7b19a71b
commit
c1fc947bb8
@ -52,9 +52,9 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
|
|||||||
|
|
||||||
const int Nsimd = vobj::vector_type::Nsimd();
|
const int Nsimd = vobj::vector_type::Nsimd();
|
||||||
|
|
||||||
std::vector<Integer> mask(Nsimd);
|
ExtractBuffer<Integer> mask(Nsimd);
|
||||||
std::vector<scalar_object> truevals(Nsimd);
|
ExtractBuffer<scalar_object> truevals(Nsimd);
|
||||||
std::vector<scalar_object> falsevals(Nsimd);
|
ExtractBuffer<scalar_object> falsevals(Nsimd);
|
||||||
|
|
||||||
extract(iftrue, truevals);
|
extract(iftrue, truevals);
|
||||||
extract(iffalse, falsevals);
|
extract(iffalse, falsevals);
|
||||||
|
@ -382,11 +382,13 @@ public:
|
|||||||
}; // class Lattice
|
}; // class Lattice
|
||||||
|
|
||||||
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
|
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
|
||||||
std::vector<int> gcoor;
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
sobj ss;
|
|
||||||
for(int g=0;g<o.Grid()->_gsites;g++){
|
for(int g=0;g<o.Grid()->_gsites;g++){
|
||||||
|
|
||||||
|
Coordinate gcoor;
|
||||||
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
|
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
|
||||||
|
|
||||||
|
sobj ss;
|
||||||
peekSite(ss,o,gcoor);
|
peekSite(ss,o,gcoor);
|
||||||
stream<<"[";
|
stream<<"[";
|
||||||
for(int d=0;d<gcoor.size();d++){
|
for(int d=0;d<gcoor.size();d++){
|
||||||
|
@ -136,9 +136,9 @@ template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
|||||||
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
|
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
|
||||||
{
|
{
|
||||||
typedef typename vsimd::scalar_type scalar;
|
typedef typename vsimd::scalar_type scalar;
|
||||||
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
||||||
std::vector<scalar> vrhs(vsimd::Nsimd());
|
ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
|
||||||
std::vector<Integer> vpred(vsimd::Nsimd());
|
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
|
||||||
vInteger ret;
|
vInteger ret;
|
||||||
extract<vsimd,scalar>(lhs,vlhs);
|
extract<vsimd,scalar>(lhs,vlhs);
|
||||||
extract<vsimd,scalar>(rhs,vrhs);
|
extract<vsimd,scalar>(rhs,vrhs);
|
||||||
@ -153,8 +153,8 @@ template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
|||||||
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
|
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
|
||||||
{
|
{
|
||||||
typedef typename vsimd::scalar_type scalar;
|
typedef typename vsimd::scalar_type scalar;
|
||||||
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
||||||
std::vector<Integer> vpred(vsimd::Nsimd());
|
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
|
||||||
vInteger ret;
|
vInteger ret;
|
||||||
extract<vsimd,scalar>(lhs,vlhs);
|
extract<vsimd,scalar>(lhs,vlhs);
|
||||||
for(int s=0;s<vsimd::Nsimd();s++){
|
for(int s=0;s<vsimd::Nsimd();s++){
|
||||||
@ -168,8 +168,8 @@ template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
|||||||
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
|
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
|
||||||
{
|
{
|
||||||
typedef typename vsimd::scalar_type scalar;
|
typedef typename vsimd::scalar_type scalar;
|
||||||
std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
||||||
std::vector<Integer> vpred(vsimd::Nsimd());
|
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
|
||||||
vInteger ret;
|
vInteger ret;
|
||||||
extract<vsimd,scalar>(rhs,vrhs);
|
extract<vsimd,scalar>(rhs,vrhs);
|
||||||
for(int s=0;s<vsimd::Nsimd();s++){
|
for(int s=0;s<vsimd::Nsimd();s++){
|
||||||
|
@ -38,8 +38,8 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.Grid();
|
||||||
int Nsimd = grid->iSites();
|
int Nsimd = grid->iSites();
|
||||||
|
|
||||||
std::vector<int> gcoor;
|
Coordinate gcoor;
|
||||||
std::vector<scalar_type> mergebuf(Nsimd);
|
ExtractBuffer<scalar_type> mergebuf(Nsimd);
|
||||||
|
|
||||||
vector_type vI;
|
vector_type vI;
|
||||||
for(int o=0;o<grid->oSites();o++){
|
for(int o=0;o<grid->oSites();o++){
|
||||||
|
@ -84,7 +84,7 @@ void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs[0]
|
|||||||
// Poke a scalar object into the SIMD array
|
// Poke a scalar object into the SIMD array
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
|
void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
|
||||||
|
|
||||||
GridBase *grid=l.Grid();
|
GridBase *grid=l.Grid();
|
||||||
|
|
||||||
@ -101,9 +101,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
|
|||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
grid->Broadcast(grid->BossRank(),s);
|
grid->Broadcast(grid->BossRank(),s);
|
||||||
|
|
||||||
std::vector<sobj> buf(Nsimd);
|
|
||||||
|
|
||||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||||
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
if ( rank == grid->ThisRank() ) {
|
if ( rank == grid->ThisRank() ) {
|
||||||
extract(l[odx],buf);
|
extract(l[odx],buf);
|
||||||
buf[idx] = s;
|
buf[idx] = s;
|
||||||
@ -118,7 +117,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
|
|||||||
// Peek a scalar object from the SIMD array
|
// Peek a scalar object from the SIMD array
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
|
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||||
|
|
||||||
GridBase *grid=l.Grid();
|
GridBase *grid=l.Grid();
|
||||||
|
|
||||||
@ -132,7 +131,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
|
|||||||
int rank,odx,idx;
|
int rank,odx,idx;
|
||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
std::vector<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
extract(l[odx],buf);
|
extract(l[odx],buf);
|
||||||
|
|
||||||
s = buf[idx];
|
s = buf[idx];
|
||||||
@ -147,7 +146,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
|
|||||||
// Peek a scalar object from the SIMD array
|
// Peek a scalar object from the SIMD array
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
|
void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
||||||
|
|
||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.Grid();
|
||||||
|
|
||||||
@ -175,7 +174,7 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
|
||||||
|
|
||||||
GridBase *grid=l.Grid();
|
GridBase *grid=l.Grid();
|
||||||
|
|
||||||
|
@ -123,7 +123,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
|||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
sobj ssum; zeroit(ssum);
|
sobj ssum; zeroit(ssum);
|
||||||
|
|
||||||
std::vector<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
extract(vsum,buf);
|
extract(vsum,buf);
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
|
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
|
||||||
@ -160,7 +160,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
|
|
||||||
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
|
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
|
||||||
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||||
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
|
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||||
|
|
||||||
result.resize(fd); // And then global sum to return the same vector to every node
|
result.resize(fd); // And then global sum to return the same vector to every node
|
||||||
for(int r=0;r<rd;r++){
|
for(int r=0;r<rd;r++){
|
||||||
@ -185,7 +185,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||||
std::vector<int> icoor(Nd);
|
Coordinate icoor(Nd);
|
||||||
|
|
||||||
for(int rt=0;rt<rd;rt++){
|
for(int rt=0;rt<rd;rt++){
|
||||||
|
|
||||||
@ -240,7 +240,7 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
|
|
||||||
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
|
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
|
||||||
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||||
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
||||||
|
|
||||||
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
||||||
for(int r=0;r<rd;r++){
|
for(int r=0;r<rd;r++){
|
||||||
@ -265,7 +265,7 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||||
std::vector<int> icoor(Nd);
|
Coordinate icoor(Nd);
|
||||||
for(int rt=0;rt<rd;rt++){
|
for(int rt=0;rt<rd;rt++){
|
||||||
|
|
||||||
iScalar<vector_type> temp;
|
iScalar<vector_type> temp;
|
||||||
@ -341,7 +341,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
int e2 =grid->_slice_block [orthogdim];
|
int e2 =grid->_slice_block [orthogdim];
|
||||||
int stride =grid->_slice_stride[orthogdim];
|
int stride =grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
std::vector<int> icoor;
|
Coordinate icoor;
|
||||||
|
|
||||||
for(int r=0;r<rd;r++){
|
for(int r=0;r<rd;r++){
|
||||||
|
|
||||||
|
@ -347,7 +347,7 @@ public:
|
|||||||
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
||||||
|
|
||||||
thread_loop( (int ss=0;ss<osites;ss++), {
|
thread_loop( (int ss=0;ss<osites;ss++), {
|
||||||
std::vector<scalar_object> buf(Nsimd);
|
ExtractBuffer<scalar_object> buf(Nsimd);
|
||||||
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
||||||
|
|
||||||
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
|
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
|
||||||
@ -392,8 +392,8 @@ public:
|
|||||||
int rank;
|
int rank;
|
||||||
int o_idx;
|
int o_idx;
|
||||||
int i_idx;
|
int i_idx;
|
||||||
std::vector<int> gcoor;
|
|
||||||
|
|
||||||
|
Coordinate gcoor;
|
||||||
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
|
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
|
||||||
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
|
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
|
||||||
|
|
||||||
@ -456,8 +456,8 @@ public:
|
|||||||
|
|
||||||
uint32_t the_number;
|
uint32_t the_number;
|
||||||
// who
|
// who
|
||||||
std::vector<int> gcoor;
|
|
||||||
int rank,o_idx,i_idx;
|
int rank,o_idx,i_idx;
|
||||||
|
Coordinate gcoor;
|
||||||
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
|
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
|
||||||
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
|
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
|
|
||||||
thread_loop( (int ss=0;ss<full.Grid()->oSites();ss++),{
|
thread_loop( (int ss=0;ss<full.Grid()->oSites();ss++),{
|
||||||
int cbos;
|
int cbos;
|
||||||
std::vector<int> coor;
|
Coordinate coor;
|
||||||
full.Grid()->oCoorFromOindex(coor,ss);
|
full.Grid()->oCoorFromOindex(coor,ss);
|
||||||
cbos=half.Grid()->CheckerBoard(coor);
|
cbos=half.Grid()->CheckerBoard(coor);
|
||||||
|
|
||||||
@ -66,7 +66,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
thread_loop( (int ss=0;ss<full.Grid()->oSites();ss++), {
|
thread_loop( (int ss=0;ss<full.Grid()->oSites();ss++), {
|
||||||
std::vector<int> coor;
|
Coordinate coor;
|
||||||
int cbos;
|
int cbos;
|
||||||
|
|
||||||
full.Grid()->oCoorFromOindex(coor,ss);
|
full.Grid()->oCoorFromOindex(coor,ss);
|
||||||
@ -96,7 +96,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
conformable(Basis[i],fineData);
|
conformable(Basis[i],fineData);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> block_r (_ndimension);
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
@ -109,8 +109,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
thread_loop( (int sf=0;sf<fine->oSites();sf++),{
|
thread_loop( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
std::vector<int> coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
@ -143,7 +143,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
|
|
||||||
int _ndimension = coarse->_ndimension;
|
int _ndimension = coarse->_ndimension;
|
||||||
|
|
||||||
std::vector<int> block_r (_ndimension);
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
// FIXME merge with subdivide checking routine as this is redundant
|
// FIXME merge with subdivide checking routine as this is redundant
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
@ -154,8 +154,8 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
thread_loop( (int sf=0;sf<fine->oSites();sf++),{
|
thread_loop( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
std::vector<int> coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
@ -209,7 +209,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
|
|
||||||
int _ndimension = coarse->_ndimension;
|
int _ndimension = coarse->_ndimension;
|
||||||
|
|
||||||
std::vector<int> block_r (_ndimension);
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
@ -221,8 +221,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
thread_region {
|
thread_region {
|
||||||
|
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
std::vector<int> coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
|
thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
@ -240,7 +240,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,std::vector<int> coor)
|
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
|
||||||
{
|
{
|
||||||
GridBase * fine = unpicked.Grid();
|
GridBase * fine = unpicked.Grid();
|
||||||
|
|
||||||
@ -301,7 +301,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
conformable(Basis[i].Grid(),fine);
|
conformable(Basis[i].Grid(),fine);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> block_r (_ndimension);
|
Coordinate block_r (_ndimension);
|
||||||
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
@ -310,8 +310,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
// Loop with a cache friendly loop ordering
|
// Loop with a cache friendly loop ordering
|
||||||
thread_region {
|
thread_region {
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
std::vector<int> coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
|
thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
@ -355,7 +355,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
|
|||||||
sobj s;
|
sobj s;
|
||||||
ssobj ss;
|
ssobj ss;
|
||||||
|
|
||||||
std::vector<int> lcoor(ni);
|
Coordinate lcoor(ni);
|
||||||
ig->LocalIndexToLocalCoor(idx,lcoor);
|
ig->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
peekLocalSite(s,in,lcoor);
|
peekLocalSite(s,in,lcoor);
|
||||||
ss=s;
|
ss=s;
|
||||||
@ -391,8 +391,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
Coordinate hcoor(nh);
|
||||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
int ddl=0;
|
int ddl=0;
|
||||||
hcoor[orthog] = slice;
|
hcoor[orthog] = slice;
|
||||||
@ -432,8 +432,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
thread_loop((int idx=0;idx<lg->lSites();idx++),{
|
thread_loop((int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
Coordinate hcoor(nh);
|
||||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
int ddl=0;
|
int ddl=0;
|
||||||
hcoor[orthog] = slice;
|
hcoor[orthog] = slice;
|
||||||
@ -471,8 +471,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
Coordinate hcoor(nh);
|
||||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
if( lcoor[orthog] == slice_lo ) {
|
if( lcoor[orthog] == slice_lo ) {
|
||||||
hcoor=lcoor;
|
hcoor=lcoor;
|
||||||
@ -506,8 +506,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
|
|||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
Coordinate hcoor(nh);
|
||||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
if( lcoor[orthog] == slice_lo ) {
|
if( lcoor[orthog] == slice_lo ) {
|
||||||
hcoor=lcoor;
|
hcoor=lcoor;
|
||||||
@ -533,14 +533,14 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
|
|||||||
|
|
||||||
assert(cg->_ndimension==fg->_ndimension);
|
assert(cg->_ndimension==fg->_ndimension);
|
||||||
|
|
||||||
std::vector<int> ratio(cg->_ndimension);
|
Coordinate ratio(cg->_ndimension);
|
||||||
|
|
||||||
for(int d=0;d<cg->_ndimension;d++){
|
for(int d=0;d<cg->_ndimension;d++){
|
||||||
ratio[d] = fg->_fdimensions[d]/cg->_fdimensions[d];
|
ratio[d] = fg->_fdimensions[d]/cg->_fdimensions[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> fcoor(nd);
|
Coordinate fcoor(nd);
|
||||||
std::vector<int> ccoor(nd);
|
Coordinate ccoor(nd);
|
||||||
for(int g=0;g<fg->gSites();g++){
|
for(int g=0;g<fg->gSites();g++){
|
||||||
|
|
||||||
fg->GlobalIndexToGlobalCoor(g,fcoor);
|
fg->GlobalIndexToGlobalCoor(g,fcoor);
|
||||||
@ -569,7 +569,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
int ndim = in_grid->Nd();
|
int ndim = in_grid->Nd();
|
||||||
int in_nsimd = vtype::Nsimd();
|
int in_nsimd = vtype::Nsimd();
|
||||||
|
|
||||||
std::vector<std::vector<int> > in_icoor(in_nsimd);
|
std::vector<Coordinate > in_icoor(in_nsimd);
|
||||||
|
|
||||||
for(int lane=0; lane < in_nsimd; lane++){
|
for(int lane=0; lane < in_nsimd; lane++){
|
||||||
in_icoor[lane].resize(ndim);
|
in_icoor[lane].resize(ndim);
|
||||||
@ -579,12 +579,12 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
//loop over outer index
|
//loop over outer index
|
||||||
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{
|
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
std::vector<sobj*> out_ptrs(in_nsimd);
|
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
||||||
|
|
||||||
std::vector<int> in_ocoor(ndim);
|
Coordinate in_ocoor(ndim);
|
||||||
in_grid->oCoorFromOindex(in_ocoor, in_oidx);
|
in_grid->oCoorFromOindex(in_ocoor, in_oidx);
|
||||||
|
|
||||||
std::vector<int> lcoor(in_grid->Nd());
|
Coordinate lcoor(in_grid->Nd());
|
||||||
|
|
||||||
for(int lane=0; lane < in_nsimd; lane++){
|
for(int lane=0; lane < in_nsimd; lane++){
|
||||||
for(int mu=0;mu<ndim;mu++)
|
for(int mu=0;mu<ndim;mu++)
|
||||||
@ -597,7 +597,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
|
|
||||||
//Unpack into those ptrs
|
//Unpack into those ptrs
|
||||||
const vobj & in_vobj = in[in_oidx];
|
const vobj & in_vobj = in[in_oidx];
|
||||||
extract1(in_vobj, out_ptrs, 0);
|
extract(in_vobj, out_ptrs, 0);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
|
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
|
||||||
@ -612,10 +612,10 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
GridBase* grid = out.Grid();
|
GridBase* grid = out.Grid();
|
||||||
assert(in.size()==grid->lSites());
|
assert(in.size()==grid->lSites());
|
||||||
|
|
||||||
int ndim = grid->Nd();
|
const int ndim = grid->Nd();
|
||||||
int nsimd = vtype::Nsimd();
|
constexpr int nsimd = vtype::Nsimd();
|
||||||
|
|
||||||
std::vector<std::vector<int> > icoor(nsimd);
|
std::vector<Coordinate > icoor(nsimd);
|
||||||
|
|
||||||
for(int lane=0; lane < nsimd; lane++){
|
for(int lane=0; lane < nsimd; lane++){
|
||||||
icoor[lane].resize(ndim);
|
icoor[lane].resize(ndim);
|
||||||
@ -624,12 +624,11 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
|
|
||||||
thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{
|
thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
std::vector<sobj*> ptrs(nsimd);
|
ExtractPointerArray<sobj> ptrs(nsimd);
|
||||||
|
|
||||||
std::vector<int> ocoor(ndim);
|
Coordinate ocoor(ndim);
|
||||||
|
Coordinate lcoor(ndim);
|
||||||
grid->oCoorFromOindex(ocoor, oidx);
|
grid->oCoorFromOindex(ocoor, oidx);
|
||||||
|
|
||||||
std::vector<int> lcoor(grid->Nd());
|
|
||||||
|
|
||||||
for(int lane=0; lane < nsimd; lane++){
|
for(int lane=0; lane < nsimd; lane++){
|
||||||
|
|
||||||
@ -644,7 +643,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
|
|
||||||
//pack from those ptrs
|
//pack from those ptrs
|
||||||
vobj vecobj;
|
vobj vecobj;
|
||||||
merge1(vecobj, ptrs, 0);
|
merge(vecobj, ptrs, 0);
|
||||||
out[oidx] = vecobj;
|
out[oidx] = vecobj;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -664,7 +663,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
|||||||
int ndim = out.Grid()->Nd();
|
int ndim = out.Grid()->Nd();
|
||||||
int out_nsimd = out_grid->Nsimd();
|
int out_nsimd = out_grid->Nsimd();
|
||||||
|
|
||||||
std::vector<std::vector<int> > out_icoor(out_nsimd);
|
std::vector<Coordinate > out_icoor(out_nsimd);
|
||||||
|
|
||||||
for(int lane=0; lane < out_nsimd; lane++){
|
for(int lane=0; lane < out_nsimd; lane++){
|
||||||
out_icoor[lane].resize(ndim);
|
out_icoor[lane].resize(ndim);
|
||||||
@ -675,12 +674,12 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
|||||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||||
|
|
||||||
thread_loop( (uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++),{
|
thread_loop( (uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++),{
|
||||||
std::vector<int> out_ocoor(ndim);
|
Coordinate out_ocoor(ndim);
|
||||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||||
|
|
||||||
std::vector<SobjOut*> ptrs(out_nsimd);
|
ExtractPointerArray<SobjOut> ptrs(out_nsimd);
|
||||||
|
|
||||||
std::vector<int> lcoor(out_grid->Nd());
|
Coordinate lcoor(out_grid->Nd());
|
||||||
|
|
||||||
for(int lane=0; lane < out_nsimd; lane++){
|
for(int lane=0; lane < out_nsimd; lane++){
|
||||||
for(int mu=0;mu<ndim;mu++)
|
for(int mu=0;mu<ndim;mu++)
|
||||||
@ -778,7 +777,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
assert(nvector*split_nproc==full_nproc);
|
assert(nvector*split_nproc==full_nproc);
|
||||||
assert(nvector == full_vecs);
|
assert(nvector == full_vecs);
|
||||||
|
|
||||||
std::vector<int> ratio(ndim);
|
Coordinate ratio(ndim);
|
||||||
for(int d=0;d<ndim;d++){
|
for(int d=0;d<ndim;d++){
|
||||||
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
|
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
|
||||||
}
|
}
|
||||||
@ -797,7 +796,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nvec = nvector; // Counts down to 1 as we collapse dims
|
int nvec = nvector; // Counts down to 1 as we collapse dims
|
||||||
std::vector<int> ldims = full_grid->_ldimensions;
|
Coordinate ldims = full_grid->_ldimensions;
|
||||||
|
|
||||||
for(int d=ndim-1;d>=0;d--){
|
for(int d=ndim-1;d>=0;d--){
|
||||||
|
|
||||||
@ -824,7 +823,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
|
|
||||||
// Loop over reordered data post A2A
|
// Loop over reordered data post A2A
|
||||||
thread_loop( (int c=0;c<chunk;c++),{
|
thread_loop( (int c=0;c<chunk;c++),{
|
||||||
std::vector<int> coor(ndim);
|
Coordinate coor(ndim);
|
||||||
for(int m=0;m<M;m++){
|
for(int m=0;m<M;m++){
|
||||||
for(int s=0;s<sP;s++){
|
for(int s=0;s<sP;s++){
|
||||||
|
|
||||||
@ -904,7 +903,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
assert(nvector*split_nproc==full_nproc);
|
assert(nvector*split_nproc==full_nproc);
|
||||||
assert(nvector == full_vecs);
|
assert(nvector == full_vecs);
|
||||||
|
|
||||||
std::vector<int> ratio(ndim);
|
Coordinate ratio(ndim);
|
||||||
for(int d=0;d<ndim;d++){
|
for(int d=0;d<ndim;d++){
|
||||||
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
|
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
|
||||||
}
|
}
|
||||||
@ -923,7 +922,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
|
|
||||||
int nvec = 1;
|
int nvec = 1;
|
||||||
uint64_t rsites = split_grid->lSites();
|
uint64_t rsites = split_grid->lSites();
|
||||||
std::vector<int> rdims = split_grid->_ldimensions;
|
Coordinate rdims = split_grid->_ldimensions;
|
||||||
|
|
||||||
for(int d=0;d<ndim;d++){
|
for(int d=0;d<ndim;d++){
|
||||||
|
|
||||||
@ -943,7 +942,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
{
|
{
|
||||||
// Loop over reordered data post A2A
|
// Loop over reordered data post A2A
|
||||||
thread_loop( (int c=0;c<chunk;c++),{
|
thread_loop( (int c=0;c<chunk;c++),{
|
||||||
std::vector<int> coor(ndim);
|
Coordinate coor(ndim);
|
||||||
for(int m=0;m<M;m++){
|
for(int m=0;m<M;m++){
|
||||||
for(int s=0;s<sP;s++){
|
for(int s=0;s<sP;s++){
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user