1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Better-optimised face gather/scatter

This commit is contained in:
Peter Boyle 2023-12-22 18:03:38 -05:00
parent 66a1b63aa9
commit dd13937bb6

View File

@ -62,6 +62,8 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
{ {
const int Nsimd=vobj::Nsimd(); const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *grid = lat.Grid(); GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout; Coordinate simd = grid->_simd_layout;
@ -124,8 +126,19 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
/////////////////////////////////////////// ///////////////////////////////////////////
// Transfer into lattice - will coalesce // Transfer into lattice - will coalesce
/////////////////////////////////////////// ///////////////////////////////////////////
#if 0
sobj obj = extractLane(blane,buf_p[ss+offset]); sobj obj = extractLane(blane,buf_p[ss+offset]);
insertLane(lane,lat_v[osite],obj); insertLane(lane,lat_v[osite],obj);
#else
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * from = (vector_type *)&buf_p[ss+offset];
vector_type * to = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], blane);
putlane(to[w], stmp, lane);
}
#endif
} }
}); });
} }
@ -138,6 +151,8 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
{ {
const int Nsimd=vobj::Nsimd(); const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
autoView(lat_v, lat, AcceleratorRead); autoView(lat_v, lat, AcceleratorRead);
@ -200,9 +215,20 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
/////////////////////////////////////////// ///////////////////////////////////////////
// Take out of lattice // Take out of lattice
/////////////////////////////////////////// ///////////////////////////////////////////
#if 0
sobj obj = extractLane(lane,lat_v[osite]); sobj obj = extractLane(lane,lat_v[osite]);
insertLane(blane,buf_p[ss+offset],obj); insertLane(blane,buf_p[ss+offset],obj);
#else
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * to = (vector_type *)&buf_p[ss+offset];
vector_type * from = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], lane);
putlane(to[w], stmp, blane);
}
#endif
} }
}); });
/* /*
@ -545,14 +571,15 @@ public:
t_scatter+= usecond() - t; t_scatter+= usecond() - t;
t_tot+=usecond(); t_tot+=usecond();
std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes :" << depth*bytes/1e6 << "MB"<<std::endl;
} }
}; };