mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
Further updates
This commit is contained in:
parent
e859a199df
commit
2c54be651c
@ -94,26 +94,27 @@ public:
|
|||||||
int ghost_zone=0;
|
int ghost_zone=0;
|
||||||
for(int32_t point = 0 ; point < geom.npoint; point++){
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
int i=s*geom.npoint+point;
|
int i=s*geom.npoint+point;
|
||||||
if( Stencil._entries[i]._permute ) {
|
if( Stencil._entries[i]._wrap ) {
|
||||||
ghost_zone=1;
|
ghost_zone=1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// std::cout << "site " <<s<<"/"<<sites <<" ghost_zone "<<ghost_zone<<std::endl;
|
||||||
GeneralStencilEntryReordered tmp;
|
GeneralStencilEntryReordered tmp;
|
||||||
if( ghost_zone==0) {
|
if( ghost_zone==0) {
|
||||||
for(int32_t point = 0 ; point < geom.npoint; point++){
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
int i=s*geom.npoint+point;
|
int i=s*geom.npoint+point;
|
||||||
tmp._offset = Stencil._entries[i]._offset;
|
tmp._offset = Stencil._entries[i]._offset;
|
||||||
tmp._permute= Stencil._entries[i]._permute; // Should be no premute and j=site
|
tmp._wrap= Stencil._entries[i]._wrap; // Should be no premute and j=site
|
||||||
tmp._input = s;
|
tmp._input = s;
|
||||||
StencilTmp.push_back(tmp);
|
StencilTmp.push_back(tmp);
|
||||||
}
|
}
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << " oSites " << _CoarseGridMulti->oSites()<<std::endl;
|
std::cout << " oSites " << _CoarseGridMulti->oSites()<<std::endl;
|
||||||
std::cout << " npoint " << geom.npoint<<std::endl;
|
std::cout << " npoint " << geom.npoint<<std::endl;
|
||||||
std::cout << " StencilTmp "<<StencilTmp.size();
|
std::cout << " StencilTmp "<<StencilTmp.size()<<std::endl;
|
||||||
|
|
||||||
assert(_CoarseGridMulti->oSites()*geom.npoint==StencilTmp.size());
|
assert(_CoarseGridMulti->oSites()*geom.npoint==StencilTmp.size());
|
||||||
acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
|
acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
|
||||||
CopyMatrix();
|
CopyMatrix();
|
||||||
@ -198,9 +199,9 @@ public:
|
|||||||
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
||||||
+ 2.0*osites*sizeof(siteVector)*npoint;
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
|
// std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
|
||||||
std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
// std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
||||||
std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
// std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
||||||
tmult-=usecond();
|
tmult-=usecond();
|
||||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
||||||
@ -282,9 +283,9 @@ public:
|
|||||||
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
||||||
+ 2.0*osites*sizeof(siteVector)*npoint;
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize "<<StencilMasked.size()<<std::endl;
|
// std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize "<<StencilMasked.size()<<std::endl;
|
||||||
std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
// std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
||||||
std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
// std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
||||||
tmult-=usecond();
|
tmult-=usecond();
|
||||||
auto Stencil_v = &StencilMasked[0];
|
auto Stencil_v = &StencilMasked[0];
|
||||||
accelerator_for(rspb, StencilMasked.size()*nbasis, Nsimd, {
|
accelerator_for(rspb, StencilMasked.size()*nbasis, Nsimd, {
|
||||||
@ -294,14 +295,19 @@ public:
|
|||||||
int32_t point= bp/nbasis;
|
int32_t point= bp/nbasis;
|
||||||
int32_t b = bp%nbasis;
|
int32_t b = bp%nbasis;
|
||||||
auto SE = &Stencil_v[ss*npoint+point];
|
auto SE = &Stencil_v[ss*npoint+point];
|
||||||
int32_t s = SE->_input;
|
int32_t s = SE->_input; // site of padded
|
||||||
int32_t snbr= SE->_offset;
|
int32_t snbr= SE->_offset;
|
||||||
std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<std::endl;
|
|
||||||
auto nbr = coalescedRead(in_v[snbr]);
|
auto nbr = coalescedRead(in_v[snbr]);
|
||||||
auto res = Aview_p[point][s](0,b)*nbr(0);
|
auto res = Aview_p[point][s](0,b)*nbr(0);
|
||||||
for(int bb=1;bb<nbasis;bb++) {
|
for(int bb=1;bb<nbasis;bb++) {
|
||||||
res = res + Aview_p[point][s](bb,b)*nbr(bb);
|
res = res + Aview_p[point][s](bb,b)*nbr(bb);
|
||||||
}
|
}
|
||||||
|
// std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<" "<< innerProduct(res,res) <<std::endl;
|
||||||
|
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" res "<< innerProduct(res,res) <<std::endl;
|
||||||
|
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbrIP "<< innerProduct(nbr,nbr) <<std::endl;
|
||||||
|
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< nbr <<std::endl;
|
||||||
|
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" nbr "<< in_v[snbr] <<std::endl;
|
||||||
|
// std::cout << " unpadded " << ss<<" point "<<point <<" row " <<b<<" A "<< innerProduct(Aview_p[point][s],Aview_p[point][s]) <<std::endl;
|
||||||
coalescedWrite(Vview_p[point][ss](b),res);
|
coalescedWrite(Vview_p[point][ss](b),res);
|
||||||
});
|
});
|
||||||
tmult2-=usecond();
|
tmult2-=usecond();
|
||||||
@ -332,10 +338,10 @@ public:
|
|||||||
std::cout << GridLogMessage<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
||||||
// std::cout << GridLogMessage<<std::endl;
|
// std::cout << GridLogMessage<<std::endl;
|
||||||
std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
||||||
|
|
||||||
};
|
};
|
||||||
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
||||||
|
@ -851,8 +851,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
#endif
|
#endif
|
||||||
});
|
});
|
||||||
t_acc+=usecond();
|
t_acc+=usecond();
|
||||||
std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
|
// std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
|
||||||
std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
|
// std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
|
||||||
acceleratorFreeDevice(table_d);
|
acceleratorFreeDevice(table_d);
|
||||||
free(table);
|
free(table);
|
||||||
|
|
||||||
|
@ -95,32 +95,38 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
|||||||
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
||||||
|
|
||||||
// scalar layout won't coalesce
|
// scalar layout won't coalesce
|
||||||
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
|
#ifdef GRID_SIMT
|
||||||
int olane=blane%rNsimd; // reduced lattice lane
|
{
|
||||||
int obit =blane/rNsimd;
|
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
|
||||||
|
#else
|
||||||
|
for(int blane=0;blane<Nsimd;blane++) {
|
||||||
|
#endif
|
||||||
|
int olane=blane%rNsimd; // reduced lattice lane
|
||||||
|
int obit =blane/rNsimd;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
|
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
int ssp = ss*simd[dim]+obit;
|
int ssp = ss*simd[dim]+obit;
|
||||||
int b = ssp%block;
|
int b = ssp%block;
|
||||||
int n = ssp/block;
|
int n = ssp/block;
|
||||||
int osite= b+n*stride + ox*block;
|
int osite= b+n*stride + ox*block;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// isite -- map lane within buffer to lane within lattice
|
// isite -- map lane within buffer to lane within lattice
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
int lane;
|
int lane;
|
||||||
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
|
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
|
||||||
icoor[dim]=ix;
|
icoor[dim]=ix;
|
||||||
Lexicographic::IndexFromCoor(icoor,lane,simd);
|
Lexicographic::IndexFromCoor(icoor,lane,simd);
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Transfer into lattice - will coalesce
|
// Transfer into lattice - will coalesce
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
sobj obj = extractLane(blane,buf_p[ss+offset]);
|
sobj obj = extractLane(blane,buf_p[ss+offset]);
|
||||||
insertLane(lane,lat_v[osite],obj);
|
insertLane(lane,lat_v[osite],obj);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,34 +171,39 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
|||||||
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
|
||||||
|
|
||||||
// scalar layout won't coalesce
|
// scalar layout won't coalesce
|
||||||
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
|
#ifdef GRID_SIMT
|
||||||
int olane=blane%rNsimd; // reduced lattice lane
|
{
|
||||||
int obit =blane/rNsimd;
|
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
|
||||||
|
#else
|
||||||
|
for(int blane=0;blane<Nsimd;blane++) {
|
||||||
|
#endif
|
||||||
|
int olane=blane%rNsimd; // reduced lattice lane
|
||||||
|
int obit =blane/rNsimd;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// osite
|
// osite
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
int ssp = ss*simd[dim]+obit;
|
int ssp = ss*simd[dim]+obit;
|
||||||
int b = ssp%block;
|
int b = ssp%block;
|
||||||
int n = ssp/block;
|
int n = ssp/block;
|
||||||
int osite= b+n*stride + ox*block;
|
int osite= b+n*stride + ox*block;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// isite -- map lane within buffer to lane within lattice
|
// isite -- map lane within buffer to lane within lattice
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
int lane;
|
int lane;
|
||||||
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
|
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
|
||||||
icoor[dim]=ix;
|
icoor[dim]=ix;
|
||||||
Lexicographic::IndexFromCoor(icoor,lane,simd);
|
Lexicographic::IndexFromCoor(icoor,lane,simd);
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Take out of lattice
|
// Take out of lattice
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
|
||||||
sobj obj = extractLane(lane,lat_v[osite]);
|
|
||||||
insertLane(blane,buf_p[ss+offset],obj);
|
|
||||||
|
|
||||||
|
sobj obj = extractLane(lane,lat_v[osite]);
|
||||||
|
insertLane(blane,buf_p[ss+offset],obj);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
/*
|
/*
|
||||||
int words =block*nblock/simd[dim];
|
int words =block*nblock/simd[dim];
|
||||||
|
@ -32,6 +32,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
struct GeneralStencilEntry {
|
struct GeneralStencilEntry {
|
||||||
uint64_t _offset; // 4 bytes
|
uint64_t _offset; // 4 bytes
|
||||||
uint8_t _permute; // 1 bytes // Horrible alignment properties
|
uint8_t _permute; // 1 bytes // Horrible alignment properties
|
||||||
|
uint8_t _wrap; // 1 bytes // Horrible alignment properties
|
||||||
};
|
};
|
||||||
struct GeneralStencilEntryReordered : public GeneralStencilEntry {
|
struct GeneralStencilEntryReordered : public GeneralStencilEntry {
|
||||||
uint64_t _input;
|
uint64_t _input;
|
||||||
@ -105,10 +106,12 @@ public:
|
|||||||
// Simpler version using icoor calculation
|
// Simpler version using icoor calculation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
SE._permute =0;
|
SE._permute =0;
|
||||||
|
SE._wrap=0;
|
||||||
for(int d=0;d<Coor.size();d++){
|
for(int d=0;d<Coor.size();d++){
|
||||||
|
|
||||||
int fd = grid->_fdimensions[d];
|
int fd = grid->_fdimensions[d];
|
||||||
int rd = grid->_rdimensions[d];
|
int rd = grid->_rdimensions[d];
|
||||||
|
int ld = grid->_ldimensions[d];
|
||||||
int ly = grid->_simd_layout[d];
|
int ly = grid->_simd_layout[d];
|
||||||
|
|
||||||
assert((ly==1)||(ly==2)||(ly==grid->Nsimd()));
|
assert((ly==1)||(ly==2)||(ly==grid->Nsimd()));
|
||||||
@ -116,6 +119,10 @@ public:
|
|||||||
int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1
|
int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1
|
||||||
int x = Coor[d]; // x in [0... rd-1] as an oSite
|
int x = Coor[d]; // x in [0... rd-1] as an oSite
|
||||||
|
|
||||||
|
if ( (x + shift)%fd != (x+shift)%ld ){
|
||||||
|
SE._wrap = 1;
|
||||||
|
}
|
||||||
|
|
||||||
int permute_dim = grid->PermuteDim(d);
|
int permute_dim = grid->PermuteDim(d);
|
||||||
int permute_slice=0;
|
int permute_slice=0;
|
||||||
if(permute_dim){
|
if(permute_dim){
|
||||||
|
Loading…
Reference in New Issue
Block a user