mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-14 13:57:07 +01:00
Compare commits
6 Commits
a78a61d76f
...
4ed2c2c74f
Author | SHA1 | Date | |
---|---|---|---|
4ed2c2c74f | |||
955da582b6 | |||
11b07b950d | |||
8f70cfeda9 | |||
ce64271048 | |||
5cc4f3241d |
@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const int Cshift_verbose=0;
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
@ -65,10 +65,10 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
||||
Cshift_comms(ret,rhs,dimension,shift);
|
||||
}
|
||||
t1=usecond();
|
||||
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 1
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
@ -175,11 +175,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
if (Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
@ -301,12 +303,243 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
#else
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=rhs.Grid();
|
||||
Lattice<vobj> temp(rhs.Grid());
|
||||
|
||||
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
int pd = rhs.Grid()->_processors[dimension];
|
||||
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||
vobj *send_buf;
|
||||
vobj *recv_buf;
|
||||
{
|
||||
grid->ShmBufferFreeAll();
|
||||
size_t bytes = buffer_size*sizeof(vobj);
|
||||
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
if (comm_proc==0) {
|
||||
|
||||
tcopy-=usecond();
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
tcopy+=usecond();
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
tgather-=usecond();
|
||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
xbytes+=bytes;
|
||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=rhs.Grid();
|
||||
const int Nsimd = grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_object scalar_object;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int pd = grid->_processors[dimension];
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
{
|
||||
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||
grid->ShmBufferFreeAll();
|
||||
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
int cb = (cbmask==0x2)? Odd : Even;
|
||||
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
// loop over outer coord planes orthog to dim
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
// FIXME call local permute copy if none are offnode.
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = &send_buf_extract[i][0];
|
||||
}
|
||||
tgather-=usecond();
|
||||
int sx = (x+sshift)%rd;
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int inner_bit = (Nsimd>>(permute_type+1));
|
||||
int ic= (i&inner_bit)? 1:0;
|
||||
|
||||
int my_coor = rd*ic + x;
|
||||
int nbr_coor = my_coor+sshift;
|
||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||
|
||||
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||
int nbr_lane = (i&(~inner_bit));
|
||||
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
if (nbr_ic) nbr_lane|=inner_bit;
|
||||
|
||||
assert (sx == nbr_ox);
|
||||
|
||||
if(nbr_proc){
|
||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||
xbytes+=bytes;
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
}
|
||||
|
||||
}
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
|
||||
}
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
||||
|
@ -522,11 +522,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
||||
int ostride=grid->_ostride[orthogdim];
|
||||
|
||||
//Reduce Data down to lvSum
|
||||
RealD t_sum =-usecond();
|
||||
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
|
||||
t_sum +=usecond();
|
||||
|
||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||
Coordinate icoor(Nd);
|
||||
|
||||
RealD t_rest =-usecond();
|
||||
for(int rt=0;rt<rd;rt++){
|
||||
|
||||
extract(lvSum[rt],extracted);
|
||||
@ -556,6 +559,9 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
||||
scalar_type * ptr = (scalar_type *) &result[0];
|
||||
int words = fd*sizeof(sobj)/sizeof(scalar_type);
|
||||
grid->GlobalSumVector(ptr, words);
|
||||
t_rest +=usecond();
|
||||
std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
|
||||
|
||||
}
|
||||
template<class vobj> inline
|
||||
std::vector<typename vobj::scalar_object>
|
||||
|
@ -6,6 +6,34 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#undef DELTA_F_EQ_2
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//Meson
|
||||
// Interested in
|
||||
//
|
||||
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
|
||||
//
|
||||
// Conventional meson field:
|
||||
//
|
||||
// = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)| G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
|
||||
// = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty) (x)|G| v_j(y,ty) >
|
||||
// = sum_ij PI_ji(tx) PI_ij(ty)
|
||||
//
|
||||
// G5-Hermiticity
|
||||
//
|
||||
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
|
||||
// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
|
||||
// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)| G g5 sum_i (|v_j(y,ty)> <w_i(x,tx)|)^dag ] -- (*)
|
||||
//
|
||||
// NB: Dag applies to internal indices spin,colour,complex
|
||||
//
|
||||
// = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)> <v_i(y,ty)| ]
|
||||
// = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)>
|
||||
// = sum_ij PionVV(ty) PionWW(tx)
|
||||
//
|
||||
// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker
|
||||
// expectation value. Otherwise biased.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename FImpl>
|
||||
class A2Autils
|
||||
{
|
||||
@ -26,7 +54,9 @@ public:
|
||||
|
||||
typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
|
||||
|
||||
template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
|
||||
|
||||
// output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
|
||||
template <typename TensorType>
|
||||
static void MesonField(TensorType &mat,
|
||||
const FermionField *lhs_wi,
|
||||
const FermionField *rhs_vj,
|
||||
@ -34,6 +64,14 @@ public:
|
||||
const std::vector<ComplexField > &mom,
|
||||
int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
|
||||
|
||||
template <typename TensorType>
|
||||
static void MesonFieldGPU(TensorType &mat,
|
||||
const FermionField *lhs_wi,
|
||||
const FermionField *rhs_vj,
|
||||
std::vector<Gamma::Algebra> gammas,
|
||||
const std::vector<ComplexField > &mom,
|
||||
int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
|
||||
/*
|
||||
static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
|
||||
const FermionField *wi,
|
||||
const FermionField *vj,
|
||||
@ -58,7 +96,8 @@ public:
|
||||
const FermionField *vi,
|
||||
const FermionField *vj,
|
||||
int orthogdim);
|
||||
|
||||
*/
|
||||
|
||||
template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
|
||||
static void AslashField(TensorType &mat,
|
||||
const FermionField *lhs_wi,
|
||||
@ -159,14 +198,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
||||
int MFlvol = ld*Lblock*Rblock*Nmom;
|
||||
|
||||
std::vector<SpinMatrix_v > lvSum(MFrvol);
|
||||
thread_for( r, MFrvol,{
|
||||
for(int r=0;r<MFrvol;r++){
|
||||
lvSum[r] = Zero();
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<SpinMatrix_s > lsSum(MFlvol);
|
||||
thread_for(r,MFlvol,{
|
||||
for(int r=0;r<MFlvol;r++){
|
||||
lsSum[r]=scalar_type(0.0);
|
||||
});
|
||||
}
|
||||
|
||||
int e1= grid->_slice_nblock[orthogdim];
|
||||
int e2= grid->_slice_block [orthogdim];
|
||||
@ -174,7 +213,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
||||
|
||||
// potentially wasting cores here if local time extent too small
|
||||
if (t_kernel) *t_kernel = -usecond();
|
||||
thread_for(r,rd,{
|
||||
for(int r=0;r<rd;r++) {
|
||||
|
||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||
|
||||
@ -213,10 +252,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||
thread_for(rt,rd,{
|
||||
for(int rt=0;rt<rd;rt++){
|
||||
|
||||
Coordinate icoor(Nd);
|
||||
ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
|
||||
@ -241,7 +280,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
||||
|
||||
}
|
||||
}}}
|
||||
});
|
||||
}
|
||||
if (t_kernel) *t_kernel += usecond();
|
||||
assert(mat.dimension(0) == Nmom);
|
||||
assert(mat.dimension(1) == Ngamma);
|
||||
@ -290,35 +329,112 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
||||
if (t_gsum) *t_gsum += usecond();
|
||||
}
|
||||
|
||||
const int A2Ablocking=8;
|
||||
template<typename vtype> using iVecSpinMatrix = iVector<iMatrix<iScalar<vtype>, Ns>, A2Ablocking>;
|
||||
typedef iVecSpinMatrix<Complex > VecSpinMatrix;
|
||||
typedef iVecSpinMatrix<vComplex > vVecSpinMatrix;
|
||||
typedef Lattice<vVecSpinMatrix> LatticeVecSpinMatrix;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//Meson
|
||||
// Interested in
|
||||
//
|
||||
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
|
||||
//
|
||||
// Conventional meson field:
|
||||
//
|
||||
// = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)| G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
|
||||
// = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty) (x)|G| v_j(y,ty) >
|
||||
// = sum_ij PI_ji(tx) PI_ij(ty)
|
||||
//
|
||||
// G5-Hermiticity
|
||||
//
|
||||
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
|
||||
// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
|
||||
// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)| G g5 sum_i (|v_j(y,ty)> <w_i(x,tx)|)^dag ] -- (*)
|
||||
//
|
||||
// NB: Dag applies to internal indices spin,colour,complex
|
||||
//
|
||||
// = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)> <v_i(y,ty)| ]
|
||||
// = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)>
|
||||
// = sum_ij PionVV(ty) PionWW(tx)
|
||||
//
|
||||
// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker
|
||||
// expectation value. Otherwise biased.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class FImpl>
|
||||
template <typename TensorType>
|
||||
void A2Autils<FImpl>::MesonFieldGPU(TensorType &mat,
|
||||
const FermionField *lhs_wi,
|
||||
const FermionField *rhs_vj,
|
||||
std::vector<Gamma::Algebra> gammas,
|
||||
const std::vector<ComplexField > &mom,
|
||||
int orthogdim, double *t_kernel, double *t_gsum)
|
||||
{
|
||||
const int block=A2Ablocking;
|
||||
typedef typename FImpl::SiteSpinor vobj;
|
||||
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
int Lblock = mat.dimension(3);
|
||||
int Rblock = mat.dimension(4);
|
||||
|
||||
// assert(Lblock % block==0);
|
||||
// assert(Rblock % block==0);
|
||||
|
||||
GridBase *grid = lhs_wi[0].Grid();
|
||||
|
||||
const int Nd = grid->_ndimension;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
int Nt = grid->GlobalDimensions()[orthogdim];
|
||||
int Ngamma = gammas.size();
|
||||
int Nmom = mom.size();
|
||||
|
||||
|
||||
LatticeVecSpinMatrix SpinMat(grid);
|
||||
LatticeVecSpinMatrix MomSpinMat(grid);
|
||||
|
||||
RealD t_afor = 0.0;
|
||||
RealD t_sum = 0.0;
|
||||
RealD t_pha = 0.0;
|
||||
RealD t_trace= 0.0;
|
||||
uint64_t ncall=0;
|
||||
|
||||
std::vector<VecSpinMatrix> sliced;
|
||||
for(int i=0;i<Lblock;i++){
|
||||
autoView(SpinMat_v,SpinMat,AcceleratorWrite);
|
||||
autoView(lhs_v,lhs_wi[i],AcceleratorRead);
|
||||
for(int jo=0;jo<Rblock;jo+=block){
|
||||
for(int j=jo;j<MIN(Rblock,jo+block);j++){
|
||||
int jj=j%block;
|
||||
autoView(rhs_v,rhs_vj[j],AcceleratorRead); // Create a vector of views
|
||||
//////////////////////////////////////////
|
||||
// Should write a SpinOuterColorTrace
|
||||
//////////////////////////////////////////
|
||||
t_afor-=usecond();
|
||||
accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
|
||||
auto left = conjugate(lhs_v(ss));
|
||||
auto right = rhs_v(ss);
|
||||
auto vv = SpinMat_v(ss);
|
||||
for(int s1=0;s1<Ns;s1++){
|
||||
for(int s2=0;s2<Ns;s2++){
|
||||
vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
|
||||
+ left()(s2)(1) * right()(s1)(1)
|
||||
+ left()(s2)(2) * right()(s1)(2);
|
||||
}}
|
||||
coalescedWrite(SpinMat_v[ss],vv);
|
||||
});
|
||||
t_afor+=usecond();
|
||||
}// j within block
|
||||
// After getting the sitewise product do the mom phase loop
|
||||
for(int m=0;m<Nmom;m++){
|
||||
t_pha-=usecond();
|
||||
MomSpinMat = SpinMat * mom[m];
|
||||
t_pha+=usecond();
|
||||
t_sum-=usecond();
|
||||
ncall++;
|
||||
sliceSum(MomSpinMat,sliced,orthogdim);
|
||||
t_sum+=usecond();
|
||||
t_trace-=usecond();
|
||||
for(int mu=0;mu<Ngamma;mu++){
|
||||
for(int t=0;t<sliced.size();t++){
|
||||
for(int j=jo;j<MIN(Rblock,jo+block);j++){
|
||||
int jj=j%block;
|
||||
auto tmp = sliced[t](jj);
|
||||
auto trSG = trace(tmp*Gamma(gammas[mu]));
|
||||
mat(m,mu,t,i,j) = trSG()();
|
||||
}
|
||||
}
|
||||
}
|
||||
t_trace+=usecond();
|
||||
}
|
||||
}//jo
|
||||
}
|
||||
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_afor "<<t_afor<<" us"<<std::endl;
|
||||
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_pha "<<t_pha<<" us"<<std::endl;
|
||||
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_sum "<<t_sum<<" us"<<std::endl;
|
||||
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU N_sum "<<ncall<<" calls"<<std::endl;
|
||||
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_trace "<<t_trace<<" us"<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
template<class FImpl>
|
||||
void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
|
||||
const FermionField *wi,
|
||||
@ -645,6 +761,7 @@ void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
|
||||
const int nog5=0;
|
||||
PionFieldXX(mat,vi,vj,orthogdim,nog5);
|
||||
}
|
||||
*/
|
||||
|
||||
// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
|
||||
//
|
||||
@ -992,9 +1109,9 @@ typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::va
|
||||
std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
|
||||
void>::type
|
||||
A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const TensorType &WW_sd,
|
||||
const FermionField *vs,
|
||||
const FermionField *vd)
|
||||
const TensorType &WW_sd,
|
||||
const FermionField *vs,
|
||||
const FermionField *vd)
|
||||
{
|
||||
GridBase *grid = vs[0].Grid();
|
||||
|
||||
@ -1062,7 +1179,6 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
}
|
||||
|
||||
for (int t = 0; t < N_t; t++){
|
||||
std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
|
||||
buf = WW_sd[t];
|
||||
thread_for(ss,grid->oSites(),{
|
||||
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
||||
|
@ -132,27 +132,17 @@ inline void cuda_mem(void)
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
}
|
||||
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
if ( num1*num2 ) { \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define accelerator_for6dNB(iter1, num1, \
|
||||
@ -175,19 +165,6 @@ inline void cuda_mem(void)
|
||||
}
|
||||
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
}
|
||||
|
||||
template<typename lambda> __global__
|
||||
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||
{
|
||||
@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||
Lambda(x,y,z);
|
||||
}
|
||||
}
|
||||
template<typename lambda> __global__
|
||||
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||
{
|
||||
// Weird permute is to make lane coalesce for large blocks
|
||||
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
|
||||
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
|
||||
uint64_t z = threadIdx.x;
|
||||
if ( (x < num1) && (y<num2) && (z<num3) ) {
|
||||
Lambda(x,y,z);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename lambda> __global__
|
||||
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
||||
@ -523,9 +489,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
|
||||
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
|
||||
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
||||
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
|
||||
prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
#define accelerator_for( iter, num, nsimd, ... ) \
|
||||
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
|
||||
|
16
systems/Linux-cuda/config-command
Normal file
16
systems/Linux-cuda/config-command
Normal file
@ -0,0 +1,16 @@
|
||||
../../configure \
|
||||
--enable-comms=mpi \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-shm=nvlink \
|
||||
--with-lime=$CLIME \
|
||||
--with-gmp=$GMP \
|
||||
--with-mpfr=$MPFR \
|
||||
--enable-accelerator=cuda \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--disable-unified \
|
||||
CXX=nvcc \
|
||||
LDFLAGS="-cudart shared -L$NVIDIALIB -lcublas" \
|
||||
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"
|
||||
|
12
systems/Linux-cuda/sourceme.sh
Normal file
12
systems/Linux-cuda/sourceme.sh
Normal file
@ -0,0 +1,12 @@
|
||||
. /home/paboyle/spack/share/spack/setup-env.sh
|
||||
spack load cuda@12.0.0
|
||||
spack load c-lime
|
||||
spack load gmp
|
||||
spack load mpfr
|
||||
spack load openmpi
|
||||
export CUDA=`spack find --paths cuda@11.8.0 | grep cuda | cut -c 14-`
|
||||
export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
|
||||
export GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
|
||||
export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
|
||||
export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/
|
||||
export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH
|
@ -1,7 +1,7 @@
|
||||
spack load c-lime
|
||||
spack load gmp
|
||||
spack load mpfr
|
||||
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
|
||||
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-`
|
||||
GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
|
||||
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
|
||||
echo clime X$CLIME
|
||||
|
@ -56,13 +56,9 @@ int main(int argc, char *argv[])
|
||||
|
||||
// MesonField lhs and rhs vectors
|
||||
std::vector<FermionField> phi(VDIM,&grid);
|
||||
std::vector<FermionField> rho(VDIM,&grid);
|
||||
FermionField rho_tmp(&grid);
|
||||
std::cout << GridLogMessage << "Initialising random meson fields" << std::endl;
|
||||
for (unsigned int i = 0; i < VDIM; ++i){
|
||||
random(pRNG,phi[i]);
|
||||
random(pRNG,rho_tmp); //ideally only nonzero on t=0
|
||||
rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0
|
||||
}
|
||||
std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl;
|
||||
|
||||
@ -82,7 +78,7 @@ int main(int argc, char *argv[])
|
||||
{1.,1.,1.},
|
||||
{2.,0.,0.}
|
||||
};
|
||||
|
||||
// 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call
|
||||
std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "Computing complex phases" << std::endl;
|
||||
@ -102,28 +98,47 @@ int main(int argc, char *argv[])
|
||||
std::cout << GridLogMessage << "Computing complex phases done." << std::endl;
|
||||
|
||||
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
|
||||
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
|
||||
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
|
||||
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
|
||||
|
||||
// timer
|
||||
double start,stop;
|
||||
|
||||
//execute meson field routine
|
||||
std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl;
|
||||
A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
|
||||
std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl;
|
||||
start = usecond();
|
||||
A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
|
||||
stop = usecond();
|
||||
std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
|
||||
start = usecond();
|
||||
/* Ideally, for this meson field we could pass TSRC (even better a list of timeslices)
|
||||
* to the routine so that all the compnents which are predictably equal to zero are not computed. */
|
||||
A2Autils<WilsonImplR>::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
|
||||
stop = usecond();
|
||||
std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
|
||||
start = usecond();
|
||||
A2Autils<WilsonImplR>::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
|
||||
stop = usecond();
|
||||
std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl;
|
||||
A2Autils<WilsonImplR>::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
|
||||
std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl;
|
||||
start = usecond();
|
||||
A2Autils<WilsonImplR>::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
|
||||
stop = usecond();
|
||||
std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl;
|
||||
|
||||
for(int mom=0;mom<momenta.size();mom++){
|
||||
for(int mu=0;mu<Gmu.size();mu++){
|
||||
for(int t=0;t<Nt;t++){
|
||||
for(int v=0;v<VDIM;v++){
|
||||
for(int w=0;w<VDIM;w++){
|
||||
std::cout << GridLogMessage
|
||||
<< " " << mom
|
||||
<< " " << mu
|
||||
<< " " << t
|
||||
<< " " << v
|
||||
<< " " << w
|
||||
<< " " << Mpp_gpu(mom,mu,t,v,w)
|
||||
<< " " << Mpp(mom,mu,t,v,w) << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string FileName = "Meson_Fields";
|
||||
#ifdef HAVE_HDF5
|
||||
using Default_Reader = Grid::Hdf5Reader;
|
||||
@ -134,12 +149,11 @@ int main(int argc, char *argv[])
|
||||
using Default_Writer = Grid::BinaryWriter;
|
||||
FileName.append(".bin");
|
||||
#endif
|
||||
|
||||
Default_Writer w(FileName);
|
||||
write(w,"phi_phi",Mpp);
|
||||
write(w,"phi_rho",Mpr);
|
||||
write(w,"rho_rho",Mrr);
|
||||
|
||||
{
|
||||
Default_Writer w(FileName);
|
||||
write(w,"phi_phi",Mpp);
|
||||
write(w,"phi_phi_gpu",Mpp_gpu);
|
||||
}
|
||||
// epilogue
|
||||
std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
|
||||
Grid_finalize();
|
||||
|
Reference in New Issue
Block a user