mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-12 20:27:06 +01:00
Merge GPU support (upstream/develop) into distillation branch.
This compiles and looks right ... but may need some testing * develop: (762 commits) Tensor ambiguous fix Fix for GCC preprocessor/pragma handling bug Trips up NVCC for reasons I don't understand on summit Fix GCC complaint Zero() change Force a couple of things to compile on NVCC Remove debug code nvcc error suppress Merge develop Reduction finished and hopefully fixes CI regression fail on single precision and force Double precision variants for summation accuracy Update todo list Freeze the seed Fix compiling of MSource::Gauss for single precision Think the reduction is now sorted and cleaned up Fix force term Printing improvement GPU reduction fix and also exit backtrace option GPU friendly Simplify the comms benchmark ... # Conflicts: # Grid/communicator/SharedMemoryMPI.cc # Grid/qcd/action/fermion/WilsonKernelsAsm.cc # Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h # Grid/qcd/smearing/StoutSmearing.h # Hadrons/Modules.hpp # Hadrons/Utilities/Contractor.cc # Hadrons/modules.inc # tests/forces/Test_dwf_force_eofa.cc # tests/forces/Test_dwf_gpforce_eofa.cc
This commit is contained in:
@ -76,6 +76,7 @@ template <typename FImpl>
|
||||
class TBContraction: public Module<BContractionPar>
|
||||
{
|
||||
public:
|
||||
using W = WilsonImplR; // Debug so I can see type info for default FImpl
|
||||
FERM_TYPE_ALIASES(FImpl,);
|
||||
public:
|
||||
// constructor
|
||||
@ -174,15 +175,25 @@ void TBContraction<FImpl>::execute(void)
|
||||
int Bindex;
|
||||
int Nc=3; //Num colours
|
||||
|
||||
FermionField tmp1(grid3d);
|
||||
FermionField tmp2(grid3d);
|
||||
FermionField tmp3(grid3d);
|
||||
FermionField ftmp1(grid3d);
|
||||
FermionField ftmp2(grid3d);
|
||||
FermionField ftmp3(grid3d);
|
||||
LatticeView<typename FImpl::SiteSpinor> tmp1{ ftmp1 };
|
||||
LatticeView<typename FImpl::SiteSpinor> tmp2{ ftmp2 };
|
||||
LatticeView<typename FImpl::SiteSpinor> tmp3{ ftmp3 };
|
||||
//std::complex<double> * tmp33 = reinterpret_cast<std::complex<double> *>(&(tmp3[0]()(0)(0)));
|
||||
|
||||
#ifdef THIS_IS_NAUGHTY_TODO_FIXME
|
||||
// The reinterpret_cast gets rid of SIMD attributes
|
||||
// Plus other badness - e.g. we perhaps shouldn't explicitly say SpinColourVector
|
||||
// ... but rather use some correct FIMPL types
|
||||
// REVIEW WITH PETER
|
||||
#endif
|
||||
|
||||
SpinColourVector * tmp11 = reinterpret_cast<SpinColourVector *>(&(tmp1[0]()(0)(0)));
|
||||
SpinColourVector * tmp22 = reinterpret_cast<SpinColourVector *>(&(tmp2[0]()(0)(0)));
|
||||
SpinColourVector * tmp33 = reinterpret_cast<SpinColourVector *>(&(tmp3[0]()(0)(0)));
|
||||
|
||||
|
||||
SpinVector tmp11s;
|
||||
SpinVector tmp22s;
|
||||
SpinVector tmp33s;
|
||||
@ -225,10 +236,10 @@ void TBContraction<FImpl>::execute(void)
|
||||
for (int imom=0 ; imom < Nmom ; imom++){
|
||||
for (int t=0 ; t < Nt ; t++){
|
||||
Bindex = i1 + N_1*(i2 + N_2*(i3 + N_3*(imom+Nmom*t)));
|
||||
ExtractSliceLocal(tmp1,one[i1],0,t,3);
|
||||
ExtractSliceLocal(tmp2,two[i2],0,t,3);
|
||||
ExtractSliceLocal(tmp3,three[i3],0,t,3);
|
||||
parallel_for (unsigned int sU = 0; sU < grid3d->oSites(); ++sU)
|
||||
ExtractSliceLocal(ftmp1,one[i1],0,t,3);
|
||||
ExtractSliceLocal(ftmp2,two[i2],0,t,3);
|
||||
ExtractSliceLocal(ftmp3,three[i3],0,t,3);
|
||||
accelerator_for(sU, grid3d->oSites(), grid3d->Nsimd(),
|
||||
{
|
||||
for (int ie=0 ; ie < 6 ; ie++){
|
||||
// Why does peekColour not work????
|
||||
@ -248,7 +259,7 @@ void TBContraction<FImpl>::execute(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -187,10 +187,10 @@ void TDistilVectors<FImpl>::setup(void)
|
||||
envCreate(std::vector<FermionField>, SinkName, 1, nnoise*LI*SI*Nt_inv, envGetGrid(FermionField));
|
||||
|
||||
grid4d = env().getGrid();
|
||||
std::vector<int> latt_size = GridDefaultLatt();
|
||||
std::vector<int> simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||
std::vector<int> simd_layout_3 = GridDefaultSimd(Nd-1, vComplex::Nsimd());
|
||||
Coordinate latt_size = GridDefaultLatt();
|
||||
Coordinate simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
Coordinate simd_layout_3 = GridDefaultSimd(Nd-1, vComplex::Nsimd());
|
||||
latt_size[Nd-1] = 1;
|
||||
simd_layout_3.push_back( 1 );
|
||||
mpi_layout[Nd-1] = 1;
|
||||
@ -233,7 +233,7 @@ void TDistilVectors<FImpl>::execute(void)
|
||||
const int Ntlocal{ grid4d->LocalDimensions()[3] };
|
||||
const int Ntfirst{ grid4d->LocalStarts()[3] };
|
||||
|
||||
const int Ns{ Grid::QCD::Ns };
|
||||
const int Ns{ Ns };
|
||||
const int Nt{ env().getDim(Tdir) };
|
||||
const int TI{ Hadrons::MDistil::DistilParameters::ParameterDefault( par().TI, Nt, false ) };
|
||||
const int LI{ static_cast<int>( perambulator.tensor.dimension(2) ) };
|
||||
@ -254,20 +254,20 @@ void TDistilVectors<FImpl>::execute(void)
|
||||
for( int dt = 0; dt < Nt_inv; dt++ ) {
|
||||
for( int ds = 0; ds < SI; ds++ ) {
|
||||
vecindex = inoise + nnoise * dk + nnoise * LI * ds + nnoise *LI * SI*dt;
|
||||
rho[vecindex] = zero;
|
||||
tmp3d_nospin = zero;
|
||||
rho[vecindex] = 0;
|
||||
tmp3d_nospin = 0;
|
||||
for (int it = dt; it < Nt; it += TI){
|
||||
if (full_tdil) t_inv = tsrc; else t_inv = it;
|
||||
if( t_inv >= Ntfirst && t_inv < Ntfirst + Ntlocal ) {
|
||||
for (int ik = dk; ik < nvec; ik += LI){
|
||||
for (int is = ds; is < Ns; is += SI){
|
||||
ExtractSliceLocal(evec3d,epack.evec[ik],0,t_inv-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(evec3d,epack.evec[ik],0,t_inv-Ntfirst,Tdir);
|
||||
//tmp3d_nospin = evec3d * noise[inoise + nnoise*(t_inv + Nt*(ik+nvec*is))];
|
||||
tmp3d_nospin = evec3d * noise(inoise, t_inv, ik, is);
|
||||
tmp3d=zero;
|
||||
tmp3d=0;
|
||||
pokeSpin(tmp3d,tmp3d_nospin,is);
|
||||
tmp2=zero;
|
||||
InsertSliceLocal(tmp3d,tmp2,0,t_inv-Ntfirst,Grid::QCD::Tdir);
|
||||
tmp2=0;
|
||||
InsertSliceLocal(tmp3d,tmp2,0,t_inv-Ntfirst,Tdir);
|
||||
rho[vecindex] += tmp2;
|
||||
}
|
||||
}
|
||||
@ -285,14 +285,14 @@ void TDistilVectors<FImpl>::execute(void)
|
||||
for( int dt = 0; dt < Nt_inv; dt++ ) {
|
||||
for( int ds = 0; ds < SI; ds++ ) {
|
||||
vecindex = inoise + nnoise * dk + nnoise * LI * ds + nnoise *LI * SI*dt;
|
||||
phi[vecindex] = zero;
|
||||
phi[vecindex] = 0;
|
||||
for (int t = Ntfirst; t < Ntfirst + Ntlocal; t++) {
|
||||
sink_tslice=zero;
|
||||
sink_tslice=0;
|
||||
for (int ivec = 0; ivec < nvec; ivec++) {
|
||||
ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Tdir);
|
||||
sink_tslice += evec3d * perambulator(t, ivec, dk, inoise,dt,ds);
|
||||
}
|
||||
InsertSliceLocal(sink_tslice,phi[vecindex],0,t-Ntfirst,Grid::QCD::Tdir);
|
||||
InsertSliceLocal(sink_tslice,phi[vecindex],0,t-Ntfirst,Tdir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -236,7 +236,7 @@ void TLapEvec<GImpl>::execute(void)
|
||||
eig[t].resize(LPar.Nk+LPar.Np,gridLD);
|
||||
|
||||
// Construct smearing operator
|
||||
ExtractSliceLocal(UmuNoTime,Umu_smear,0,t,Grid::QCD::Tdir); // switch to 3d/4d objects
|
||||
ExtractSliceLocal(UmuNoTime,Umu_smear,0,t,Tdir); // switch to 3d/4d objects
|
||||
LinOpPeardonNabla<LatticeColourVector> PeardonNabla(UmuNoTime);
|
||||
LOG(Debug) << "Chebyshev preconditioning to order " << ChebPar.PolyOrder
|
||||
<< " with parameters (alpha,beta) = (" << ChebPar.alpha << "," << ChebPar.beta << ")" << std::endl;
|
||||
@ -263,7 +263,7 @@ void TLapEvec<GImpl>::execute(void)
|
||||
RotateEigen( eig[t].evec ); // Rotate the eigenvectors into our phase convention
|
||||
|
||||
for (int i=0;i<LPar.Nvec;i++){
|
||||
InsertSliceLocal(eig[t].evec[i],eig4d.evec[i],0,t,Grid::QCD::Tdir);
|
||||
InsertSliceLocal(eig[t].evec[i],eig4d.evec[i],0,t,Tdir);
|
||||
if(t==0 && Ntfirst==0)
|
||||
eig4d.eval[i] = eig[t].eval[i]; // TODO: Discuss: is this needed? Is there a better way?
|
||||
}
|
||||
|
@ -110,7 +110,7 @@ template <typename FImpl>
|
||||
void TNoises<FImpl>::setup(void)
|
||||
{
|
||||
const int Nt{env().getDim(Tdir)};
|
||||
const int Ns{Grid::QCD::Ns};
|
||||
//const int Ns{Grid::Ns};
|
||||
const int nnoise{par().nnoise};
|
||||
const int nvec{par().nvec};
|
||||
const int TI{ Hadrons::MDistil::DistilParameters::ParameterDefault( par().TI, Nt, true) };
|
||||
@ -123,7 +123,7 @@ template <typename FImpl>
|
||||
void TNoises<FImpl>::execute(void)
|
||||
{
|
||||
const int Nt{env().getDim(Tdir)};
|
||||
const int Ns{Grid::QCD::Ns};
|
||||
//const int Ns{Grid::Ns};
|
||||
const int nnoise{par().nnoise};
|
||||
const int nvec{par().nvec};
|
||||
const int TI{ Hadrons::MDistil::DistilParameters::ParameterDefault( par().TI, Nt, false) };
|
||||
|
@ -169,9 +169,9 @@ void TPerambFromSolve<FImpl>::execute(void)
|
||||
for (int is = 0; is < Ns; is++) {
|
||||
result_nospin = peekSpin(solve[inoise+nnoise*(dk+LI*(dt+Nt_inv*ds))],is);
|
||||
for (int t = Ntfirst; t < Ntfirst + Ntlocal; t++) {
|
||||
ExtractSliceLocal(result_3d,result_nospin,0,t-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(result_3d,result_nospin,0,t-Ntfirst,Tdir);
|
||||
for (int ivec = 0; ivec < nvec_reduced; ivec++) {
|
||||
ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Tdir);
|
||||
pokeSpin(perambulator(t, ivec, dk, inoise,dt,ds),static_cast<Complex>(innerProduct(evec3d, result_3d)),is);
|
||||
std::cout << "perambulator(t, ivec, dk, inoise,dt,ds)(is) = (" << t << "," << ivec << "," << dk << "," << inoise << "," << dt << "," << ds << ")(" << is << ") = " << perambulator(t, ivec, dk, inoise,dt,ds)()(is)() << std::endl;
|
||||
}
|
||||
|
@ -203,27 +203,27 @@ void TPerambulator<FImpl>::execute(void)
|
||||
for (int dt = 0; dt < Nt_inv; dt++) {
|
||||
for (int ds = 0; ds < SI; ds++) {
|
||||
std::cout << "LapH source vector from noise " << inoise << " and dilution component (d_k,d_t,d_alpha) : (" << dk << ","<< dt << "," << ds << ")" << std::endl;
|
||||
dist_source = zero;
|
||||
tmp3d_nospin = zero;
|
||||
evec3d = zero;
|
||||
dist_source = 0;
|
||||
tmp3d_nospin = 0;
|
||||
evec3d = 0;
|
||||
for (int it = dt; it < Nt; it += TI){
|
||||
if (full_tdil) t_inv = tsrc; else t_inv = it;
|
||||
if( t_inv >= Ntfirst && t_inv < Ntfirst + Ntlocal ) {
|
||||
for (int ik = dk; ik < nvec; ik += LI){
|
||||
for (int is = ds; is < Ns; is += SI){
|
||||
ExtractSliceLocal(evec3d,epack.evec[ik],0,t_inv-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(evec3d,epack.evec[ik],0,t_inv-Ntfirst,Tdir);
|
||||
//tmp3d_nospin = evec3d * noise[inoise + nnoise*(t_inv + Nt*(ik+nvec*is))];
|
||||
tmp3d_nospin = evec3d * noise(inoise, t_inv, ik, is);
|
||||
tmp3d=zero;
|
||||
tmp3d=0;
|
||||
pokeSpin(tmp3d,tmp3d_nospin,is);
|
||||
tmp2=zero;
|
||||
InsertSliceLocal(tmp3d,tmp2,0,t_inv-Ntfirst,Grid::QCD::Tdir);
|
||||
tmp2=0;
|
||||
InsertSliceLocal(tmp3d,tmp2,0,t_inv-Ntfirst,Tdir);
|
||||
dist_source += tmp2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
result=zero;
|
||||
result=0;
|
||||
v4dtmp = dist_source;
|
||||
if (Ls_ == 1){
|
||||
solver(result, v4dtmp);
|
||||
@ -238,9 +238,9 @@ void TPerambulator<FImpl>::execute(void)
|
||||
for (int is = 0; is < Ns; is++) {
|
||||
result_nospin = peekSpin(result,is);
|
||||
for (int t = Ntfirst; t < Ntfirst + Ntlocal; t++) {
|
||||
ExtractSliceLocal(result_3d,result_nospin,0,t-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(result_3d,result_nospin,0,t-Ntfirst,Tdir);
|
||||
for (int ivec = 0; ivec < nvec; ivec++) {
|
||||
ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Grid::QCD::Tdir);
|
||||
ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Tdir);
|
||||
pokeSpin(perambulator(t, ivec, dk, inoise,dt,ds),static_cast<Complex>(innerProduct(evec3d, result_3d)),is);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user