mirror of https://github.com/paboyle/Grid.git, synced 2024-11-10 07:55:35 +00:00

fast cpu basisRotate and other small cleanups

This commit is contained in:
parent 0e88bf4bff
commit 197612bc7a
@@ -136,34 +136,39 @@ void MemoryManager::Init(void)
   Ncache[SharedSmall]=Nc;
   }
   }
-  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
-#ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
-#endif
-
+  // only root node delivers messages, this is called before communicator is initialized,
+  // so need a manual restriction
+  if ( CartesianCommunicator::RankWorld() == 0 ) {
+    std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
+#ifdef ALLOCATION_CACHE
+    std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+#endif
+
 #ifdef GRID_UVM
   std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
 #ifdef GRID_CUDA
   std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_HIP
   std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_SYCL
   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
 #endif
 #else
   std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
 #ifdef GRID_CUDA
   std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
 #endif
 #ifdef GRID_HIP
   std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
 #endif
 #ifdef GRID_SYCL
   std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 #endif
 #endif
+  }
 }
 
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
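The hunk above wraps the init banners in a world-rank guard, since MemoryManager::Init() runs before the communicator is set up and would otherwise print once per rank. A minimal standalone sketch of the same "manual restriction" idea, assuming the rank can be read from a launcher environment variable before MPI exists; WorldRankGuess and the variable list are illustrative, not Grid code:

#include <cstdlib>
#include <initializer_list>
#include <iostream>

// Illustrative stand-in for a world-rank query that works before the
// communicator is initialised: fall back to launcher environment variables.
static int WorldRankGuess(void)
{
  for (const char *var : {"OMPI_COMM_WORLD_RANK", "PMI_RANK", "SLURM_PROCID"}) {
    if (const char *s = std::getenv(var)) return std::atoi(s);
  }
  return 0; // single-process run: act as the root
}

int main(void)
{
  if (WorldRankGuess() == 0) { // manual restriction: only the root prints
    std::cout << "MemoryManager::Init() setting up" << std::endl;
  }
  return 0;
}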
@@ -44,22 +44,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
   MPI_Initialized(&flag); // needed to coexist with other libs apparently
   if ( !flag ) {
-
-    // Fugaku Tofu: enable by default
-    /*
-#if defined (A64FX) || defined (A64FXFIXEDSIZE)
-#ifndef TOFU
-#define TOFU
-#pragma message ("TOFU network / MPI_THREAD_SERIALIZED")
-#endif
-#endif
-    */
-
 #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
     nCommThreads=1;
     // wrong results here too
-    //MPI_Init(argc,argv);
-    // comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
+    // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
     // other comms schemes are ok
     MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
 #else
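The surviving TOFU branch initialises MPI at the serialized threading level. A self-contained sketch, independent of Grid, of the same MPI_Initialized / MPI_Init_thread(MPI_THREAD_SERIALIZED) pattern plus a check of the level the library actually grants:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
  int flag = 0, provided = 0;
  MPI_Initialized(&flag);   // coexist with libraries that may initialise MPI themselves
  if (!flag) {
    // Serialized: several threads may call MPI, but never concurrently.
    MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
    if (provided < MPI_THREAD_SERIALIZED)
      std::printf("MPI provided thread level %d < MPI_THREAD_SERIALIZED\n", provided);
  }
  // ... application work ...
  MPI_Finalize();
  return 0;
}

MPI_THREAD_SERIALIZED allows multiple threads to issue MPI calls as long as only one does so at a time, which is consistent with the nCommThreads=1 setting in the branch above.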
@@ -54,13 +54,34 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   typedef decltype(basis[0].View(AcceleratorRead)) View;
 
   Vector<View> basis_v; basis_v.reserve(basis.size());
+  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
   GridBase* grid = basis[0].Grid();
 
   for(int k=0;k<basis.size();k++){
     basis_v.push_back(basis[k].View(AcceleratorWrite));
   }
 
+#ifdef GRID_OMP
+  int max_threads = thread_max();
+  Vector < vobj > Bt(Nm * max_threads);
+  thread_region
+  {
+    vobj* B = &Bt[Nm * thread_num()];
+    thread_for_in_region(ss, grid->oSites(),{
+      for(int j=j0; j<j1; ++j) B[j]=0.;
+
+      for(int j=j0; j<j1; ++j){
+        for(int k=k0; k<k1; ++k){
+          B[j] +=Qt(j,k) * basis_v[k][ss];
+        }
+      }
+      for(int j=j0; j<j1; ++j){
+        basis_v[j][ss] = B[j];
+      }
+    });
+  }
+#else
   View *basis_vp = &basis_v[0];
 
   int nrot = j1-j0;
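The new GRID_OMP path gives each thread a private Nm-long scratch row of Bt, so the j-accumulation never aliases another thread's buffer, and within one site all reads of basis_v[k][ss] finish before any basis_v[j][ss] is overwritten. A simplified scalar stand-in using plain OpenMP and std::vector instead of Grid's vobj, Vector, and thread macros (names are illustrative):

#include <omp.h>
#include <cstddef>
#include <vector>

// Scalar sketch of the CPU basis rotation: one scratch row per thread.
void basisRotateCpu(std::vector<std::vector<double>> &basis,      // basis[k][site]
                    const std::vector<std::vector<double>> &Qt,   // Qt[j][k], Nm x Nm
                    int j0, int j1, int k0, int k1, int Nm, long sites)
{
  int max_threads = omp_get_max_threads();
  std::vector<double> Bt(static_cast<std::size_t>(Nm) * max_threads);

  #pragma omp parallel
  {
    double *B = &Bt[static_cast<std::size_t>(Nm) * omp_get_thread_num()];
    #pragma omp for
    for (long ss = 0; ss < sites; ++ss) {
      for (int j = j0; j < j1; ++j) B[j] = 0.;
      for (int j = j0; j < j1; ++j)
        for (int k = k0; k < k1; ++k)
          B[j] += Qt[j][k] * basis[k][ss];
      // all reads of site ss are done; now write back the rotated vectors
      for (int j = j0; j < j1; ++j) basis[j][ss] = B[j];
    }
  }
}

Because a single thread handles an entire site before writing it back, overlapping [j0,j1) and [k0,k1) ranges are safe on a per-site basis.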
@@ -70,14 +91,12 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   uint64_t oSites =grid->oSites();
   uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
 
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
-
   Vector <vobj> Bt(siteBlock * nrot);
   auto Bp=&Bt[0];
 
   // GPU readable copy of matrix
-  Vector<double> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
+  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  Coeff_t *Qt_p = & Qt_jv[0];
   thread_for(i,Nm*Nm,{
     int j = i/Nm;
     int k = i%Nm;
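Switching the staging buffer from double to the deduced Coeff_t lets the GPU-readable copy of Qt keep the matrix's own element type (for instance a complex coefficient type) instead of forcing a conversion to double. A sketch of the same type-deduction pattern with standard containers; flattenForDevice is an illustrative name, not a Grid function, and the example uses C++14 return-type deduction:

#include <cstddef>
#include <type_traits>
#include <vector>

// Deduce the coefficient type from the matrix itself, mirroring
// remove_reference<decltype(Qt(0,0))> in the diff above.
template <class Matrix>
auto flattenForDevice(Matrix &Qt, int Nm)
{
  using Coeff_t = typename std::remove_reference<decltype(Qt(0, 0))>::type;
  std::vector<Coeff_t> Qt_jv(static_cast<std::size_t>(Nm) * Nm);
  for (int i = 0; i < Nm * Nm; ++i) {
    int j = i / Nm;
    int k = i % Nm;
    Qt_jv[i] = Qt(j, k);   // row-major copy, matching Qt_p[j*Nm+k] indexing in the kernel
  }
  return Qt_jv;
}

With a complex-valued coefficient matrix, Coeff_t deduces to that complex type, so no narrowing to double occurs in the staged copy.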
@@ -118,6 +137,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
       coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
     });
   }
+#endif
 
   for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }