From 2bf3b4d5765b9db6d73fc4b3795971f851cb7d0f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 Dec 2021 09:02:02 -0800 Subject: [PATCH] Update to reduce memory footprint in benchmark test --- Grid/communicator/Communicator_mpi3.cc | 4 +- Grid/threads/Accelerator.h | 2 +- benchmarks/Benchmark_dwf_fp32.cc | 61 ++++++++++++++++++-------- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 305a3a9b..162180bc 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -388,6 +388,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); + std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); } @@ -399,12 +400,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { + acceleratorCopySynchronise(); std::cout << "Copy Synchronised\n"< status(nreq); - acceleratorCopySynchronise(); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); assert(ierr==0); list.resize(0); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index cec0600f..8be712ba 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -306,7 +306,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void 
acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index d48486c0..4edf7c16 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -126,19 +126,10 @@ int main (int argc, char ** argv) // Naive wilson implementation //////////////////////////////////// // replicate across fifth dimension - LatticeGaugeFieldF Umu5d(FGrid); - std::vector U(4,FGrid); - { - autoView( Umu5d_v, Umu5d, CpuWrite); - autoView( Umu_v , Umu , CpuRead); - for(int ss=0;ssoSites();ss++){ - for(int s=0;s U(4,UGrid); for(int mu=0;mu(Umu5d,mu); + U[mu] = PeekIndex(Umu,mu); } std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; @@ -147,10 +138,28 @@ int main (int argc, char ** argv) ref = Zero(); for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s