Merge remote-tracking branch 'upstream' into gauge_action_deriv
Commit 1a1fe85428 (mirror of https://github.com/paboyle/Grid.git)
@@ -746,26 +746,34 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   }
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  //  int nreq=list.size();
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D

-  //  if (nreq==0) return;
-  //  std::vector<MPI_Status> status(nreq);
-  //  std::vector<MPI_Request> MpiRequests(nreq);
+  std::vector<MPI_Status> status;
+  std::vector<MPI_Request> MpiRequests;
+  for(int r=0;r<list.size();r++){
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+  }

-  //  for(int r=0;r<nreq;r++){
-  //    MpiRequests[r] = list[r].req;
-  //  }
+  int nreq=MpiRequests.size();
+  std::cout << GridLogMessage << " StencilSendToRevFromComplete "<<nreq<<" Mpi Requests"<<std::endl;


+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    assert(ierr==0);
+  }

-  //  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
-  //  assert(ierr==0);

   //  for(int r=0;r<nreq;r++){
   //    if ( list[r].PacketType==InterNodeRecv ) {
   //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
   //    }
   //  }

-  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D

   list.resize(0); // Delete the list
   this->HostBufferFreeAll(); // Clean up the buffer allocs
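Note on the new completion path above: only the outstanding inter-node MPI_Isend requests are gathered and waited on, the receives having been progressed elsewhere, while device-to-device copies are fenced separately with acceleratorCopySynchronise(). The following is a minimal, self-contained sketch of that pattern; CommsRequest_t and PacketType_t here are simplified stand-ins, not Grid's real definitions.

    #include <mpi.h>
    #include <cassert>
    #include <vector>

    enum PacketType_t { InterNodeXmitISend, InterNodeRecv };   // stand-in enum

    struct CommsRequest_t {           // simplified stand-in for Grid's CommsRequest_t
      PacketType_t PacketType;
      MPI_Request  req;
    };

    void CompleteSends(std::vector<CommsRequest_t> &list)
    {
      std::vector<MPI_Request> MpiRequests;
      for (size_t r = 0; r < list.size(); r++) {
        // Only the asynchronous inter-node sends still need completion;
        // receives have already been tested/completed elsewhere.
        if (list[r].PacketType == InterNodeXmitISend) MpiRequests.push_back(list[r].req);
      }
      if (!MpiRequests.empty()) {
        std::vector<MPI_Status> status(MpiRequests.size());
        int ierr = MPI_Waitall((int)MpiRequests.size(), MpiRequests.data(), status.data());
        assert(ierr == 0);            // sends are guaranteed in order
      }
      list.resize(0);                 // exchange complete; drop the bookkeeping
    }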
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,
@@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
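The only substantive change in this hunk is const-qualification of the source pointer. A small illustrative sketch of what that buys callers (not Grid code; std::memcpy stands in for the CUDA call):

    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Same shape as the wrapper above; std::memcpy stands in for
    // cudaMemcpy(to,from,bytes,cudaMemcpyHostToDevice).
    inline void acceleratorCopyToDevice(const void *from, void *to, size_t bytes)
    {
      std::memcpy(to, from, bytes);
    }

    void upload(const std::vector<double> &host, double *device_buf)
    {
      // host.data() is const double* and now binds to the const void* parameter
      // without a const_cast.
      acceleratorCopyToDevice(host.data(), device_buf, host.size() * sizeof(double));
    }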
@@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
   return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
 }

-inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
-inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
-inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }

-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int acceleratorIsCommunicable(void *ptr)
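With the SYCL backend the asynchronous copies above return an acceleratorEvent_t (a sycl::event) that acceleratorEventIsComplete polls. A stand-alone illustration, assuming a SYCL 2020 toolchain and using a local queue q in place of theCopyAccelerator:

    #include <sycl/sycl.hpp>
    #include <vector>

    int main()
    {
      sycl::queue q;                                 // stand-in for theCopyAccelerator
      std::vector<float> host(1024, 1.0f);
      float *dev = sycl::malloc_device<float>(host.size(), q);

      // Asynchronous host-to-device copy; keep the event rather than waiting.
      sycl::event ev = q.memcpy(dev, host.data(), host.size() * sizeof(float));

      // The same completion query acceleratorEventIsComplete performs.
      while (ev.get_info<sycl::info::event::command_execution_status>() !=
             sycl::info::event_command_status::complete) {
        // overlap independent work here
      }

      sycl::free(dev, q);
      return 0;
    }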
@@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
   auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
   auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
@@ -583,9 +583,9 @@ inline void acceleratorMem(void)

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};

 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -668,15 +668,15 @@ accelerator_inline void acceleratorFence(void)
   return;
 }

-inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
+inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
 {
   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
   acceleratorCopySynchronise();
 }

-template<class T> void acceleratorPut(T& dev,T&host)
+template<class T> void acceleratorPut(T& dev,const T&host)
 {
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
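acceleratorPut now takes the host object by const reference and casts only for the byte copy. A hedged usage sketch follows; Params is a hypothetical struct and std::memcpy stands in for the real device copy.

    #include <cstring>

    // Stand-in with the same signature as the template above.
    template<class T> void acceleratorPut(T &dev, const T &host)
    {
      std::memcpy((void *)&dev, (const void *)&host, sizeof(T));
    }

    struct Params { double mass; double csw; };      // hypothetical parameter block

    void configure(Params &devParams)
    {
      const Params p{0.1, 1.0};        // a const host object is now accepted directly
      acceleratorPut(devParams, p);
    }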
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical DO_PRAGMA(omp critical)

 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
   uint64_t *uto = (uint64_t *)to;
   assert(bytes%8==0);
   uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
   });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
   bcopy(from,to,bytes);
 }
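Reading the two thread_bcopy hunks together: the OpenMP branch copies the buffer as 64-bit words in parallel, which is why bytes must be a multiple of 8. A self-contained sketch under that reading is below; Grid's actual version uses its thread_for helper rather than a raw pragma.

    #include <cassert>
    #include <cstdint>
    #include <cstddef>

    inline void thread_bcopy(const void *from, void *to, size_t bytes)
    {
      const uint64_t *ufrom = (const uint64_t *)from;
      uint64_t       *uto   = (uint64_t *)to;
      assert(bytes % 8 == 0);                 // copy proceeds in 64-bit words
      uint64_t words = bytes / 8;
    #pragma omp parallel for
      for (int64_t w = 0; w < (int64_t)words; w++) uto[w] = ufrom[w];
    }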
@@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
                            Grid_default_latt,
                            Grid_default_mpi);

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    FlightRecorder::PrintEntireLog = 1;
+    FlightRecorder::ChecksumComms = 1;
+    FlightRecorder::ChecksumCommsSend=1;
+  }

   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@@ -651,3 +658,4 @@ void Grid_debug_handler_init(void)
 }

 NAMESPACE_END(Grid);
+
@@ -50,7 +50,7 @@ namespace Grid{
       int64_t index64;
       IndexFromCoorReversed(coor,index64,dims);
       if ( index64>=2*1024*1024*1024LL ){
-        std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+        //      std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
       }
       assert(index64<2*1024*1024*1024LL);
       index = (int) index64;
@@ -492,17 +492,18 @@ public:
       }
       FGrid->Barrier();
       double t1=usecond();
-      uint64_t ncall = 500;
+      uint64_t no = 50;
+      uint64_t ni = 100;
-      FGrid->Broadcast(0,&ncall,sizeof(ncall));

       //      std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

       time_statistics timestat;
-      std::vector<double> t_time(ncall);
+      std::vector<double> t_time(no);
-      for(uint64_t i=0;i<ncall;i++){
+      for(uint64_t i=0;i<no;i++){
        t0=usecond();
-       Dw.DhopEO(src_o,r_e,DaggerNo);
+       for(uint64_t j=0;j<ni;j++){
+         Dw.DhopEO(src_o,r_e,DaggerNo);
+       }
        t1=usecond();
        t_time[i] = t1-t0;
      }
@@ -520,11 +521,11 @@ public:
       double mf_hi, mf_lo, mf_err;

       timestat.statistics(t_time);
-      mf_hi = flops/timestat.min;
+      mf_hi = flops/timestat.min*ni;
-      mf_lo = flops/timestat.max;
+      mf_lo = flops/timestat.max*ni;
       mf_err= flops/timestat.min * timestat.err/timestat.mean;

-      mflops = flops/timestat.mean;
+      mflops = flops/timestat.mean*ni;
       mflops_all.push_back(mflops);
       if ( mflops_best == 0 ) mflops_best = mflops;
       if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
+      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;

     }
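The same restructuring recurs in the staggered and clover benchmark loops below: each timed sample now wraps ni kernel applications, so the per-call time is timestat.mean/ni and the flop rates are scaled by ni. An illustrative stand-alone version of the loop shape, with std::chrono in place of usecond() and a generic kernel in place of Dw.DhopEO:

    #include <chrono>
    #include <cstdint>
    #include <vector>

    template<class Kernel>
    std::vector<double> time_samples(Kernel kernel, uint64_t no = 50, uint64_t ni = 100)
    {
      std::vector<double> t_time(no);
      for (uint64_t i = 0; i < no; i++) {
        auto t0 = std::chrono::steady_clock::now();
        for (uint64_t j = 0; j < ni; j++) kernel();      // ni calls per timed sample
        auto t1 = std::chrono::steady_clock::now();
        t_time[i] = std::chrono::duration<double, std::micro>(t1 - t0).count();  // microseconds
      }
      // e.g. mflops = flops / mean(t_time) * ni ; us per call = mean(t_time) / ni
      return t_time;
    }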
@@ -654,17 +656,19 @@ public:
       }
       FGrid->Barrier();
       double t1=usecond();
-      uint64_t ncall = 500;

-      FGrid->Broadcast(0,&ncall,sizeof(ncall));
+      uint64_t no = 50;
+      uint64_t ni = 100;

       //      std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

       time_statistics timestat;
-      std::vector<double> t_time(ncall);
+      std::vector<double> t_time(no);
-      for(uint64_t i=0;i<ncall;i++){
+      for(uint64_t i=0;i<no;i++){
        t0=usecond();
-       Ds.DhopEO(src_o,r_e,DaggerNo);
+       for(uint64_t j=0;j<ni;j++){
+         Ds.DhopEO(src_o,r_e,DaggerNo);
+       }
        t1=usecond();
        t_time[i] = t1-t0;
      }
@@ -675,11 +679,11 @@ public:
       double mf_hi, mf_lo, mf_err;

       timestat.statistics(t_time);
-      mf_hi = flops/timestat.min;
+      mf_hi = flops/timestat.min*ni;
-      mf_lo = flops/timestat.max;
+      mf_lo = flops/timestat.max*ni;
       mf_err= flops/timestat.min * timestat.err/timestat.mean;

-      mflops = flops/timestat.mean;
+      mflops = flops/timestat.mean*ni;
       mflops_all.push_back(mflops);
       if ( mflops_best == 0 ) mflops_best = mflops;
       if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
+      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;

     }
@@ -792,19 +797,18 @@ public:
         Dc.M(src,r);
       }
       FGrid->Barrier();
-      double t1=usecond();
+      uint64_t ni = 100;
-      uint64_t ncall = 500;
+      uint64_t no = 50;

-      FGrid->Broadcast(0,&ncall,sizeof(ncall));

       //      std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

       time_statistics timestat;
-      std::vector<double> t_time(ncall);
+      std::vector<double> t_time(no);
-      for(uint64_t i=0;i<ncall;i++){
+      for(uint64_t i=0;i<no;i++){
-       t0=usecond();
+       double t0=usecond();
-       Dc.M(src,r);
+       for(uint64_t j=0;j<ni;j++){
+         Dc.M(src,r);
+       }
-       t1=usecond();
+       double t1=usecond();
        t_time[i] = t1-t0;
      }
      FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
       double mf_hi, mf_lo, mf_err;

       timestat.statistics(t_time);
-      mf_hi = flops/timestat.min;
+      mf_hi = flops/timestat.min*ni;
-      mf_lo = flops/timestat.max;
+      mf_lo = flops/timestat.max*ni;
       mf_err= flops/timestat.min * timestat.err/timestat.mean;

-      mflops = flops/timestat.mean;
+      mflops = flops/timestat.mean*ni;
       mflops_all.push_back(mflops);
       if ( mflops_best == 0 ) mflops_best = mflops;
       if ( mflops_worst== 0 ) mflops_worst= mflops;
       if ( mflops>mflops_best ) mflops_best = mflops;
       if ( mflops<mflops_worst) mflops_worst= mflops;

-      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
       std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
+      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;

     }
@@ -872,7 +877,7 @@ int main (int argc, char ** argv)
   int do_dslash=1;

   int sel=4;
-  std::vector<int> L_list({8,12,16,24});
+  std::vector<int> L_list({8,12,16,24,32});
   int selm1=sel-1;

   std::vector<double> clover;
@@ -1,18 +1,19 @@
 #Ahead of time compile for PVC

-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"

 #JIT compile
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
 #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "

-../../configure \
+../configure \
        --enable-simd=GPU \
        --enable-reduction=grid \
        --enable-gen-simd-width=64 \
        --enable-comms=mpi-auto \
        --enable-debug \
+       --prefix $HOME/gpt-install \
        --disable-gparity \
        --disable-fermion-reps \
        --with-lime=$CLIME \
systems/WorkArounds.txt (new file, 206 lines)
@@ -0,0 +1,206 @@
+The purpose of this file is to collate all non-obvious known magic shell variables
+and compiler flags required for either correctness or performance on various systems.
+
+A repository of work-arounds.
+
+Contents:
+1. Interconnect + MPI
+2. Compilation
+3. Profiling
+
+************************
+* 1. INTERCONNECT + MPI
+************************
+
+--------------------------------------------------------------------
+MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
+--------------------------------------------------------------------
+export OMPI_MCA_io=romio321
+
+--------------------------------------
+ROMIO fail with > 2GB per node read (32 bit issue)
+--------------------------------------
+
+Use later MPICH
+
+https://github.com/paboyle/Grid/issues/381
+
+https://github.com/pmodels/mpich/commit/3a479ab0
+
+--------------------------------------------------------------------
+Slingshot: Frontier and Perlmutter libfabric slow down
+and physical memory fragmentation
+--------------------------------------------------------------------
+export FI_MR_CACHE_MONITOR=disabled
+or
+export FI_MR_CACHE_MONITOR=kdreg2
+
+--------------------------------------------------------------------
+Perlmutter
+--------------------------------------------------------------------
+
+export MPICH_RDMA_ENABLED_CUDA=1
+export MPICH_GPU_IPC_ENABLED=1
+export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
+export MPICH_GPU_NO_ASYNC_MEMCPY=0
+
+--------------------------------------------------------------------
+Frontier/LumiG
+--------------------------------------------------------------------
+
+Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
+
+cat << EOF > select_gpu
+#!/bin/bash
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export GPU_MAP=(0 1 2 3 7 6 5 4)
+export NUMA_MAP=(3 3 1 1 2 2 0 0)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU
+exec numactl -m \$NUMA -N \$NUMA \$*
+EOF
+chmod +x ./select_gpu
+
+srun ./select_gpu BINARY
+
+
+--------------------------------------------------------------------
+Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
+--------------------------------------------------------------------
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+
+--------------------------------------------------------------------
+Mellanox + A100 correctness (Tursa, Booster, Leonardo)
+--------------------------------------------------------------------
+export UCX_MEMTYPE_CACHE=n
+
+--------------------------------------------------------------------
+MPICH/Aurora/PVC correctness and performance
+--------------------------------------------------------------------
+
+https://github.com/pmodels/mpich/issues/7302
+
+--enable-cuda-aware-mpi=no
+--enable-unified=no
+
+Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
+Do not use SVM
+
+Ideally use MPICH with fix to issue 7302:
+
+https://github.com/pmodels/mpich/pull/7312
+
+Ideally:
+MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
+
+Alternatives:
+export MPIR_CVAR_NOLOCAL=1
+export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
+
+--------------------------------------------------------------------
+MPICH/Aurora/PVC correctness and performance
+--------------------------------------------------------------------
+
+Broken:
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+This gives good performance without requiring
+--enable-cuda-aware-mpi=no
+
+But is an open issue reported by James Osborn
+https://github.com/pmodels/mpich/issues/7139
+
+Possibly resolved but unclear if in the installed software yet.
+
+************************
+* 2. COMPILATION
+************************
+
+--------------------------------------------------------------------
+G++ compiler breakage / graveyard
+--------------------------------------------------------------------
+
+9.3.0, 10.3.1,
+https://github.com/paboyle/Grid/issues/290
+https://github.com/paboyle/Grid/issues/264
+
+Working (-) Broken (X):
+
+4.9.0  -
+4.9.1  -
+5.1.0  X
+5.2.0  X
+5.3.0  X
+5.4.0  X
+6.1.0  X
+6.2.0  X
+6.3.0  -
+7.1.0  -
+8.0.0 (HEAD) -
+
+https://github.com/paboyle/Grid/issues/100
+
+--------------------------------------------------------------------
+AMD GPU nodes :
+--------------------------------------------------------------------
+
+multiple ROCM versions broken; use 5.3.0
+manifests itself as wrong results in fp32
+
+https://github.com/paboyle/Grid/issues/464
+
+--------------------------------------------------------------------
+Aurora/PVC
+--------------------------------------------------------------------
+
+SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
+SYCL slow link and relocatable code issues (Christoph Lehner)
+Opt large register file required for good performance in fp64
+
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
+
+--------------------------------------------------------------------
+Aurora/PVC useful extra options
+--------------------------------------------------------------------
+
+Host only sanitizer:
+-Xarch_host -fsanitize=leak
+-Xarch_host -fsanitize=address
+
+Deterministic MPI reduction:
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+
+
+************************
+* 3. Visual profile tools
+************************
+
+--------------------------------------------------------------------
+Frontier/rocprof
+--------------------------------------------------------------------
+
+--------------------------------------------------------------------
+Aurora/unitrace
+--------------------------------------------------------------------
+
+
+--------------------------------------------------------------------
+Tursa/nsight-sys
+--------------------------------------------------------------------
systems/sdcc-genoa/bench.slurm (new file, 32 lines)
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=1
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+for t in 2
+do
+
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
+for vol in 48.48.48.96
+do
+srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
+done
+#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
+done
systems/sdcc-genoa/bench2.slurm (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=2
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=2
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+nodes=2
+mpi=1.1.1.2
+
+for t in 2
+do
+
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
+
+for vol in 64.64.64.128
+do
+srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
+done
+
+done
systems/sdcc-genoa/config-command (new file, 16 lines)
@@ -0,0 +1,16 @@
+../../configure \
+--enable-comms=mpi-auto \
+--enable-unified=yes \
+--enable-shm=shmopen \
+--enable-shm-fast-path=shmopen \
+--enable-accelerator=none \
+--enable-simd=AVX512 \
+--disable-accelerator-cshift \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=clang++ \
+MPICXX=mpicxx \
+CXXFLAGS="-std=c++17"
systems/sdcc-genoa/sourceme.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
+source $HOME/spack/share/spack/setup-env.sh
+spack load llvm@17.0.4
+export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
+module load openmpi