Mirror of https://github.com/paboyle/Grid.git (synced 2025-04-04 19:25:56 +01:00)

Commit 3d014864e2 ("Makinig LLVM happy"), parent 1d22841811
@@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
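The change in this hunk (and in the matching SYCL, HIP, and host hunks below) is purely const-correctness: the source pointer of every copy becomes const void *, which removes the casts clang otherwise demands when a caller hands over a read-only buffer. A minimal caller sketch, not from the commit — stageField and its arguments are invented for illustration, and it assumes Grid's accelerator header is in scope:

  #include <vector>
  #include <cstddef>

  void stageField(const std::vector<double> &host, void *device)
  {
    // host.data() is const double*. With the old (void *from, ...) signature this
    // call needs a const_cast to compile; with const void* it is accepted as-is.
    acceleratorCopyToDevice(host.data(), device, host.size()*sizeof(double));
  }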
@@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
   return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
 }

-inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
-inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
-inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }

-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int acceleratorIsCommunicable(void *ptr)
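In the SYCL build the asynchronous copies above return an acceleratorEvent_t, so a caller can overlap host work with the transfer and poll the event before touching the destination. A sketch of that usage, not from the commit — exampleOverlap and its arguments are invented, and only the accelerator* names come from the header shown above:

  void exampleOverlap(const void *hostBuf, void *devBuf, size_t bytes)
  {
    acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(hostBuf, devBuf, bytes);
    // ... independent host-side work can run here while the copy proceeds ...
    while ( !acceleratorEventIsComplete(ev) ) { } // crude poll; wait before using devBuf
  }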
@@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
 {
   auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
   auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
   auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
@@ -583,9 +583,9 @@ inline void acceleratorMem(void)

 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};

 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -668,7 +668,7 @@ accelerator_inline void acceleratorFence(void)
   return;
 }

-inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
+inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
 {
   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
   acceleratorCopySynchronise();
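The synchronous acceleratorCopyDeviceToDevice above is simply the asynchronous call followed by acceleratorCopySynchronise, so several device-to-device transfers can be queued on the copy stream and paid for with a single wait. A sketch of that pattern, with invented buffer names and not taken from the commit:

  void gatherTwoBlocks(const void *srcA, const void *srcB, void *dst, size_t bytes)
  {
    acceleratorCopyDeviceToDeviceAsynch(srcA, dst, bytes);
    acceleratorCopyDeviceToDeviceAsynch(srcB, (char *)dst + bytes, bytes);
    acceleratorCopySynchronise();   // both copies are complete after this point
  }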
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical DO_PRAGMA(omp critical)

 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
   uint64_t *uto = (uint64_t *)to;
   assert(bytes%8==0);
   uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
   });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
   bcopy(from,to,bytes);
 }
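The GRID_OMP branch of thread_bcopy copies the buffer as 64-bit words in parallel; the parallel loop body itself lies outside these hunks. A self-contained sketch of a copy with the same shape, written with a plain OpenMP for in place of Grid's own threading macro (the name omp_word_bcopy is illustrative, not from the repository):

  #include <cstdint>
  #include <cstddef>
  #include <cassert>

  inline void omp_word_bcopy(const void *from, void *to, size_t bytes)
  {
    const uint64_t *ufrom = (const uint64_t *)from;
    uint64_t *uto = (uint64_t *)to;
    assert(bytes % 8 == 0);          // same word-granularity requirement as thread_bcopy
    uint64_t words = bytes / 8;
  #pragma omp parallel for
    for (uint64_t w = 0; w < words; w++) uto[w] = ufrom[w];
  }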
@@ -492,17 +492,18 @@ public:
     }
     FGrid->Barrier();
     double t1=usecond();
-    uint64_t ncall = 500;
-    FGrid->Broadcast(0,&ncall,sizeof(ncall));
+    uint64_t no = 50;
+    uint64_t ni = 100;

     // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

     time_statistics timestat;
-    std::vector<double> t_time(ncall);
-    for(uint64_t i=0;i<ncall;i++){
+    std::vector<double> t_time(no);
+    for(uint64_t i=0;i<no;i++){
       t0=usecond();
-      Dw.DhopEO(src_o,r_e,DaggerNo);
+      for(uint64_t j=0;j<ni;j++){
+        Dw.DhopEO(src_o,r_e,DaggerNo);
+      }
       t1=usecond();
       t_time[i] = t1-t0;
     }
@@ -520,11 +521,11 @@ public:
     double mf_hi, mf_lo, mf_err;

     timestat.statistics(t_time);
-    mf_hi = flops/timestat.min;
-    mf_lo = flops/timestat.max;
+    mf_hi = flops/timestat.min*ni;
+    mf_lo = flops/timestat.max*ni;
     mf_err= flops/timestat.min * timestat.err/timestat.mean;

-    mflops = flops/timestat.mean;
+    mflops = flops/timestat.mean*ni;
     mflops_all.push_back(mflops);
     if ( mflops_best == 0 ) mflops_best = mflops;
     if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;

   }

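For orientation (this note is not part of the diff): after this change each timed sample t_time[i] covers ni = 100 applications of DhopEO rather than one, so timestat.mean is the time for ni calls. The printed quantities follow from that: the per-call time is timestat.mean/ni microseconds, and the rate is flops/timestat.mean*ni, where flops counts a single call and times are in microseconds, so one flop per microsecond equals one Mflop/s. The same ni scaling is applied in the staggered and clover benchmark sections below.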
@@ -654,17 +656,19 @@ public:
     }
     FGrid->Barrier();
     double t1=usecond();
-    uint64_t ncall = 500;

-    FGrid->Broadcast(0,&ncall,sizeof(ncall));
+    uint64_t no = 50;
+    uint64_t ni = 100;

     // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

     time_statistics timestat;
-    std::vector<double> t_time(ncall);
-    for(uint64_t i=0;i<ncall;i++){
+    std::vector<double> t_time(no);
+    for(uint64_t i=0;i<no;i++){
       t0=usecond();
-      Ds.DhopEO(src_o,r_e,DaggerNo);
+      for(uint64_t j=0;j<ni;j++){
+        Ds.DhopEO(src_o,r_e,DaggerNo);
+      }
       t1=usecond();
       t_time[i] = t1-t0;
     }
@@ -675,11 +679,11 @@ public:
     double mf_hi, mf_lo, mf_err;

     timestat.statistics(t_time);
-    mf_hi = flops/timestat.min;
-    mf_lo = flops/timestat.max;
+    mf_hi = flops/timestat.min*ni;
+    mf_lo = flops/timestat.max*ni;
     mf_err= flops/timestat.min * timestat.err/timestat.mean;

-    mflops = flops/timestat.mean;
+    mflops = flops/timestat.mean*ni;
     mflops_all.push_back(mflops);
     if ( mflops_best == 0 ) mflops_best = mflops;
     if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;

   }

@@ -792,19 +797,18 @@ public:
       Dc.M(src,r);
     }
     FGrid->Barrier();
-    double t1=usecond();
-    uint64_t ncall = 500;
+    uint64_t ni = 100;
+    uint64_t no = 50;

-    FGrid->Broadcast(0,&ncall,sizeof(ncall));

     // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

     time_statistics timestat;
-    std::vector<double> t_time(ncall);
-    for(uint64_t i=0;i<ncall;i++){
-      t0=usecond();
-      Dc.M(src,r);
-      t1=usecond();
+    std::vector<double> t_time(no);
+    for(uint64_t i=0;i<no;i++){
+      double t0=usecond();
+      for(uint64_t j=0;j<ni;j++){
+        Dc.M(src,r);
+      }
+      double t1=usecond();
       t_time[i] = t1-t0;
     }
     FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
     double mf_hi, mf_lo, mf_err;

     timestat.statistics(t_time);
-    mf_hi = flops/timestat.min;
-    mf_lo = flops/timestat.max;
+    mf_hi = flops/timestat.min*ni;
+    mf_lo = flops/timestat.max*ni;
     mf_err= flops/timestat.min * timestat.err/timestat.mean;

-    mflops = flops/timestat.mean;
+    mflops = flops/timestat.mean*ni;
     mflops_all.push_back(mflops);
     if ( mflops_best == 0 ) mflops_best = mflops;
     if ( mflops_worst== 0 ) mflops_worst= mflops;
     if ( mflops>mflops_best ) mflops_best = mflops;
     if ( mflops<mflops_worst) mflops_worst= mflops;

-    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;

   }

@@ -872,7 +877,7 @@ int main (int argc, char ** argv)
   int do_dslash=1;

   int sel=4;
-  std::vector<int> L_list({8,12,16,24});
+  std::vector<int> L_list({8,12,16,24,32});
   int selm1=sel-1;

   std::vector<double> clover;
systems/sdcc-genoa/bench.slurm (new file, 32 lines)
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=1
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+for t in 2
+do
+
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
+for vol in 48.48.48.96
+do
+srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
+done
+#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
+done
+
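bench.slurm above (and bench2.slurm below) are ordinary SLURM batch scripts for the sdcc-genoa nodes; presumably each is submitted with sbatch, e.g. sbatch systems/sdcc-genoa/bench.slurm, once Grid has been built on that system — the submission step is an assumption on my part, not part of the commit.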
systems/sdcc-genoa/bench2.slurm (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/bin/bash
+#SBATCH --partition lqcd
+#SBATCH --time=00:50:00
+#SBATCH -A lqcdtest
+#SBATCH -q lqcd
+#SBATCH --exclusive
+#SBATCH --nodes=2
+#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
+#SBATCH --ntasks=2
+#SBATCH --cpus-per-task=64
+#SBATCH --qos lqcd
+
+source sourceme.sh
+
+export PLACES=(1:16:4 1:32:2 0:64:1);
+export THR=(16 32 64)
+
+nodes=2
+mpi=1.1.1.2
+
+for t in 2
+do
+
+export OMP_NUM_THREADS=${THR[$t]}
+export OMP_PLACES=${PLACES[$t]}
+export thr=${THR[$t]}
+
+#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
+
+for vol in 64.64.64.128
+do
+srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
+done
+
+done
+
systems/sdcc-genoa/config-command (new file, 16 lines)
@@ -0,0 +1,16 @@
+../../configure \
+--enable-comms=mpi-auto \
+--enable-unified=yes \
+--enable-shm=shmopen \
+--enable-shm-fast-path=shmopen \
+--enable-accelerator=none \
+--enable-simd=AVX512 \
+--disable-accelerator-cshift \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=clang++ \
+MPICXX=mpicxx \
+CXXFLAGS="-std=c++17"
+
+
+
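Given the relative path ../../configure, config-command is evidently meant to be run from inside systems/sdcc-genoa, two directories below the top-level configure script, with sourceme.sh (next file) providing the clang 17 and openmpi environment that CXX=clang++ and MPICXX=mpicxx expect. The exact build sequence is not spelled out in the commit; a plausible one is to source sourceme.sh, run config-command from that directory, then make.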
systems/sdcc-genoa/sourceme.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
+source $HOME/spack/share/spack/setup-env.sh
+spack load llvm@17.0.4
+export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
+module load openmpi