mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-03 18:55:56 +01:00
Makinig LLVM happy
This commit is contained in:
parent
1d22841811
commit
3d014864e2
@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
|
||||
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
|
||||
}
|
||||
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||
}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
||||
}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
||||
@ -583,9 +583,9 @@ inline void acceleratorMem(void)
|
||||
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopySynchronise(void) {};
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
@ -668,7 +668,7 @@ accelerator_inline void acceleratorFence(void)
|
||||
return;
|
||||
}
|
||||
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
||||
inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
|
||||
{
|
||||
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
||||
acceleratorCopySynchronise();
|
||||
|
@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define thread_critical DO_PRAGMA(omp critical)
|
||||
|
||||
#ifdef GRID_OMP
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||
{
|
||||
uint64_t *ufrom = (uint64_t *)from;
|
||||
const uint64_t *ufrom = (const uint64_t *)from;
|
||||
uint64_t *uto = (uint64_t *)to;
|
||||
assert(bytes%8==0);
|
||||
uint64_t words=bytes/8;
|
||||
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
});
|
||||
}
|
||||
#else
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||
{
|
||||
bcopy(from,to,bytes);
|
||||
}
|
||||
|
@ -492,17 +492,18 @@ public:
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
@ -520,11 +521,11 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
@ -535,6 +536,7 @@ public:
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@ -654,17 +656,19 @@ public:
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
@ -675,11 +679,11 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
@ -689,6 +693,7 @@ public:
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@ -792,19 +797,18 @@ public:
|
||||
Dc.M(src,r);
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t ni = 100;
|
||||
uint64_t no = 50;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
t0=usecond();
|
||||
Dc.M(src,r);
|
||||
t1=usecond();
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
double t0=usecond();
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Dc.M(src,r);
|
||||
}
|
||||
double t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
FGrid->Barrier();
|
||||
@ -814,20 +818,21 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@ -872,7 +877,7 @@ int main (int argc, char ** argv)
|
||||
int do_dslash=1;
|
||||
|
||||
int sel=4;
|
||||
std::vector<int> L_list({8,12,16,24});
|
||||
std::vector<int> L_list({8,12,16,24,32});
|
||||
int selm1=sel-1;
|
||||
|
||||
std::vector<double> clover;
|
||||
|
32
systems/sdcc-genoa/bench.slurm
Normal file
32
systems/sdcc-genoa/bench.slurm
Normal file
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --partition lqcd
|
||||
#SBATCH --time=00:50:00
|
||||
#SBATCH -A lqcdtest
|
||||
#SBATCH -q lqcd
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --cpus-per-task=64
|
||||
#SBATCH --qos lqcd
|
||||
|
||||
source sourceme.sh
|
||||
|
||||
export PLACES=(1:16:4 1:32:2 0:64:1);
|
||||
export THR=(16 32 64)
|
||||
|
||||
for t in 2
|
||||
do
|
||||
|
||||
export OMP_NUM_THREADS=${THR[$t]}
|
||||
export OMP_PLACES=${PLACES[$t]}
|
||||
export thr=${THR[$t]}
|
||||
|
||||
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
|
||||
for vol in 48.48.48.96
|
||||
do
|
||||
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
|
||||
done
|
||||
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
|
||||
done
|
||||
|
36
systems/sdcc-genoa/bench2.slurm
Normal file
36
systems/sdcc-genoa/bench2.slurm
Normal file
@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --partition lqcd
|
||||
#SBATCH --time=00:50:00
|
||||
#SBATCH -A lqcdtest
|
||||
#SBATCH -q lqcd
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --nodes=2
|
||||
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
|
||||
#SBATCH --ntasks=2
|
||||
#SBATCH --cpus-per-task=64
|
||||
#SBATCH --qos lqcd
|
||||
|
||||
source sourceme.sh
|
||||
|
||||
export PLACES=(1:16:4 1:32:2 0:64:1);
|
||||
export THR=(16 32 64)
|
||||
|
||||
nodes=2
|
||||
mpi=1.1.1.2
|
||||
|
||||
for t in 2
|
||||
do
|
||||
|
||||
export OMP_NUM_THREADS=${THR[$t]}
|
||||
export OMP_PLACES=${PLACES[$t]}
|
||||
export thr=${THR[$t]}
|
||||
|
||||
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
|
||||
|
||||
for vol in 64.64.64.128
|
||||
do
|
||||
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
|
||||
done
|
||||
|
||||
done
|
||||
|
16
systems/sdcc-genoa/config-command
Normal file
16
systems/sdcc-genoa/config-command
Normal file
@ -0,0 +1,16 @@
|
||||
../../configure \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-unified=yes \
|
||||
--enable-shm=shmopen \
|
||||
--enable-shm-fast-path=shmopen \
|
||||
--enable-accelerator=none \
|
||||
--enable-simd=AVX512 \
|
||||
--disable-accelerator-cshift \
|
||||
--disable-fermion-reps \
|
||||
--disable-gparity \
|
||||
CXX=clang++ \
|
||||
MPICXX=mpicxx \
|
||||
CXXFLAGS="-std=c++17"
|
||||
|
||||
|
||||
|
4
systems/sdcc-genoa/sourceme.sh
Normal file
4
systems/sdcc-genoa/sourceme.sh
Normal file
@ -0,0 +1,4 @@
|
||||
source $HOME/spack/share/spack/setup-env.sh
|
||||
spack load llvm@17.0.4
|
||||
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
|
||||
module load openmpi
|
Loading…
x
Reference in New Issue
Block a user