Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)

Commit f6661ce29b: Merged openmp offload implementation with develop
@@ -110,8 +110,10 @@ public:
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
int size=out.Grid()->oSites();

typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
accelerator_for(sss,size,Nsimd,{
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
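Note: the change above is the pattern repeated throughout this commit. The loop bound out.Grid()->oSites() is hoisted into a plain local before the accelerator_for, apparently so that the offloaded loop captures only a scalar bound instead of evaluating a member call on the host-side Grid object inside the target region. A minimal standalone sketch of the same idea (hypothetical FakeGrid type, not Grid code):

    // Sketch of the bound-hoisting pattern: the limit is read once on the
    // host and enters the target region as an implicitly firstprivate scalar.
    #include <cstdint>
    #include <vector>

    struct FakeGrid { uint64_t oSites() const { return 1024; } };  // stand-in for GridBase

    void scale(std::vector<double> &v, const FakeGrid &grid, double a)
    {
      // assumes v has grid.oSites() elements
      uint64_t size = grid.oSites();                 // evaluated on the host
      double *p = v.data();
      #pragma omp target teams distribute parallel for map(tofrom:p[0:size])
      for (uint64_t ss = 0; ss < size; ss++)         // bound is a mapped scalar
        p[ss] *= a;
    }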
@@ -203,7 +205,8 @@ public:
autoView( tmp_v , tmp, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,tmp.Grid()->oSites(),1,{
int size=tmp.Grid()->oSites();
accelerator_for(sss,size,1,{
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
@@ -217,7 +220,8 @@ public:
const int Nsimd = SiteSpinor::Nsimd();
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
int size=mat.Grid()->oSites();
accelerator_for(sss,size,Nsimd,{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
@@ -90,7 +90,8 @@ public:
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
autoView(U_v,U,AcceleratorWrite);
autoView(W_v,W,AcceleratorRead);
accelerator_for( ss, U.Grid()->oSites(), 1, {
int size=U.Grid()->oSites();
accelerator_for( ss, size, 1, {
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
});
}
@@ -135,7 +136,8 @@ public:
//auto start = std::chrono::high_resolution_clock::now();
autoView(U_v,U,AcceleratorWrite);
autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{
int size=P.Grid()->oSites();
accelerator_for(ss, size,1,{
for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
}
@@ -26,8 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

#ifndef ACCELERATOR_H
#define ACCELERATOR_H

#pragma once
#include <string.h>

#ifdef HAVE_MALLOC_MALLOC_H
@@ -471,7 +474,70 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);

#undef GRID_SIMT

//OpenMP Target Offloading
#ifdef OMPTARGET
#define THREAD_LIMIT acceleratorThreads()

#define accelerator
#define accelerator_inline strong_inline
#ifdef THREAD_LIMIT
#define accelerator_for(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT)") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_forNB(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) nowait") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_barrier(dummy) _Pragma("omp barrier")
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) collapse(2)") \
for ( uint64_t iter1=0;iter1<num1;iter1++) \
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
#else
#define accelerator_for(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_forNB(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for nowait") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_barrier(dummy) _Pragma("omp barrier")
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
_Pragma("omp target teams distribute parallel for collapse(2)") \
for ( uint64_t iter1=0;iter1<num1;iter1++) \
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
#endif
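For reference, the sketch below shows a call site of the OMPTARGET accelerator_for outside of Grid, reproducing the non-THREAD_LIMIT branch of the macro on a plain local array; the nsimd argument is accepted for interface compatibility and ignored on this path.

    // Standalone sketch of the accelerator_for macro applied to a plain array
    // (not Grid code).  Build with offload enabled, e.g.
    //   clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda sketch.cc
    #include <cstdint>
    #include <cstdio>

    #define accelerator_for(i,num,nsimd, ... )             \
      _Pragma("omp target teams distribute parallel for")  \
      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;

    int main(void)
    {
      double x[1024];                  // non-scalar local: implicit map(tofrom)
      const uint64_t size = 1024;      // scalar: implicitly firstprivate
      accelerator_for(ss, size, 1, {
        x[ss] = 2.0*ss;                // loop body runs on the device
      });
      printf("x[10] = %f\n", x[10]);
      return 0;
    }

If no offload target is configured, the same target construct simply falls back to host execution, so the macro remains usable on a CPU-only build.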

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) {;}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){;}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronize(void) {;};

inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef OMPTARGET_MANAGED
#include <cuda_runtime_api.h>
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#else
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
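The OMPTARGET_MANAGED branch pairs the offload macros with CUDA managed memory, which is consistent with the no-op copy routines above: the same pointer is valid on host and device. A standalone sketch of that pattern (not Grid code; it assumes the cudart and offload flags listed in the config-command file below):

    // Sketch of the managed-memory pattern used under OMPTARGET_MANAGED:
    // cudaMallocManaged memory is directly usable inside an OpenMP target
    // region, so no explicit host<->device copies are needed.
    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime_api.h>

    int main(void)
    {
      const size_t n = 1<<20;
      double *p = NULL;
      cudaError_t err = cudaMallocManaged((void **)&p, n*sizeof(double));
      if (err != cudaSuccess) {
        printf("cudaMallocManaged failed for %zu bytes: %s\n",
               n*sizeof(double), cudaGetErrorString(err));
        return 1;
      }
      #pragma omp target teams distribute parallel for is_device_ptr(p)
      for (uint64_t i = 0; i < n; i++) p[i] = 1.0;   // device writes managed memory

      printf("p[0] = %f\n", p[0]);                   // host reads the same buffer
      cudaFree(p);
      return 0;
    }

Because the buffer is migrated on demand by the CUDA runtime, acceleratorCopyToDevice/FromDevice can stay empty on this code path; only the pointer value crosses the offload boundary.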

//OpenMP CPU threads
#else

#define accelerator
#define accelerator_inline strong_inline
@@ -500,6 +566,7 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif

#endif // CPU target
@@ -566,3 +633,5 @@ accelerator_inline void acceleratorFence(void)
}

NAMESPACE_END(Grid);
#endif
@@ -46,7 +46,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#endif

#ifdef GRID_OMP
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA_(x) _Pragma ("x")
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
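The two DO_PRAGMA_ variants above differ only in whether the argument is stringized: _Pragma takes a string literal, so the #x form turns DO_PRAGMA(omp parallel for) into _Pragma("omp parallel for"), whereas the literal "x" form always emits a pragma named x. A minimal standalone illustration of the stringizing form (not Grid code):

    // Demonstration of the stringizing DO_PRAGMA_ form: the macro argument
    // becomes the string literal that _Pragma requires.
    #include <cstdio>

    #define DO_PRAGMA_(x) _Pragma (#x)
    #define DO_PRAGMA(x)  DO_PRAGMA_(x)

    int main(void)
    {
      int sum = 0;
      DO_PRAGMA(omp parallel for reduction(+:sum))
      for (int i = 0; i < 100; i++) sum += i;
      printf("sum = %d\n", sum);   // 4950 regardless of thread count
      return 0;
    }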
@@ -36,19 +36,21 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

#define LMAX (40)
#define LMAX (8)
#define LMIN (8)
#define LADD (8)

int64_t Nwarm=10;
int64_t Nloop=100;
int64_t Nwarm=0;
int64_t Nloop=1;

Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();

int64_t threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
int64_t accelerator_threads = acceleratorThreads();

std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Grid is setup to use "<<accelerator_threads<<" GPU threads"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 x= x*y"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -222,6 +224,7 @@ int main (int argc, char ** argv)

}

#if 1
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 CovShiftForward(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -254,7 +257,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
}
#if 1
#endif

#if 0
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z= x * Cshift(y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
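For context on the Nwarm/Nloop change above (10/100 reduced to 0/1, presumably to get a single functional pass while bringing up the offload path), this is the usual shape of the warm-up plus timed-loop pattern those constants control; do_work() is a hypothetical stand-in, not the benchmark's actual kernel:

    // Sketch of the warm-up / timing structure governed by Nwarm and Nloop.
    #include <chrono>
    #include <cstdint>
    #include <cstdio>

    static volatile double sink;
    static void do_work(void) { sink = sink + 1.0; }   // placeholder kernel

    int main(void)
    {
      const int64_t Nwarm = 10, Nloop = 100;
      for (int64_t i = 0; i < Nwarm; i++) do_work();   // untimed warm-up passes

      auto start = std::chrono::steady_clock::now();
      for (int64_t i = 0; i < Nloop; i++) do_work();   // timed passes
      auto stop  = std::chrono::steady_clock::now();

      double us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
      printf("time per iteration = %f us\n", us / Nloop);
      return 0;
    }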
config-command (new file)
@@ -0,0 +1,8 @@
../configure \
--enable-comms=none \
--enable-simd=GEN \
--enable-gen-simd-width=16 \
CXX=clang++ \
LDFLAGS="-lcudart " \
CXXFLAGS="-fopenmp -std=c++14 -fopenmp-cuda-mode -O3 -target x86_64-pc-linux-gnu -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_70 -DOMPTARGET -DOMPTARGET_MANAGED"