diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 92af7111..ea187a1b 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -266,7 +266,8 @@ protected: T = Zero(); autoView(T_v,T,AcceleratorWrite); autoView(F_v,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, + int size=CloverTerm.Grid()->oSites(); + accelerator_for(i, size,1, { T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); @@ -284,7 +285,8 @@ protected: autoView(T_v, T,AcceleratorWrite); autoView(F_v, F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, + int size=CloverTerm.Grid()->oSites(); + accelerator_for(i, size,1, { T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()(); @@ -302,7 +304,8 @@ protected: autoView(T_v,T,AcceleratorWrite); autoView(F_v,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, + int size=CloverTerm.Grid()->oSites(); + accelerator_for(i, size,1, { T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()()); @@ -320,7 +323,8 @@ protected: autoView( T_v , T, AcceleratorWrite); autoView( F_v , F, AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, + int size=CloverTerm.Grid()->oSites(); + accelerator_for(i, size,1, { T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()()); @@ -338,7 +342,8 @@ protected: autoView( T_v ,T,AcceleratorWrite); autoView( F_v ,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, + int size=CloverTerm.Grid()->oSites(); + accelerator_for(i, size,1, { T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()()); @@ -357,7 +362,8 @@ protected: autoView( T_v , T,AcceleratorWrite); autoView( F_v , F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, + int size=CloverTerm.Grid()->oSites(); + accelerator_for(i, size,1, { T_v[i]()(0, 0) = timesI(F_v[i]()()); 
T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 2685796d..a2eaf7a9 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -110,8 +110,10 @@ public: autoView( out_v, out, AcceleratorWrite); autoView( phi_v, phi, AcceleratorRead); autoView( Umu_v, Umu, AcceleratorRead); + int size=out.Grid()->oSites(); + typedef decltype(coalescedRead(out_v[0])) calcSpinor; - accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ + accelerator_for(sss,size,Nsimd,{ calcSpinor tmp; multLink(tmp,Umu_v[sss],phi_v(sss),mu); coalescedWrite(out_v[sss],tmp); @@ -203,7 +205,8 @@ public: autoView( tmp_v , tmp, AcceleratorWrite); autoView( Btilde_v , Btilde, AcceleratorRead); autoView( Atilde_v , Atilde, AcceleratorRead); - accelerator_for(sss,tmp.Grid()->oSites(),1,{ + int size=tmp.Grid()->oSites(); + accelerator_for(sss,size,1,{ int sU=sss; for(int s=0;soSites(),Nsimd,{ + int size=mat.Grid()->oSites(); + accelerator_for(sss,size,Nsimd,{ int sU=sss; typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; ColorMatrixType sum; diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 2499e0e9..dc302829 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -88,7 +88,8 @@ public: static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W autoView(U_v,U,AcceleratorWrite); autoView(W_v,W,AcceleratorRead); - accelerator_for( ss, U.Grid()->oSites(), 1, { + int size=U.Grid()->oSites(); + accelerator_for( ss, size, 1, { U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); }); } @@ -133,7 +134,8 @@ public: //auto start = std::chrono::high_resolution_clock::now(); autoView(U_v,U,AcceleratorWrite); autoView(P_v,P,AcceleratorRead); - accelerator_for(ss, P.Grid()->oSites(),1,{ + int size=P.Grid()->oSites(); + accelerator_for(ss, size,1,{ for (int mu = 0; mu < Nd; mu++) { U_v[ss](mu) = 
ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); } diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f3b70e50..c0e07556 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -28,6 +28,9 @@ Author: paboyle /* END LEGAL */ #pragma once +//#ifndef ACCELERATOR_H +//#define ACCELERATOR_H + #include #ifdef HAVE_MALLOC_MALLOC_H @@ -434,30 +437,33 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas //OpenMP Target Offloading #ifdef OMPTARGET -uint32_t nt=acceleratorThreads(); + +//uint32_t gpu_threads=acceleratorThreads(); #define accelerator #define accelerator_inline strong_inline -#define accelerator_for(iterator,num,nsimd, ... ) \ - DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt)) \ +#define accelerator_for(i,num,nsimd, ... ) \ + _Pragma("omp target teams distribute parallel for") \ for ( uint64_t i=0;i inline void *acceleratorAllocShared(size_t bytes) { void *ptr=NULL; @@ -469,11 +475,14 @@ inline void *acceleratorAllocShared(size_t bytes) return ptr; }; inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);}; +inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr);}; #else inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; inline void acceleratorFreeShared(void *ptr){free(ptr);}; inline void acceleratorFreeDevice(void *ptr){free(ptr);}; +#endif //OpenMP CPU threads #else @@ -504,6 +513,7 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI inline void acceleratorFreeShared(void *ptr){free(ptr);}; inline void acceleratorFreeDevice(void *ptr){free(ptr);}; #endif +#endif #endif // CPU target @@ -567,3 +577,5 @@ accelerator_inline void acceleratorFence(void) } NAMESPACE_END(Grid); +//#endif + 
diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index a9fa13ea..685373e7 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -46,7 +46,7 @@ Author: paboyle #endif #ifdef GRID_OMP -#define DO_PRAGMA_(x) _Pragma (#x) +#define DO_PRAGMA_(x) _Pragma (#x) #define DO_PRAGMA(x) DO_PRAGMA_(x) #define thread_num(a) omp_get_thread_num() #define thread_max(a) omp_get_max_threads()