Mirror of https://github.com/paboyle/Grid.git
	Working simple OpenMP offloading with cudaMallocManaged; cshift not working
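This commit routes Grid's accelerator_for loops through OpenMP target offloading and, when OMPTARGET_MANAGED is defined, backs the shared allocator with CUDA managed memory (cudaMallocManaged) so host and device dereference the same pointers. As a minimal sketch of the call pattern the hunks below keep rewriting (out, out_v, in_v and scale are illustrative names, not identifiers from this commit), a Grid site loop is written as

    int size = out.Grid()->oSites();     // loop bound hoisted into a plain local
    accelerator_for(ss, size, 1, {
      out_v[ss] = scale * in_v[ss];      // per-site body, pasted into the offload region
    });

and the OMPTARGET branch of Accelerator.h turns it into an "omp target teams distribute parallel for" loop over ss rather than a CUDA/HIP/SYCL kernel launch.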
@@ -266,7 +266,8 @@ protected:
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    int size=CloverTerm.Grid()->oSites();
    accelerator_for(i, size,1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@@ -284,7 +285,8 @@ protected:

    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    int size=CloverTerm.Grid()->oSites();
    accelerator_for(i, size,1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
@@ -302,7 +304,8 @@ protected:

    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    int size=CloverTerm.Grid()->oSites();
    accelerator_for(i, size,1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
@@ -320,7 +323,8 @@ protected:

    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    int size=CloverTerm.Grid()->oSites();
    accelerator_for(i, size,1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
@@ -338,7 +342,8 @@ protected:

    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    int size=CloverTerm.Grid()->oSites();
    accelerator_for(i, size,1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
@@ -357,7 +362,8 @@ protected:

    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    int size=CloverTerm.Grid()->oSites();
    accelerator_for(i, size,1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
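Every hunk in this file makes the same change: the loop bound is no longer the expression CloverTerm.Grid()->oSites() written inside the accelerator_for call, but an int computed on the host beforehand. Because the OMPTARGET macro pastes the bound straight into the for-loop of an omp target region, an expression that chases the Grid pointer would have to be evaluated on the device, where CloverTerm and its Grid object are not mapped; a plain local scalar is instead passed firstprivate under the usual OpenMP rules. A minimal before/after sketch using the same names as the hunks above:

    // before: bound evaluated inside the offloaded loop header
    // accelerator_for(i, CloverTerm.Grid()->oSites(), 1, { ... });

    // after: bound computed once on the host and captured by value
    int size = CloverTerm.Grid()->oSites();
    accelerator_for(i, size, 1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());   // site-local clover component update
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
    });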

@@ -110,8 +110,10 @@ public:
    autoView( out_v, out, AcceleratorWrite);
    autoView( phi_v, phi, AcceleratorRead);
    autoView( Umu_v, Umu, AcceleratorRead);
    int size=out.Grid()->oSites();

    typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
    accelerator_for(sss,size,Nsimd,{
        calcSpinor tmp;
        multLink(tmp,Umu_v[sss],phi_v(sss),mu);
        coalescedWrite(out_v[sss],tmp);
@@ -203,7 +205,8 @@ public:
      autoView( tmp_v , tmp, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,tmp.Grid()->oSites(),1,{
      int size=tmp.Grid()->oSites();
      accelerator_for(sss,size,1,{
          int sU=sss;
          for(int s=0;s<Ls;s++){
            int sF = s+Ls*sU;
@@ -217,7 +220,8 @@ public:
      const int Nsimd = SiteSpinor::Nsimd();
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
      int size=mat.Grid()->oSites();
      accelerator_for(sss,size,Nsimd,{
          int sU=sss;
          typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
          ColorMatrixType sum;
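In the 5-d force hunk the accelerator_for still runs over the 4-d gauge sites: sU is the gauge-site index handed out per iteration, the fifth dimension is walked in the inner loop, and the fermion site is recovered as sF = s + Ls*sU. For example, with Ls = 8 the slices belonging to gauge site sU = 3 occupy fermion indices 24..31, and slice s = 2 lands at sF = 2 + 8*3 = 26.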

@@ -88,7 +88,8 @@ public:
  static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
    autoView(U_v,U,AcceleratorWrite);
    autoView(W_v,W,AcceleratorRead);
    accelerator_for( ss, U.Grid()->oSites(), 1, {
    int size=U.Grid()->oSites();
    accelerator_for( ss, size, 1, {
      U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
    });
  }
@@ -133,7 +134,8 @@ public:
    //auto start = std::chrono::high_resolution_clock::now();
    autoView(U_v,U,AcceleratorWrite);
    autoView(P_v,P,AcceleratorRead);
    accelerator_for(ss, P.Grid()->oSites(),1,{
    int size=P.Grid()->oSites();
    accelerator_for(ss, size,1,{
      for (int mu = 0; mu < Nd; mu++) {
        U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
      }
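Note that in each rewritten loop the fields are only ever touched through views (U_v, P_v, W_v) opened with autoView before the accelerator_for; the Lattice objects themselves never appear in the body. The views are small handles over the field data, so they are what the offload region ends up capturing, and the AcceleratorWrite/AcceleratorRead mode records the intended access direction for Grid's view bookkeeping (a behavioural summary, not text from this diff). The resulting read-modify-write on a link, with the same names as the AddLink hunk:

    autoView(U_v, U, AcceleratorWrite);   // writable view of the gauge field
    autoView(W_v, W, AcceleratorRead);    // read-only view of the increment
    int size = U.Grid()->oSites();
    accelerator_for(ss, size, 1, {
      U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();   // U[mu] += W, site by site
    });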

@@ -28,6 +28,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/*  END LEGAL */
#pragma once

//#ifndef ACCELERATOR_H
//#define ACCELERATOR_H

#include <string.h>

#ifdef HAVE_MALLOC_MALLOC_H
@@ -434,30 +437,33 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas

//OpenMP Target Offloading
#ifdef OMPTARGET
uint32_t nt=acceleratorThreads();

//uint32_t gpu_threads=acceleratorThreads();

#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... )  \
	DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt)) \
#define accelerator_for(i,num,nsimd, ... ) \
	_Pragma("omp target teams distribute parallel for") \
	for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_forNB(iterator,num,nsimd, ... ) \
	DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt) nowait) \
#define accelerator_forNB(i,num,nsimd, ... ) \
	_Pragma("omp target teams distribute parallel for nowait") \
        for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_barrier(dummy) DO_PRAGMA(omp barrier)
#define accelerator_barrier(dummy) _Pragma("omp barrier")
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
	DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt) collapse(2)) \
        for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
	_Pragma("omp target teams distribute parallel for collapse(2)") \
        for ( uint64_t iter1=0;iter1<num1;iter1++) \
	for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { DO_PRAGMA(omp target enter data map(to:from[0:bytes])}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ DO_PRAGMA(omp target exit data map(from:from[0:bytes])}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  {;}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){;}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronize(void) {};
inline void acceleratorCopySynchronize(void) {;};

inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef OMPTARGET_MANAGED
#include <cuda_runtime_api.h>
inline void *acceleratorAllocShared(size_t bytes)
{
  void *ptr=NULL;
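The core of the port is in these macro definitions: the old DO_PRAGMA(...) forms, which also imposed thread_limit(nt), are replaced by hand-written _Pragma string literals, the nsimd argument is accepted but ignored, and the explicit host/device copy helpers become no-ops on the assumption that managed memory keeps both sides coherent. Written out by hand from the definition above (not compiler output), a call such as

    accelerator_for(ss, size, 1, { out_v[ss] = in_v[ss]; });   // out_v, in_v illustrative

expands to roughly

    #pragma omp target teams distribute parallel for
    for (uint64_t ss = 0; ss < size; ss++) { out_v[ss] = in_v[ss]; };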
@@ -469,11 +475,14 @@ inline void *acceleratorAllocShared(size_t bytes)
  return ptr;
};
inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#else
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif

//OpenMP CPU threads
#else
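The body of acceleratorAllocShared sits between the two hunks, so the diff shows only its skeleton; given the commit message, the cuda_runtime_api.h include and the cudaFree in the matching free routine, the OMPTARGET_MANAGED branch presumably allocates along these lines (a sketch with assumed error handling, not the verbatim source):

    inline void *acceleratorAllocShared(size_t bytes)
    {
      void *ptr = NULL;
      cudaError_t err = cudaMallocManaged(&ptr, bytes);   // unified (managed) allocation
      if (err != cudaSuccess) ptr = NULL;                  // real code may also report the error
      return ptr;
    }

Because a managed pointer is valid on both host and device, this is what allows the acceleratorCopyToDevice/acceleratorCopyFromDevice helpers above to degrade to empty stubs.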
@@ -504,6 +513,7 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif

#endif // CPU target

@@ -567,3 +577,5 @@ accelerator_inline void acceleratorFence(void)
}

NAMESPACE_END(Grid);
//#endif

@@ -46,7 +46,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#endif

#ifdef GRID_OMP
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA_(x) _Pragma ("x")
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
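One change worth flagging: DO_PRAGMA_ switches from _Pragma (#x) to _Pragma ("x"). The stringizing form turns the macro argument into the pragma text, while the quoted form always emits the literal pragma "x" and ignores the argument, so the surviving DO_PRAGMA(...) call sites no longer generate their OpenMP pragmas (standard C99 _Pragma behaviour, illustrated below; the OMPTARGET macros above sidestep this by writing explicit _Pragma strings directly):

    #define DO_PRAGMA_(x) _Pragma(#x)         // #x stringizes the argument
    #define DO_PRAGMA(x)  DO_PRAGMA_(x)
    DO_PRAGMA(omp parallel for)               // -> _Pragma("omp parallel for") -> #pragma omp parallel for

    #define DO_PRAGMA_QUOTED(x) _Pragma("x")  // parameter is not expanded inside a string literal
    DO_PRAGMA_QUOTED(omp parallel for)        // -> _Pragma("x"): an unknown pragma, typically ignored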