Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-01 04:24:32 +00:00)
	Merged openmp offload implementation with develop
@@ -110,8 +110,10 @@ public:
     autoView( out_v, out, AcceleratorWrite);
     autoView( phi_v, phi, AcceleratorRead);
     autoView( Umu_v, Umu, AcceleratorRead);
+    int size=out.Grid()->oSites();
+
     typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
-    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+    accelerator_for(sss,size,Nsimd,{
 	calcSpinor tmp;
 	multLink(tmp,Umu_v[sss],phi_v(sss),mu);
 	coalescedWrite(out_v[sss],tmp);
@@ -203,7 +205,8 @@ public:
       autoView( tmp_v , tmp, AcceleratorWrite);
       autoView( Btilde_v , Btilde, AcceleratorRead);
       autoView( Atilde_v , Atilde, AcceleratorRead);
-      accelerator_for(sss,tmp.Grid()->oSites(),1,{
+      int size=tmp.Grid()->oSites();
+      accelerator_for(sss,size,1,{
 	  int sU=sss;
 	  for(int s=0;s<Ls;s++){
 	    int sF = s+Ls*sU;
@@ -217,7 +220,8 @@ public:
       const int Nsimd = SiteSpinor::Nsimd();
       autoView( Btilde_v , Btilde, AcceleratorRead);
       autoView( Atilde_v , Atilde, AcceleratorRead);
-      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
+      int size=mat.Grid()->oSites();
+      accelerator_for(sss,size,Nsimd,{
 	  int sU=sss;
   	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
   	  ColorMatrixType sum;

@@ -90,7 +90,8 @@ public:
   static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
     autoView(U_v,U,AcceleratorWrite);
     autoView(W_v,W,AcceleratorRead);
-    accelerator_for( ss, U.Grid()->oSites(), 1, {
+    int size=U.Grid()->oSites();
+    accelerator_for( ss, size, 1, {
       U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
     });
   }
@@ -135,7 +136,8 @@ public:
     //auto start = std::chrono::high_resolution_clock::now();
     autoView(U_v,U,AcceleratorWrite);
     autoView(P_v,P,AcceleratorRead);
-    accelerator_for(ss, P.Grid()->oSites(),1,{
+    int size=P.Grid()->oSites();
+    accelerator_for(ss, size,1,{
       for (int mu = 0; mu < Nd; mu++) {
         U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
       }

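The change repeated through the fermion- and gauge-implementation hunks above hoists the lattice volume into a plain local, int size = ...Grid()->oSites();, before the kernel is launched, presumably so that under the OpenMP target-offload path added below the loop bound reaching the omp target pragma is a host-evaluated integer captured by value rather than a call through the host-side Grid() pointer from device code. A minimal standalone sketch of the same pattern (not Grid code, using a plain std::vector in place of Grid's lattice views):

    // Minimal sketch (not Grid code): hoist the trip count into a plain local so the
    // OpenMP target region captures an integer by value instead of calling back into
    // a host-side object for its loop bound.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> out(64, 1.0);
      double  *p    = out.data();
      uint64_t size = out.size();   // evaluated on the host, like: int size = out.Grid()->oSites();
    #pragma omp target teams distribute parallel for map(tofrom: p[0:size])
      for (uint64_t i = 0; i < size; i++) {
        p[i] *= 2.0;                // plays the role of the accelerator_for body
      }
      printf("out[0] = %f\n", out[0]);
      return 0;
    }
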
@@ -26,8 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#pragma once
 
+#ifndef ACCELERATOR_H
+#define ACCELERATOR_H
+
+#pragma once
 #include <string.h>
 
 #ifdef HAVE_MALLOC_MALLOC_H
@@ -471,7 +474,70 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 
 #undef GRID_SIMT
 
+//OpenMP Target Offloading
+#ifdef OMPTARGET
+#define THREAD_LIMIT acceleratorThreads()
+
+#define accelerator
+#define accelerator_inline strong_inline
+#ifdef THREAD_LIMIT
+#define accelerator_for(i,num,nsimd, ... ) \
+	_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT)") \
+	for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define accelerator_forNB(i,num,nsimd, ... ) \
+	_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) nowait") \
+        for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define accelerator_barrier(dummy) _Pragma("omp barrier")
+#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
+	_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) collapse(2)") \
+        for ( uint64_t iter1=0;iter1<num1;iter1++) \
+	for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
+#else
+#define accelerator_for(i,num,nsimd, ... ) \
+        _Pragma("omp target teams distribute parallel for") \
+        for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define accelerator_forNB(i,num,nsimd, ... ) \
+        _Pragma("omp target teams distribute parallel for nowait") \
+        for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define accelerator_barrier(dummy) _Pragma("omp barrier")
+#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
+        _Pragma("omp target teams distribute parallel for collapse(2)") \
+        for ( uint64_t iter1=0;iter1<num1;iter1++) \
+        for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
+#endif
+
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  {;}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){;}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+inline void acceleratorCopySynchronize(void) {;};
+
+inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
+#ifdef OMPTARGET_MANAGED
+#include <cuda_runtime_api.h>
+inline void *acceleratorAllocShared(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = cudaMallocManaged((void **)&ptr,bytes);
+  if( err != cudaSuccess ) {
+    ptr = (void *) NULL;
+    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+  }
+  return ptr;
+};
+inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
+inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+#else
+inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
+inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
+inline void acceleratorFreeShared(void *ptr){free(ptr);};
+inline void acceleratorFreeDevice(void *ptr){free(ptr);};
+#endif
+
+//OpenMP CPU threads
+#else
+
 #define accelerator 
 #define accelerator_inline strong_inline
@@ -500,6 +566,7 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI
 inline void acceleratorFreeShared(void *ptr){free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr);};
 #endif
+#endif
 
 #endif // CPU target
 
@@ -566,3 +633,5 @@ accelerator_inline void acceleratorFence(void)
 }
 
 NAMESPACE_END(Grid);
+#endif
+

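In the new OMPTARGET branch the accelerator_for family lowers to ordinary omp target teams distribute parallel for loops, the explicit host/device copy routines become no-ops (the build relies on managed or host-resident memory), and acceleratorAllocShared either uses cudaMallocManaged (with -DOMPTARGET_MANAGED) or falls back to memalign. A stripped-down, self-contained sketch of the macro idea, with hypothetical names rather than the Grid macros themselves:

    // Minimal sketch (hypothetical my_accelerator_for, not the Grid macro): a loop
    // wrapper that lowers to an OpenMP target construct, pasting the body verbatim.
    #include <cstdint>
    #include <cstdio>

    #define my_accelerator_for(i, num, ...)                  \
      _Pragma("omp target teams distribute parallel for")    \
      for (uint64_t i = 0; i < (num); i++) { __VA_ARGS__ }

    int main() {
      const uint64_t n = 16;
      double x[16], y[16];
      for (uint64_t i = 0; i < n; i++) x[i] = double(i);
      // With an offload-capable toolchain the loop runs on the device; with a plain
      // OpenMP host compiler the same pragma simply executes on the CPU.
      my_accelerator_for(i, n, { y[i] = 2.0 * x[i]; });
      printf("y[3] = %f\n", y[3]);
      return 0;
    }
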
@@ -46,7 +46,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #endif
 
 #ifdef GRID_OMP
-#define DO_PRAGMA_(x) _Pragma (#x)
+#define DO_PRAGMA_(x) _Pragma ("x")
 #define DO_PRAGMA(x) DO_PRAGMA_(x)
 #define thread_num(a) omp_get_thread_num()
 #define thread_max(a) omp_get_max_threads()

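For context on the DO_PRAGMA_ helper touched here: with the stringizing form #x the preprocessor converts the macro argument into the pragma text, so DO_PRAGMA(omp for) yields _Pragma("omp for"), whereas the literal form "x" always produces the fixed pragma x regardless of the argument. A minimal sketch with hypothetical names, not taken from Grid:

    // Minimal preprocessor sketch (hypothetical names, not Grid code).
    #include <cstdio>

    #define PRAGMA_STRINGIZED(x) _Pragma(#x)    // PRAGMA_STRINGIZED(omp parallel for) -> #pragma omp parallel for
    #define PRAGMA_LITERAL(x)    _Pragma("x")   // always emits the literal pragma x, whatever the argument is

    int main() {
      double a[100];
      PRAGMA_STRINGIZED(omp parallel for)
      for (int i = 0; i < 100; i++) a[i] = 2.0 * i;
      printf("a[10] = %f\n", a[10]);
      return 0;
    }
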
@@ -36,19 +36,21 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-#define LMAX (40)
+#define LMAX (8)
 #define LMIN (8)
 #define LADD (8)
 
-  int64_t Nwarm=10;
-  int64_t Nloop=100;
+  int64_t Nwarm=0;
+  int64_t Nloop=1;
 
   Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   Coordinate mpi_layout  = GridDefaultMpi();
 
   int64_t threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  int64_t accelerator_threads = acceleratorThreads();
 
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Grid is setup to use "<<accelerator_threads<<" GPU threads"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -222,6 +224,7 @@ int main (int argc, char ** argv)
 
     }
 
+#if 1
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -254,7 +257,9 @@ int main (int argc, char ** argv)
 	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
   }
-#if 1
+#endif
+
+#if 0
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

							
								
								
									
config-command  (new file, 8 lines)
@@ -0,0 +1,8 @@
+../configure \
+    --enable-comms=none \
+    --enable-simd=GEN \
+    --enable-gen-simd-width=16 \
+    CXX=clang++ \
+    LDFLAGS="-lcudart " \
+    CXXFLAGS="-fopenmp -std=c++14 -fopenmp-cuda-mode  -O3 -target x86_64-pc-linux-gnu -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_70 -DOMPTARGET -DOMPTARGET_MANAGED"
+
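The saved configure command is presumably meant to be run from a separate build directory next to the source tree (hence the relative ../configure), followed by the usual make. The -fopenmp-targets=nvptx64-nvidia-cuda and -march=sm_70 flags assume a clang toolchain with the NVPTX offload runtime and a Volta-class NVIDIA GPU; -DOMPTARGET and -DOMPTARGET_MANAGED select the OpenMP-offload code path and the cudaMallocManaged allocator added in the accelerator header above, which is also why -lcudart appears in both LDFLAGS and CXXFLAGS.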