mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-30 19:44:32 +00:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			feature/su
			...
			feature/cp
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 92a83a9eb3 | ||
|  | e16fc5b2e4 | ||
|  | 694306f202 | 
| @@ -142,15 +142,6 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) | |||||||
|   return sumD_cpu(arg,osites); |   return sumD_cpu(arg,osites); | ||||||
| #endif   | #endif   | ||||||
| } | } | ||||||
| template<class vobj> |  | ||||||
| inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) |  | ||||||
| { |  | ||||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) |  | ||||||
|   return sumD_gpu_large(arg,osites); |  | ||||||
| #else |  | ||||||
|   return sumD_cpu(arg,osites); |  | ||||||
| #endif   |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||||
| @@ -168,22 +159,6 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | |||||||
|   return ssum; |   return ssum; | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> |  | ||||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) |  | ||||||
| { |  | ||||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) |  | ||||||
|   autoView( arg_v, arg, AcceleratorRead); |  | ||||||
|   Integer osites = arg.Grid()->oSites(); |  | ||||||
|   auto ssum= sum_gpu_large(&arg_v[0],osites); |  | ||||||
| #else |  | ||||||
|   autoView(arg_v, arg, CpuRead); |  | ||||||
|   Integer osites = arg.Grid()->oSites(); |  | ||||||
|   auto ssum= sum_cpu(&arg_v[0],osites); |  | ||||||
| #endif |  | ||||||
|   arg.Grid()->GlobalSum(ssum); |  | ||||||
|   return ssum; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Deterministic Reduction operations | // Deterministic Reduction operations | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) { | |||||||
| } | } | ||||||
|  |  | ||||||
| template <class Iterator> | template <class Iterator> | ||||||
| int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { | void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { | ||||||
|    |    | ||||||
|   int device; |   int device; | ||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
| @@ -37,13 +37,13 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & | |||||||
|   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock; |   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock; | ||||||
|   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock; |   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock; | ||||||
|   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount; |   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount; | ||||||
|   /*   |    | ||||||
|   std::cout << GridLogDebug << "GPU has:" << std::endl; |   std::cout << GridLogDebug << "GPU has:" << std::endl; | ||||||
|   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl; |   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl; | ||||||
|   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl; |   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl; | ||||||
|   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl; |   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl; | ||||||
|   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl; |   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl; | ||||||
|   */   |    | ||||||
|   if (warpSize != WARP_SIZE) { |   if (warpSize != WARP_SIZE) { | ||||||
|     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl; |     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl; | ||||||
|     exit(EXIT_FAILURE); |     exit(EXIT_FAILURE); | ||||||
| @@ -53,12 +53,12 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & | |||||||
|   threads = warpSize; |   threads = warpSize; | ||||||
|   if ( threads*sizeofsobj > sharedMemPerBlock ) { |   if ( threads*sizeofsobj > sharedMemPerBlock ) { | ||||||
|     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl; |     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl; | ||||||
|     return 0; |     exit(EXIT_FAILURE); | ||||||
|   } |   } | ||||||
|   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; |   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; | ||||||
|   // keep all the streaming multiprocessors busy |   // keep all the streaming multiprocessors busy | ||||||
|   blocks = nextPow2(multiProcessorCount); |   blocks = nextPow2(multiProcessorCount); | ||||||
|   return 1; |    | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class sobj, class Iterator> | template <class sobj, class Iterator> | ||||||
| @@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { | |||||||
| // Possibly promote to double and sum | // Possibly promote to double and sum | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| template <class vobj> | template <class vobj> | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)  | inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_objectD sobj; |   typedef typename vobj::scalar_objectD sobj; | ||||||
|   typedef decltype(lat) Iterator; |   typedef decltype(lat) Iterator; | ||||||
| @@ -207,9 +207,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi | |||||||
|   Integer size = osites*nsimd; |   Integer size = osites*nsimd; | ||||||
|  |  | ||||||
|   Integer numThreads, numBlocks; |   Integer numThreads, numBlocks; | ||||||
|   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); |   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); | ||||||
|   assert(ok); |  | ||||||
|  |  | ||||||
|   Integer smemSize = numThreads * sizeof(sobj); |   Integer smemSize = numThreads * sizeof(sobj); | ||||||
|  |  | ||||||
|   Vector<sobj> buffer(numBlocks); |   Vector<sobj> buffer(numBlocks); | ||||||
| @@ -220,54 +218,6 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi | |||||||
|   auto result = buffer_v[0]; |   auto result = buffer_v[0]; | ||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class vobj> |  | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::vector_type  vector; |  | ||||||
|   typedef typename vobj::scalar_typeD scalarD; |  | ||||||
|   typedef typename vobj::scalar_objectD sobj; |  | ||||||
|   sobj ret; |  | ||||||
|   scalarD *ret_p = (scalarD *)&ret; |  | ||||||
|    |  | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |  | ||||||
|  |  | ||||||
|   Vector<vector> buffer(osites); |  | ||||||
|   vector *dat = (vector *)lat; |  | ||||||
|   vector *buf = &buffer[0]; |  | ||||||
|   iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0]; |  | ||||||
|   for(int w=0;w<words;w++) { |  | ||||||
|  |  | ||||||
|     accelerator_for(ss,osites,1,{ |  | ||||||
| 	buf[ss] = dat[ss*words+w]; |  | ||||||
|       }); |  | ||||||
|        |  | ||||||
|     ret_p[w] = sumD_gpu_small(tbuf,osites); |  | ||||||
|   } |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class vobj> |  | ||||||
| inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::vector_type  vector; |  | ||||||
|   typedef typename vobj::scalar_typeD scalarD; |  | ||||||
|   typedef typename vobj::scalar_objectD sobj; |  | ||||||
|   sobj ret; |  | ||||||
|    |  | ||||||
|   Integer nsimd= vobj::Nsimd(); |  | ||||||
|   Integer size = osites*nsimd; |  | ||||||
|   Integer numThreads, numBlocks; |  | ||||||
|   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); |  | ||||||
|    |  | ||||||
|   if ( ok ) { |  | ||||||
|     ret = sumD_gpu_small(lat,osites); |  | ||||||
|   } else { |  | ||||||
|     ret = sumD_gpu_large(lat,osites); |  | ||||||
|   } |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Return as same precision as input performing reduction in double precision though | // Return as same precision as input performing reduction in double precision though | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -280,13 +230,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) | |||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class vobj> |  | ||||||
| inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::scalar_object sobj; |  | ||||||
|   sobj result; |  | ||||||
|   result = sumD_gpu_large(lat,osites); |  | ||||||
|   return result; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -481,9 +481,10 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); | |||||||
| #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ }); | #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ }); | ||||||
|  |  | ||||||
| accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific | accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific | ||||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);} |  | ||||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} | inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); } | ||||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);} | inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} | ||||||
|  | inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);} | ||||||
| inline void acceleratorCopySynchronise(void) {}; | inline void acceleratorCopySynchronise(void) {}; | ||||||
|  |  | ||||||
| inline int  acceleratorIsCommunicable(void *ptr){ return 1; } | inline int  acceleratorIsCommunicable(void *ptr){ return 1; } | ||||||
|   | |||||||
| @@ -72,3 +72,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #define thread_region                                       DO_PRAGMA(omp parallel) | #define thread_region                                       DO_PRAGMA(omp parallel) | ||||||
| #define thread_critical                                     DO_PRAGMA(omp critical) | #define thread_critical                                     DO_PRAGMA(omp critical) | ||||||
|  |  | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | inline void thread_bcopy(void *from, void *to,size_t bytes) | ||||||
|  | { | ||||||
|  |   uint64_t *ufrom = (uint64_t *)from; | ||||||
|  |   uint64_t *uto   = (uint64_t *)to; | ||||||
|  |   assert(bytes%8==0); | ||||||
|  |   uint64_t words=bytes/8; | ||||||
|  |   thread_for(w,words,{ | ||||||
|  |       uto[w] = ufrom[w]; | ||||||
|  |   }); | ||||||
|  | } | ||||||
|  | #else | ||||||
|  | inline void thread_bcopy(void *from, void *to,size_t bytes) | ||||||
|  | { | ||||||
|  |   bcopy(from,to,bytes); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								systems/mac-arm/config-command-mpi
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								systems/mac-arm/config-command-mpi
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | |||||||
|  | CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi | ||||||
		Reference in New Issue
	
	Block a user