mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-24 17:54:47 +01:00 
			
		
		
		
	Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs
This commit is contained in:
		| @@ -1,4 +1,5 @@ | ||||
| #pragma once | ||||
| #include <type_traits> | ||||
| #if defined(GRID_CUDA) | ||||
|  | ||||
| #include <cub/cub.cuh> | ||||
| @@ -26,20 +27,16 @@ | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|  | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||
|   size_t subvol_size = e1*e2; | ||||
|  | ||||
|   commVector<vobj> reduction_buffer(rd*subvol_size); | ||||
|   auto rb_p = &reduction_buffer[0]; | ||||
|   vobj zero_init; | ||||
|   zeroit(zero_init); | ||||
|  | ||||
|   vobj vobj_zero; //Need to provide initial value for reduction operation | ||||
|   zeroit(vobj_zero); | ||||
|    | ||||
|  | ||||
|   void *temp_storage_array = NULL; | ||||
|   size_t temp_storage_bytes = 0; | ||||
|   vobj *d_out; | ||||
| @@ -71,8 +68,8 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data | ||||
|     exit(EXIT_FAILURE); | ||||
|   } | ||||
|  | ||||
|   //determine temp_storage_array size | ||||
|   gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), vobj_zero, computeStream); | ||||
|    | ||||
|   gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); | ||||
|   if (gpuErr!=gpuSuccess) { | ||||
|     std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl; | ||||
|     exit(EXIT_FAILURE); | ||||
| @@ -85,11 +82,9 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data | ||||
|     exit(EXIT_FAILURE); | ||||
|   } | ||||
|    | ||||
|   autoView( Data_v, Data, AcceleratorRead); | ||||
|   //prepare buffer for reduction | ||||
|   //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) | ||||
|   //use 2d accelerator_for to avoid launch latencies found when serially looping over rd  | ||||
|    | ||||
|   accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{  | ||||
|    | ||||
|     int n = s / e2; | ||||
| @@ -97,12 +92,12 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data | ||||
|     int so=r*ostride; // base offset for start of plane  | ||||
|     int ss= so+n*stride+b; | ||||
|  | ||||
|     coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); | ||||
|     coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||
|  | ||||
|   }); | ||||
|    | ||||
|   //issue segmented reductions in computeStream | ||||
|   gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), vobj_zero, computeStream); | ||||
|   gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream); | ||||
|   if (gpuErr!=gpuSuccess) { | ||||
|     std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl; | ||||
|     exit(EXIT_FAILURE); | ||||
| @@ -119,6 +114,48 @@ template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data | ||||
|   | ||||
|  | ||||
| } | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||
|   typedef typename vobj::vector_type vector; | ||||
|   const int words = sizeof(vobj)/sizeof(vector); | ||||
|   const int osites = rd*e1*e2; | ||||
|   Vector<vector>buffer(osites); | ||||
|   vector *dat = (vector *)Data; | ||||
|   vector *buf = &buffer[0]; | ||||
|   Vector<vector> lvSum_small(rd); | ||||
|   vector *lvSum_ptr = (vector *)&lvSum[0]; | ||||
|  | ||||
|   for (int w = 0; w < words; w++) { | ||||
|     accelerator_for(ss,osites,1,{ | ||||
| 	    buf[ss] = dat[ss*words+w]; | ||||
|     }); | ||||
|  | ||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|        | ||||
|     for (int r = 0; r < rd; r++) { | ||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||
|     } | ||||
|  | ||||
|   } | ||||
|  | ||||
|    | ||||
| } | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||
| { | ||||
|   autoView(Data_v, Data, AcceleratorRead); | ||||
|   #if defined(GRID_CUDA) | ||||
|     sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|  | ||||
|   #elif defined (GRID_HIP) //hipcub cannot deal with large vobjs that don't fit in shared memory, therefore separate into _small/_large.  | ||||
|     if constexpr (sizeof(vobj) <= 256) { | ||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|     } | ||||
|     else { | ||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|     } | ||||
|   #endif | ||||
| } | ||||
| #endif | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -140,19 +140,21 @@ int main (int argc, char ** argv) { | ||||
|     } | ||||
|     traceStop(trace_id); | ||||
|  | ||||
|     LatticeColourVectorD test_data_cv(&Grid); | ||||
|     LatticeSpinVectorD test_data_cv(&Grid); | ||||
|     gaussian(pRNG,test_data_cv); | ||||
|  | ||||
|     std::vector<ColourVectorD> reduction_reference_cv; | ||||
|     std::vector<ColourVectorD> reduction_result_cv; | ||||
|     std::vector<SpinVectorD> reduction_reference_cv; | ||||
|     std::vector<SpinVectorD> reduction_result_cv; | ||||
|  | ||||
|     //warmup | ||||
|     for (int sweeps = 0; sweeps < 5; sweeps++) { | ||||
|       reduction_result_cv = sliceSum(test_data_cv,0); | ||||
|     } | ||||
|     trace_id = traceStart("sliceSum benchmark - ColourVectorD"); | ||||
|     trace_id = traceStart("sliceSum benchmark - SpinVectorD"); | ||||
|  | ||||
|     std::cout << GridLogMessage << "Testing ColourVectorD" << std::endl; | ||||
|     std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl; | ||||
|     std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl; | ||||
|     std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl; | ||||
|     for (int i = 0; i < Nd; i++) { | ||||
|  | ||||
|       RealD t=-usecond(); | ||||
| @@ -180,9 +182,10 @@ int main (int argc, char ** argv) { | ||||
|       for(int t=0;t<reduction_reference_cv.size();t++) { | ||||
|  | ||||
|         auto diff = reduction_reference_cv[t]-reduction_result_cv[t]; | ||||
|         assert(abs(diff()()(0)) < 1e-8 ); | ||||
|         assert(abs(diff()()(1)) < 1e-8 ); | ||||
|         assert(abs(diff()()(2)) < 1e-8 ); | ||||
|         assert(abs(diff()(0)()) < 1e-8 ); | ||||
|         assert(abs(diff()(1)()) < 1e-8 ); | ||||
|         assert(abs(diff()(2)()) < 1e-8 ); | ||||
|         assert(abs(diff()(3)()) < 1e-8 ); | ||||
|  | ||||
|       } | ||||
|  | ||||
| @@ -203,6 +206,8 @@ int main (int argc, char ** argv) { | ||||
|     trace_id = traceStart("sliceSum benchmark - SpinColourVectorD"); | ||||
|  | ||||
|     std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl; | ||||
|     std::cout << GridLogMessage << "sizeof(SpinColourVectorD) = " << sizeof(SpinColourVectorD) << std::endl; | ||||
|     std::cout << GridLogMessage << "sizeof(vSpinColourVectorD) = " << sizeof(vSpinColourVectorD) << std::endl; | ||||
|     for (int i = 0; i < Nd; i++) { | ||||
|  | ||||
|       RealD t=-usecond(); | ||||
| @@ -230,6 +235,7 @@ int main (int argc, char ** argv) { | ||||
|       for(int t=0;t<reduction_reference_scv.size();t++) { | ||||
|  | ||||
|         auto diff = reduction_reference_scv[t]-reduction_result_scv[t]; | ||||
|         // std::cout << diff <<std::endl; | ||||
|         assert(abs(diff()(0)(0)) < 1e-8 ); | ||||
|         assert(abs(diff()(0)(1)) < 1e-8 ); | ||||
|         assert(abs(diff()(0)(2)) < 1e-8 ); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user