mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 13:34:33 +00:00 
			
		
		
		
	Compare commits
	
		
			6 Commits
		
	
	
		
			hotfix/unw
			...
			da81a73b4a
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					da81a73b4a | ||
| 
						 | 
					461cd045c6 | ||
| 
						 | 
					fee65d7a75 | ||
| 
						 | 
					31f9971dbf | ||
| 
						 | 
					d87296f3e8 | ||
| 
						 | 
					be94cf1c6f | 
@@ -1,5 +1,5 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
#include <type_traits>
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_CUDA)
 | 
			
		||||
 | 
			
		||||
#include <cub/cub.cuh>
 | 
			
		||||
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_SYCL)
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  size_t subvol_size = e1*e2;
 | 
			
		||||
 | 
			
		||||
  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
 | 
			
		||||
  vobj vobj_zero;
 | 
			
		||||
  zeroit(vobj_zero);
 | 
			
		||||
  for (int r = 0; r<rd; r++) { 
 | 
			
		||||
    mysum[r] = vobj_zero; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  commVector<vobj> reduction_buffer(rd*subvol_size);    
 | 
			
		||||
 | 
			
		||||
  auto rb_p = &reduction_buffer[0];
 | 
			
		||||
 | 
			
		||||
  // autoView(Data_v, Data, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  //prepare reduction buffer 
 | 
			
		||||
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
 | 
			
		||||
  
 | 
			
		||||
      int n = s / e2;
 | 
			
		||||
      int b = s % e2;
 | 
			
		||||
      int so=r*ostride; // base offset for start of plane 
 | 
			
		||||
      int ss= so+n*stride+b;
 | 
			
		||||
 | 
			
		||||
      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
 | 
			
		||||
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
 | 
			
		||||
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 | 
			
		||||
          Reduction,
 | 
			
		||||
          [=](cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
              auto s = item[0];
 | 
			
		||||
              sum += rb_p[r*subvol_size+s];
 | 
			
		||||
          });
 | 
			
		||||
      });
 | 
			
		||||
      
 | 
			
		||||
     
 | 
			
		||||
  }
 | 
			
		||||
  theGridAccelerator->wait();
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
    lvSum[r] = mysum[r];
 | 
			
		||||
  }
 | 
			
		||||
  free(mysum,*theGridAccelerator);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
 | 
			
		||||
  typedef typename vobj::vector_type vector;
 | 
			
		||||
  const int words = sizeof(vobj)/sizeof(vector);
 | 
			
		||||
  const int osites = rd*e1*e2;
 | 
			
		||||
@@ -106,7 +159,11 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 | 
			
		||||
	    buf[ss] = dat[ss*words+w];
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    #if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
      sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
 | 
			
		||||
    #elif defined(GRID_SYCL)
 | 
			
		||||
      sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    for (int r = 0; r < rd; r++) {
 | 
			
		||||
      lvSum_ptr[w+words*r]=lvSum_small[r];
 | 
			
		||||
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
 | 
			
		||||
  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
 | 
			
		||||
    if constexpr (sizeof(vobj) <= 256) { 
 | 
			
		||||
 | 
			
		||||
      #if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
        sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
      #elif defined (GRID_SYCL)
 | 
			
		||||
        sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
      #endif
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
      sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_SYCL)
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  size_t subvol_size = e1*e2;
 | 
			
		||||
 | 
			
		||||
  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
 | 
			
		||||
  vobj vobj_zero;
 | 
			
		||||
  zeroit(vobj_zero);
 | 
			
		||||
    
 | 
			
		||||
  commVector<vobj> reduction_buffer(rd*subvol_size);    
 | 
			
		||||
 | 
			
		||||
  auto rb_p = &reduction_buffer[0];
 | 
			
		||||
 | 
			
		||||
  autoView(Data_v, Data, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  //prepare reduction buffer 
 | 
			
		||||
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
 | 
			
		||||
  
 | 
			
		||||
      int n = s / e2;
 | 
			
		||||
      int b = s % e2;
 | 
			
		||||
      int so=r*ostride; // base offset for start of plane 
 | 
			
		||||
      int ss= so+n*stride+b;
 | 
			
		||||
 | 
			
		||||
      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
 | 
			
		||||
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
 | 
			
		||||
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
 | 
			
		||||
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 | 
			
		||||
          Reduction,
 | 
			
		||||
          [=](cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
              auto s = item[0];
 | 
			
		||||
              sum += rb_p[r*subvol_size+s];
 | 
			
		||||
          });
 | 
			
		||||
      });
 | 
			
		||||
      theGridAccelerator->wait();
 | 
			
		||||
      lvSum[r] = mysum[0];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  free(mysum,*theGridAccelerator);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 | 
			
		||||
{
 | 
			
		||||
  #if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
 | 
			
		||||
  
 | 
			
		||||
  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
  
 | 
			
		||||
  #elif defined(GRID_SYCL)
 | 
			
		||||
  
 | 
			
		||||
  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
  
 | 
			
		||||
  #else
 | 
			
		||||
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user