Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2026-06-18 18:03:44 +01:00 · 2024-07-23 09:53:58 -04:00
parent 4563b39305 41d8adca95
commit b461184797
14 changed files with 307 additions and 47 deletions
@@ -89,9 +89,10 @@ public:
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::cpu_selector selector;
+      cl::sycl::gpu_selector selector;
      cl::sycl::device selectedDevice { selector };
-      gridblasHandle =new sycl::queue (selectedDevice);
+      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
      gridblasInit=1;
    }
@@ -266,8 +267,46 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+    std::cerr << " Calling SYCL batched ZGEMM "<<std::endl;
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (ComplexD *) &alpha_p[0],
+						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexD *) &beta_p[0],
+						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
+    std::cerr << " Called SYCL batched ZGEMM "<<std::endl;
+      std::vector<ComplexD> A(m*k);  // pointer list to matrices
+      std::vector<ComplexD> B(k*n);
+      std::vector<ComplexD> C(m*n);
+      int sda = lda*k;
+      int sdb = ldb*k;
+      int sdc = ldc*n;
+      for (int p = 0; p < 1; ++p) {
+	acceleratorCopyFromDevice((void *)&Amk[p][0],(void *)&A[0],m*k*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)&Bkn[p][0],(void *)&B[0],k*n*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)&Cmn[p][0],(void *)&C[0],m*n*sizeof(ComplexD));
+	for (int mm = 0; mm < m; ++mm) {
+	  for (int nn = 0; nn < n; ++nn) {
+	    ComplexD c_mn(0.0);
+	    for (int kk = 0; kk < k; ++kk)
+	      c_mn += A[mm + kk*lda ] * B[kk + nn*ldb];
+	    std::cout << " beta "<<beta<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
+	  }
+	}
+      }
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Need a default/reference implementation
@@ -285,7 +324,6 @@ public:
      }
    }
 #endif
-    //    synchronise();
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@@ -366,8 +404,25 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (ComplexF *) &alpha_p[0],
+						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexF *) &beta_p[0],
+						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    int sda = lda*k;
@@ -467,8 +522,25 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (float *) &alpha_p[0],
+						  (const float **)&Amk[0], (const int64_t *)&lda64,
+						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
+						  (float *) &beta_p[0],
+						  (float **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    int sda = lda*k;
@@ -568,24 +640,25 @@ public:
    assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    /*
      int64_t m64=m;
      int64_t n64=n;
      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
      int64_t batchCount64=batchCount;
-      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-      onemkl::transpose::N,
-      onemkl::transpose::N,
-      &m64,&n64,&k64,
-      (double *) &alpha_p[0],
-      (double **)&Amk[0], lda,
-      (double **)&Bkn[0], ldb,
-      (double *) &beta_p[0],
-      (double **)&Cmn[0], ldc,
-      1,&batchCount64);
-     */
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      oneapi::mkl::transpose notransp =oneapi::mkl::transpose::N;
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &notransp,
+						  &notransp,
+						  &m64,&n64,&k64,
+						  (double *) &alpha_p[0],
+						  (const double **)&Amk[0], (const int64_t *)&lda64,
+						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
+						  (double *) &beta_p[0],
+						  (double **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
    int sda = lda*k;
@@ -673,6 +746,7 @@ public:
 						beta,
 						(ComplexD *)Cmn,ldc,sdc,
 						batchCount);
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
     // Need a default/reference implementation
@@ -499,6 +499,87 @@ namespace Grid {
      }
  };

+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, left preconditioned by Mee^inv
+  // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee^inv ( Mee - Meo Moo^inv Moe ) phi = Mee^inv eta
+  //
+  // Solve:
+  // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta
+  //
+  // Old notation e<->o
+  //
+  // Left precon by Moo^-1
+  //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o
+  //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix; // matrix type taken by every hook below
+
+    // Red-black Schur solver whose hooks all use SchurDiagOneOperator ("the usual normal equations
+    // Schur trick"). The constructor only forwards the supplied solver and the two guess flags,
+    // unchanged, to SchurRedBlackBase; both flags default to false.
+  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+      const bool _solnAsInitGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    { // Splits src into src_e / src_o, then overwrites src_o with the preconditioned odd source.
+      GridBase *grid = _Matrix.RedBlackGrid(); // grid on which the temporaries below are allocated
+      GridBase *fgrid= _Matrix.Grid(); // NOTE(review): fgrid is never used in this method
+
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = MpcDag * MooeeInv * (source_o - Moe MeeInv source_e); asserts check each parity
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); // tmp  = MooeeInv src_e
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      // Mtmp = Meooe tmp
+      Mtmp=src_o-Mtmp;                 // Mtmp = src_o - Mtmp
+      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);      // tmp  = MooeeInv Mtmp
+      
+      // Apply MpcDag of the same operator type (SchurDiagOneOperator) that RedBlackSolve hands to the solver
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    { // Rebuilds the even half from sol_o and src_e, then writes both halves into sol.
+      GridBase *grid = _Matrix.RedBlackGrid(); // grid on which the temporaries below are allocated
+      GridBase *fgrid= _Matrix.Grid(); // NOTE(review): fgrid is never used in this method
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o ); asserts check the parity of each intermediate
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even); // tmp   = Meooe sol_o
+      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even); // tmp   = src_e - tmp
+      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even); // sol_e = MooeeInv tmp
+     
+      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even); // even half into sol
+      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd ); // odd half into sol
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    { // Single-field solve: wraps _Matrix in SchurDiagOneOperator and invokes the stored solver.
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    { // Vector-of-fields overload: same operator, whole vectors passed to the stored solver in one call.
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv ) phi = ( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta