mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 12:04:33 +00:00 
			
		
		
		
	Compare commits
	
		
			41 Commits
		
	
	
		
			d299c86633
			...
			feature/ft
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | bffd30abec | ||
|  | da919949f9 | ||
|  | b12b4fdaff | ||
|  | 557fa483ff | ||
|  | fc15d55df6 | ||
|  | 53573d7d94 | ||
|  | bb3c177000 | ||
|  | a3322b470f | ||
|  | f8f408e7a9 | ||
|  | baac1127d0 | ||
|  | 6f1328160c | ||
|  | 04cf902791 | ||
|  | 7a5b1c1a19 | ||
|  | 18d2d7da4a | ||
|  | b461184797 | ||
|  | 4563b39305 | ||
|  | c9d5674d5b | ||
|  | 486412635a | ||
|  | 8b23a1546a | ||
|  | a901e4e369 | ||
|  | 804d9367d4 | ||
|  | 41d8adca95 | ||
|  | 059e8e5bb0 | ||
|  | b3ee8ded96 | ||
|  | cf3584ad15 | ||
|  | a66973163f | ||
|  | 4502a8c8a1 | ||
|  | 9c902e4c2d | ||
|  | f3eb36adcf | ||
|  | 7c246606c1 | ||
|  | 172c75029e | ||
|  | 6ae52da571 | ||
|  | 4ee9c68053 | ||
|  | a15b4378a3 | ||
|  | 89fdd7f8dd | ||
|  | c328be24b7 | ||
|  | a73dc6dbf4 | ||
|  | eee2a2657f | ||
|  | 12b8be7cb9 | ||
|  | 63c223ea5d | ||
|  | 2877fb4a2c | 
							
								
								
									
										1052
									
								
								BLAS_benchmark/BatchBlasBench.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1052
									
								
								BLAS_benchmark/BatchBlasBench.cc
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										2
									
								
								BLAS_benchmark/compile-command
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								BLAS_benchmark/compile-command
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  |  | ||||||
|  | mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench | ||||||
| @@ -30,9 +30,14 @@ directory | |||||||
|  |  | ||||||
| #include <type_traits> | #include <type_traits> | ||||||
| #include <cassert> | #include <cassert> | ||||||
|  | #include <exception> | ||||||
|  |  | ||||||
| #define NAMESPACE_BEGIN(A) namespace A { | #define NAMESPACE_BEGIN(A) namespace A { | ||||||
| #define NAMESPACE_END(A)   } | #define NAMESPACE_END(A)   } | ||||||
| #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) | #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) | ||||||
| #define GRID_NAMESPACE_END   NAMESPACE_END(Grid) | #define GRID_NAMESPACE_END   NAMESPACE_END(Grid) | ||||||
| #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  );  | #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  );  | ||||||
|  |  | ||||||
|  | #define EXCEPTION_CHECK_BEGIN(A) try { | ||||||
|  | #define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -89,9 +89,10 @@ public: | |||||||
|       gridblasHandle = theGridAccelerator; |       gridblasHandle = theGridAccelerator; | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_ONE_MKL | #ifdef GRID_ONE_MKL | ||||||
|       cl::sycl::cpu_selector selector; |       cl::sycl::gpu_selector selector; | ||||||
|       cl::sycl::device selectedDevice { selector }; |       cl::sycl::device selectedDevice { selector }; | ||||||
|       gridblasHandle =new sycl::queue (selectedDevice); |       cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; | ||||||
|  |       gridblasHandle =new sycl::queue (selectedDevice,q_prop); | ||||||
| #endif | #endif | ||||||
|       gridblasInit=1; |       gridblasInit=1; | ||||||
|     } |     } | ||||||
| @@ -207,6 +208,9 @@ public: | |||||||
|     assert(Bkn.size()==batchCount); |     assert(Bkn.size()==batchCount); | ||||||
|     assert(Cmn.size()==batchCount); |     assert(Cmn.size()==batchCount); | ||||||
|  |  | ||||||
|  |     assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose | ||||||
|  |     assert(OpB!=GridBLAS_OP_T); | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |     int lda = m; // m x k column major | ||||||
|     int ldb = k; // k x n column major |     int ldb = k; // k x n column major | ||||||
|     int ldc = m; // m x b column major |     int ldc = m; // m x b column major | ||||||
| @@ -266,26 +270,130 @@ public: | |||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |     assert(err==CUBLAS_STATUS_SUCCESS); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |       int64_t m64=m; | ||||||
| #warning "oneMKL implementation not built " |       int64_t n64=n; | ||||||
| #endif |       int64_t k64=k; | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) |       int64_t lda64=lda; | ||||||
|     // Need a default/reference implementation |       int64_t ldb64=ldb; | ||||||
|     int sda = lda*k; |       int64_t ldc64=ldc; | ||||||
|     int sdb = ldb*k; |       int64_t batchCount64=batchCount; | ||||||
|     int sdc = ldc*n; |  | ||||||
|     for (int p = 0; p < batchCount; ++p) { |       oneapi::mkl::transpose iOpA; | ||||||
|  |       oneapi::mkl::transpose iOpB; | ||||||
|  |        | ||||||
|  |       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||||
|  |       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||||
|  |  | ||||||
|  |       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||||
|  | 						  &iOpA, | ||||||
|  | 						  &iOpB, | ||||||
|  | 						  &m64,&n64,&k64, | ||||||
|  | 						  (ComplexD *) &alpha_p[0], | ||||||
|  | 						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64, | ||||||
|  | 						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64, | ||||||
|  | 						  (ComplexD *) &beta_p[0], | ||||||
|  | 						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64, | ||||||
|  | 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||||
|  |       synchronise(); | ||||||
|  | #if 0 | ||||||
|  |       // This code was used to check the mat mul on Sunspot/OneMKL | ||||||
|  |       std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl; | ||||||
|  |       std::vector<ComplexD> A(m*k);  // pointer list to matrices | ||||||
|  |       std::vector<ComplexD> B(k*n); | ||||||
|  |       std::vector<ComplexD> C(m*n); | ||||||
|  |       //      int sda = lda*k; | ||||||
|  |       //      int sdb = ldb*k; | ||||||
|  |       //      int sdc = ldc*n; | ||||||
|  |       std::cerr << " Checking the GEMM results "<<std::endl; | ||||||
|  |       for (int p = 0; p < 1; ++p) { | ||||||
|  | 	ComplexD * Amk_p;  // pointer list to matrices | ||||||
|  | 	ComplexD * Bkn_p;  // pointer list to matrices | ||||||
|  | 	ComplexD * Cmn_p;  // pointer list to matrices | ||||||
|  | 	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*)); | ||||||
|  | 	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*)); | ||||||
|  | 	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*)); | ||||||
|  | 	std::cerr << " p " << p << " copied pointers "<<std::endl; | ||||||
|  | 	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD)); | ||||||
|  | 	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD)); | ||||||
|  | 	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD)); | ||||||
|  | 	std::cerr << " p " << p << " copied matrices "<<std::endl; | ||||||
|  | 	std::cerr << " C[0] "<<C[0]<<std::endl; | ||||||
|  | 	std::cerr << " A[0] "<<A[0]<<std::endl; | ||||||
|  | 	std::cerr << " B[0] "<<B[0]<<std::endl; | ||||||
|  | 	std::cerr << " m "<<m<<std::endl; | ||||||
|  | 	std::cerr << " n "<<n<<std::endl; | ||||||
|  | 	std::cerr << " k "<<k<<std::endl; | ||||||
| 	for (int mm = 0; mm < m; ++mm) { | 	for (int mm = 0; mm < m; ++mm) { | ||||||
| 	  for (int nn = 0; nn < n; ++nn) { | 	  for (int nn = 0; nn < n; ++nn) { | ||||||
| 	    ComplexD c_mn(0.0); | 	    ComplexD c_mn(0.0); | ||||||
| 	  for (int kk = 0; kk < k; ++kk) | 	    for (int kk = 0; kk < k; ++kk) { | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | 	      int idx_a, idx_b; | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | 	      //    int lda = m; // m x k column major | ||||||
|  | 	      //    int ldb = k; // k x n column major | ||||||
|  | 	      //    int ldc = m; // m x b column major | ||||||
|  | 	      if(OpA!=GridBLAS_OP_N) { | ||||||
|  | 		idx_a =kk + mm*lda; | ||||||
|  | 	      } else { | ||||||
|  | 		idx_a =mm + kk*lda; | ||||||
|  | 	      } | ||||||
|  | 	      if(OpB!=GridBLAS_OP_N) { | ||||||
|  | 		idx_b =nn + kk*ldb; | ||||||
|  | 	      } else { | ||||||
|  | 		idx_b =kk + nn*ldb; | ||||||
|  | 	      } | ||||||
|  | 	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl; | ||||||
|  |  | ||||||
|  | 	      ComplexD Ac = A[idx_a]; | ||||||
|  | 	      ComplexD Bc = B[idx_b]; | ||||||
|  | 	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac); | ||||||
|  | 	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc); | ||||||
|  | 	       | ||||||
|  | 	      c_mn += Ac*Bc; | ||||||
|  | 	    } | ||||||
|  | 	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl; | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|     //    synchronise(); | #endif | ||||||
|  | #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||||
|  |     // Need a default/reference implementation; use Eigen | ||||||
|  |       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||||
|  |         }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; | ||||||
|  | 	  } ); | ||||||
|  |       } else {  | ||||||
|  | 	assert(0); | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|      RealD t1=usecond(); |      RealD t1=usecond(); | ||||||
|      RealD flops = 8.0*m*n*k*batchCount; |      RealD flops = 8.0*m*n*k*batchCount; | ||||||
|      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; |      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; | ||||||
| @@ -306,6 +414,9 @@ public: | |||||||
|     RealD t2=usecond(); |     RealD t2=usecond(); | ||||||
|     int32_t batchCount = Amk.size(); |     int32_t batchCount = Amk.size(); | ||||||
|  |  | ||||||
|  |     assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose | ||||||
|  |     assert(OpB!=GridBLAS_OP_T); | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |     int lda = m; // m x k column major | ||||||
|     int ldb = k; // k x n column major |     int ldb = k; // k x n column major | ||||||
|     int ldc = m; // m x b column major |     int ldc = m; // m x b column major | ||||||
| @@ -366,25 +477,68 @@ public: | |||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |     assert(err==CUBLAS_STATUS_SUCCESS); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |       int64_t m64=m; | ||||||
| #warning "oneMKL implementation not built " |       int64_t n64=n; | ||||||
|  |       int64_t k64=k; | ||||||
|  |       int64_t lda64=lda; | ||||||
|  |       int64_t ldb64=ldb; | ||||||
|  |       int64_t ldc64=ldc; | ||||||
|  |       int64_t batchCount64=batchCount; | ||||||
|  |  | ||||||
|  |       oneapi::mkl::transpose iOpA; | ||||||
|  |       oneapi::mkl::transpose iOpB; | ||||||
|  |        | ||||||
|  |       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||||
|  |       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||||
|  |  | ||||||
|  |       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||||
|  | 						  &iOpA, | ||||||
|  | 						  &iOpB, | ||||||
|  | 						  &m64,&n64,&k64, | ||||||
|  | 						  (ComplexF *) &alpha_p[0], | ||||||
|  | 						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64, | ||||||
|  | 						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, | ||||||
|  | 						  (ComplexF *) &beta_p[0], | ||||||
|  | 						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, | ||||||
|  | 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||||
|  |     synchronise(); | ||||||
| #endif | #endif | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||||
|     int sda = lda*k; |     // Need a default/reference implementation; use Eigen | ||||||
|     int sdb = ldb*k; |       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||||
|     int sdc = ldc*n; | 	thread_for (p, batchCount, { | ||||||
|     ComplexF alphaf(real(alpha),imag(alpha)); | 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); | ||||||
|     ComplexF betaf(real(beta),imag(beta)); | 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); | ||||||
|     // Need a default/reference implementation | 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||||
|     for (int p = 0; p < batchCount; ++p) { | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||||
|       for (int mm = 0; mm < m; ++mm) { | 	  }); | ||||||
| 	for (int nn = 0; nn < n; ++nn) { |       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { | ||||||
| 	  ComplexF c_mn(0.0); | 	thread_for (p, batchCount, { | ||||||
| 	  for (int kk = 0; kk < k; ++kk) | 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; | 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||||
| 	} | 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; | ||||||
|       } | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; | ||||||
|  | 	  } ); | ||||||
|  |       } else {  | ||||||
|  | 	assert(0); | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|      RealD t1=usecond(); |      RealD t1=usecond(); | ||||||
| @@ -408,6 +562,9 @@ public: | |||||||
|     RealD t2=usecond(); |     RealD t2=usecond(); | ||||||
|     int32_t batchCount = Amk.size(); |     int32_t batchCount = Amk.size(); | ||||||
|  |  | ||||||
|  |     assert(OpA!=GridBLAS_OP_C); // Real case no conjugate | ||||||
|  |     assert(OpB!=GridBLAS_OP_C); | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |     int lda = m; // m x k column major | ||||||
|     int ldb = k; // k x n column major |     int ldb = k; // k x n column major | ||||||
|     int ldc = m; // m x b column major |     int ldc = m; // m x b column major | ||||||
| @@ -467,23 +624,68 @@ public: | |||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |     assert(err==CUBLAS_STATUS_SUCCESS); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |       int64_t m64=m; | ||||||
| #warning "oneMKL implementation not built " |       int64_t n64=n; | ||||||
|  |       int64_t k64=k; | ||||||
|  |       int64_t lda64=lda; | ||||||
|  |       int64_t ldb64=ldb; | ||||||
|  |       int64_t ldc64=ldc; | ||||||
|  |       int64_t batchCount64=batchCount; | ||||||
|  |  | ||||||
|  |       oneapi::mkl::transpose iOpA; | ||||||
|  |       oneapi::mkl::transpose iOpB; | ||||||
|  |        | ||||||
|  |       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||||
|  |       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||||
|  |  | ||||||
|  |       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||||
|  | 						  &iOpA, | ||||||
|  | 						  &iOpB, | ||||||
|  | 						  &m64,&n64,&k64, | ||||||
|  | 						  (float *) &alpha_p[0], | ||||||
|  | 						  (const float **)&Amk[0], (const int64_t *)&lda64, | ||||||
|  | 						  (const float **)&Bkn[0], (const int64_t *)&ldb64, | ||||||
|  | 						  (float *) &beta_p[0], | ||||||
|  | 						  (float **)&Cmn[0], (const int64_t *)&ldc64, | ||||||
|  | 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||||
|  |       synchronise(); | ||||||
| #endif | #endif | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||||
|     int sda = lda*k; |     // Need a default/reference implementation; use Eigen | ||||||
|     int sdb = ldb*k; |       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||||
|     int sdc = ldc*n; | 	thread_for (p, batchCount, { | ||||||
|     // Need a default/reference implementation | 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); | ||||||
|     for (int p = 0; p < batchCount; ++p) { | 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); | ||||||
|       for (int mm = 0; mm < m; ++mm) { | 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||||
| 	for (int nn = 0; nn < n; ++nn) { | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||||
| 	  RealD c_mn(0.0); | 	  }); | ||||||
| 	  for (int kk = 0; kk < k; ++kk) |       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | 	thread_for (p, batchCount, { | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); | ||||||
| 	} | 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); | ||||||
|       } | 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; | ||||||
|  | 	  } ); | ||||||
|  |       } else {  | ||||||
|  | 	assert(0); | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|      RealD t1=usecond(); |      RealD t1=usecond(); | ||||||
| @@ -495,7 +697,6 @@ public: | |||||||
|   /////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////// | ||||||
|   // Double precision real GEMM |   // Double precision real GEMM | ||||||
|   /////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|   void gemmBatched(GridBLASOperation_t OpA, |   void gemmBatched(GridBLASOperation_t OpA, | ||||||
| 		   GridBLASOperation_t OpB, | 		   GridBLASOperation_t OpB, | ||||||
| 		   int m,int n, int k, | 		   int m,int n, int k, | ||||||
| @@ -508,6 +709,9 @@ public: | |||||||
|     RealD t2=usecond(); |     RealD t2=usecond(); | ||||||
|     int32_t batchCount = Amk.size(); |     int32_t batchCount = Amk.size(); | ||||||
|  |  | ||||||
|  |     assert(OpA!=GridBLAS_OP_C); // Real case no conjugate | ||||||
|  |     assert(OpB!=GridBLAS_OP_C); | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |     int lda = m; // m x k column major | ||||||
|     int ldb = k; // k x n column major |     int ldb = k; // k x n column major | ||||||
|     int ldc = m; // m x b column major |     int ldc = m; // m x b column major | ||||||
| @@ -568,39 +772,68 @@ public: | |||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |     assert(err==CUBLAS_STATUS_SUCCESS); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|     /* |  | ||||||
|       int64_t m64=m; |       int64_t m64=m; | ||||||
|       int64_t n64=n; |       int64_t n64=n; | ||||||
|       int64_t k64=k; |       int64_t k64=k; | ||||||
|  |       int64_t lda64=lda; | ||||||
|  |       int64_t ldb64=ldb; | ||||||
|  |       int64_t ldc64=ldc; | ||||||
|       int64_t batchCount64=batchCount; |       int64_t batchCount64=batchCount; | ||||||
|       oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, |  | ||||||
|       onemkl::transpose::N, |       oneapi::mkl::transpose iOpA; | ||||||
|       onemkl::transpose::N, |       oneapi::mkl::transpose iOpB; | ||||||
|  |        | ||||||
|  |       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||||
|  |       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||||
|  |       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||||
|  |       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||||
|  |  | ||||||
|  |       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||||
|  | 						  &iOpA, | ||||||
|  | 						  &iOpB, | ||||||
| 						  &m64,&n64,&k64, | 						  &m64,&n64,&k64, | ||||||
| 						  (double *) &alpha_p[0], | 						  (double *) &alpha_p[0], | ||||||
|       (double **)&Amk[0], lda, | 						  (const double **)&Amk[0], (const int64_t *)&lda64, | ||||||
|       (double **)&Bkn[0], ldb, | 						  (const double **)&Bkn[0], (const int64_t *)&ldb64, | ||||||
| 						  (double *) &beta_p[0], | 						  (double *) &beta_p[0], | ||||||
|       (double **)&Cmn[0], ldc, | 						  (double **)&Cmn[0], (const int64_t *)&ldc64, | ||||||
|       1,&batchCount64); | 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||||
|      */ |       synchronise(); | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |  | ||||||
| #warning "oneMKL implementation not built " |  | ||||||
| #endif | #endif | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||||
|     int sda = lda*k; |     // Need a default/reference implementation; use Eigen | ||||||
|     int sdb = ldb*k; |       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||||
|     int sdc = ldc*n; | 	thread_for (p, batchCount, { | ||||||
|     // Need a default/reference implementation | 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); | ||||||
|     for (int p = 0; p < batchCount; ++p) { | 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); | ||||||
|       for (int mm = 0; mm < m; ++mm) { | 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||||
| 	for (int nn = 0; nn < n; ++nn) { | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||||
| 	  RealD c_mn(0.0); | 	  }); | ||||||
| 	  for (int kk = 0; kk < k; ++kk) |       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | 	thread_for (p, batchCount, { | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); | ||||||
| 	} | 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); | ||||||
|       } | 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; | ||||||
|  | 	  }); | ||||||
|  |       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { | ||||||
|  | 	thread_for (p, batchCount, { | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); | ||||||
|  | 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||||
|  | 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; | ||||||
|  | 	  }); | ||||||
|  |       } else {  | ||||||
|  | 	assert(0); | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|      RealD t1=usecond(); |      RealD t1=usecond(); | ||||||
| @@ -608,120 +841,55 @@ public: | |||||||
|      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; |      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   template<class CComplex> | ||||||
|    |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Strided case used by benchmark, but generally unused in Grid |  | ||||||
|   // Keep a code example in double complex, but don't generate the single and real variants for now |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|    |  | ||||||
|   void gemmStridedBatched(int m,int n, int k, |  | ||||||
| 			  ComplexD alpha, |  | ||||||
| 			  ComplexD* Amk,  // pointer list to matrices |  | ||||||
| 			  ComplexD* Bkn, |  | ||||||
| 			  ComplexD beta, |  | ||||||
| 			  ComplexD* Cmn, |  | ||||||
| 			  int batchCount) |  | ||||||
|   { |  | ||||||
|     // Use C-row major storage, so transpose calls |  | ||||||
|     int lda = m; // m x k column major |  | ||||||
|     int ldb = k; // k x n column major |  | ||||||
|     int ldc = m; // m x b column major |  | ||||||
|     int sda = m*k; |  | ||||||
|     int sdb = k*n; |  | ||||||
|     int sdc = m*n; |  | ||||||
|     deviceVector<ComplexD> alpha_p(1); |  | ||||||
|     deviceVector<ComplexD> beta_p(1); |  | ||||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); |  | ||||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); |  | ||||||
|     //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl; |  | ||||||
|     //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl; |  | ||||||
|     //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl; |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     auto err = hipblasZgemmStridedBatched(gridblasHandle, |  | ||||||
| 					  HIPBLAS_OP_N, |  | ||||||
| 					  HIPBLAS_OP_N, |  | ||||||
| 					  m,n,k, |  | ||||||
| 					  (hipblasDoubleComplex *) &alpha_p[0], |  | ||||||
| 					  (hipblasDoubleComplex *) Amk, lda, sda, |  | ||||||
| 					  (hipblasDoubleComplex *) Bkn, ldb, sdb, |  | ||||||
| 					  (hipblasDoubleComplex *) &beta_p[0], |  | ||||||
| 					  (hipblasDoubleComplex *) Cmn, ldc, sdc, |  | ||||||
| 					  batchCount); |  | ||||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     cublasZgemmStridedBatched(gridblasHandle, |  | ||||||
| 			      CUBLAS_OP_N, |  | ||||||
| 			      CUBLAS_OP_N, |  | ||||||
| 			      m,n,k, |  | ||||||
| 			      (cuDoubleComplex *) &alpha_p[0], |  | ||||||
| 			      (cuDoubleComplex *) Amk, lda, sda, |  | ||||||
| 			      (cuDoubleComplex *) Bkn, ldb, sdb, |  | ||||||
| 			      (cuDoubleComplex *) &beta_p[0], |  | ||||||
| 			      (cuDoubleComplex *) Cmn, ldc, sdc, |  | ||||||
| 			      batchCount); |  | ||||||
| #endif |  | ||||||
| #if defined(GRID_SYCL) || defined(GRID_ONE_MKL) |  | ||||||
|     oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, |  | ||||||
| 						oneapi::mkl::transpose::N, |  | ||||||
| 						oneapi::mkl::transpose::N, |  | ||||||
| 						m,n,k, |  | ||||||
| 						alpha, |  | ||||||
| 						(const ComplexD *)Amk,lda,sda, |  | ||||||
| 						(const ComplexD *)Bkn,ldb,sdb, |  | ||||||
| 						beta, |  | ||||||
| 						(ComplexD *)Cmn,ldc,sdc, |  | ||||||
| 						batchCount); |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) |  | ||||||
|      // Need a default/reference implementation |  | ||||||
|      for (int p = 0; p < batchCount; ++p) { |  | ||||||
|        for (int mm = 0; mm < m; ++mm) { |  | ||||||
| 	 for (int nn = 0; nn < n; ++nn) { |  | ||||||
| 	   ComplexD c_mn(0.0); |  | ||||||
| 	   for (int kk = 0; kk < k; ++kk) |  | ||||||
| 	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; |  | ||||||
| 	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc]; |  | ||||||
| 	 } |  | ||||||
|        } |  | ||||||
|      } |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   double benchmark(int M, int N, int K, int BATCH) |   double benchmark(int M, int N, int K, int BATCH) | ||||||
|   { |   { | ||||||
|     int32_t N_A = M*K*BATCH; |     int32_t N_A = M*K*BATCH; | ||||||
|     int32_t N_B = K*N*BATCH; |     int32_t N_B = K*N*BATCH; | ||||||
|     int32_t N_C = M*N*BATCH; |     int32_t N_C = M*N*BATCH; | ||||||
|     deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); |     deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex)); | ||||||
|     deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); |     deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex)); | ||||||
|     deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); |     deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex)); | ||||||
|     ComplexD alpha(1.0); |     CComplex alpha(1.0); | ||||||
|     ComplexD beta (1.0); |     CComplex beta (1.0); | ||||||
|     RealD flops = 8.0*M*N*K*BATCH; |     RealD flops = 8.0*M*N*K*BATCH; | ||||||
|     int ncall=10; |     int ncall=1000; | ||||||
|  |     deviceVector<CComplex *> As(BATCH); | ||||||
|  |     deviceVector<CComplex *> Bs(BATCH); | ||||||
|  |     deviceVector<CComplex *> Cs(BATCH); | ||||||
|  |     for(int b = 0 ; b < BATCH;b++) { | ||||||
|  |       CComplex *ptr; | ||||||
|  |       ptr = &A[b*M*K];      acceleratorPut(As[b],ptr); | ||||||
|  |       ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr); | ||||||
|  |       ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Warm up call | ||||||
|  |     gemmBatched(M,N,K, | ||||||
|  | 		alpha, | ||||||
|  | 		As, // m x k  | ||||||
|  | 		Bs, // k x n | ||||||
|  | 		beta,  | ||||||
|  | 		Cs); | ||||||
|  |     synchronise(); | ||||||
|  |  | ||||||
|     RealD t0 = usecond(); |     RealD t0 = usecond(); | ||||||
|     for(int i=0;i<ncall;i++){ |     for(int i=0;i<ncall;i++){ | ||||||
|       gemmStridedBatched(M,N,K, |       gemmBatched(M,N,K, | ||||||
| 		  alpha, | 		  alpha, | ||||||
| 			 &A[0], // m x k  | 		  As, // m x k  | ||||||
| 			 &B[0], // k x n | 		  Bs, // k x n | ||||||
| 		  beta,  | 		  beta,  | ||||||
| 			 &C[0], // m x n | 		  Cs); | ||||||
| 			 BATCH); |  | ||||||
|     } |  | ||||||
|       synchronise(); |       synchronise(); | ||||||
|  |     } | ||||||
|     RealD t1 = usecond(); |     RealD t1 = usecond(); | ||||||
|     RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH; |     RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH; | ||||||
|     flops = 8.0*M*N*K*BATCH*ncall; |     flops = 8.0*M*N*K*BATCH*ncall; | ||||||
|     flops = flops/(t1-t0)/1.e3; |     flops = flops/(t1-t0)/1.e3; | ||||||
|     return flops; // Returns gigaflops |     return flops; // Returns gigaflops | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -279,11 +279,11 @@ public: | |||||||
|       Qt = Eigen::MatrixXcd::Identity(Nm,Nm); |       Qt = Eigen::MatrixXcd::Identity(Nm,Nm); | ||||||
|       diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); |       diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); | ||||||
|       _sort.push(eval2,Nm); |       _sort.push(eval2,Nm); | ||||||
|       //      Glog << "#Ritz value before shift: "<< std::endl; |       Glog << "#Ritz value before shift: "<< std::endl; | ||||||
|       for(int i=0; i<Nm; ++i){ |       for(int i=0; i<Nm; ++i){ | ||||||
| 	//        std::cout.precision(13); | 	std::cout.precision(13); | ||||||
| 	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | 	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||||
| 	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | 	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||||
|       } |       } | ||||||
|        |        | ||||||
|       //---------------------------------------------------------------------- |       //---------------------------------------------------------------------- | ||||||
| @@ -298,6 +298,7 @@ public: | |||||||
|         unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM); |         unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM); | ||||||
|  |  | ||||||
|         for(int ip=Nk; ip<Nm; ++ip){ |         for(int ip=Nk; ip<Nm; ++ip){ | ||||||
|  | 	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl; | ||||||
|           shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q); |           shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q); | ||||||
|         } |         } | ||||||
|          |          | ||||||
| @@ -325,7 +326,7 @@ public: | |||||||
|         Qt = Eigen::MatrixXcd::Identity(Nm,Nm); |         Qt = Eigen::MatrixXcd::Identity(Nm,Nm); | ||||||
|         diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); |         diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); | ||||||
|         _sort.push(eval2,Nk); |         _sort.push(eval2,Nk); | ||||||
| 	//        Glog << "#Ritz value after shift: "<< std::endl; | 	Glog << "#Ritz value after shift: "<< std::endl; | ||||||
|         for(int i=0; i<Nk; ++i){ |         for(int i=0; i<Nk; ++i){ | ||||||
| 	  //          std::cout.precision(13); | 	  //          std::cout.precision(13); | ||||||
| 	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | 	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||||
| @@ -467,10 +468,10 @@ public: | |||||||
|    |    | ||||||
|     // set initial vector |     // set initial vector | ||||||
|     for (int i=0; i<Nu; ++i) { |     for (int i=0; i<Nu; ++i) { | ||||||
|       //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; |       Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; | ||||||
|       evec[i] = src[i]; |       evec[i] = src[i]; | ||||||
|       orthogonalize(evec[i],evec,i); |       orthogonalize(evec[i],evec,i); | ||||||
|       //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; |       Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; | ||||||
|     } |     } | ||||||
| //    exit(-43); | //    exit(-43); | ||||||
|      |      | ||||||
| @@ -506,11 +507,11 @@ public: | |||||||
|       Qt = Eigen::MatrixXcd::Identity(Nr,Nr); |       Qt = Eigen::MatrixXcd::Identity(Nr,Nr); | ||||||
|       diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid); |       diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid); | ||||||
|       _sort.push(eval2,Nr); |       _sort.push(eval2,Nr); | ||||||
|       //      Glog << "#Ritz value: "<< std::endl; |       Glog << "#Ritz value: "<< std::endl; | ||||||
|       for(int i=0; i<Nr; ++i){ |       for(int i=0; i<Nr; ++i){ | ||||||
| 	//        std::cout.precision(13); | 	std::cout.precision(13); | ||||||
| 	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | 	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||||
| 	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | 	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||||
|       } |       } | ||||||
|        |        | ||||||
|       // Convergence test |       // Convergence test | ||||||
| @@ -570,6 +571,7 @@ public: | |||||||
|       Glog << fname + " NOT converged ; Summary :\n"; |       Glog << fname + " NOT converged ; Summary :\n"; | ||||||
|     } else { |     } else { | ||||||
|       Glog << fname + " CONVERGED ; Summary :\n"; |       Glog << fname + " CONVERGED ; Summary :\n"; | ||||||
|  |       Nstop = Nconv_guess; // Just take them all | ||||||
|       // Sort convered eigenpairs. |       // Sort convered eigenpairs. | ||||||
|       std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating |       std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating | ||||||
|  |  | ||||||
| @@ -642,7 +644,7 @@ private: | |||||||
|       //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; |       //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; | ||||||
|       k_start +=mrhs; |       k_start +=mrhs; | ||||||
|     } |     } | ||||||
|     //    Glog << "LinAlg "<< std::endl; |     Glog << "LinAlg "<< std::endl; | ||||||
|      |      | ||||||
|     if (b>0) { |     if (b>0) { | ||||||
|       for (int u=0; u<Nu; ++u) { |       for (int u=0; u<Nu; ++u) { | ||||||
| @@ -676,7 +678,7 @@ private: | |||||||
|       } |       } | ||||||
|       w_copy[u] = w[u]; |       w_copy[u] = w[u]; | ||||||
|     } |     } | ||||||
|     //    Glog << "LinAlg done"<< std::endl; |     Glog << "LinAlg done"<< std::endl; | ||||||
|      |      | ||||||
|     // In block version, the steps 6 and 7 in Lanczos construction is |     // In block version, the steps 6 and 7 in Lanczos construction is | ||||||
|     // replaced by the QR decomposition of new basis block. |     // replaced by the QR decomposition of new basis block. | ||||||
| @@ -689,15 +691,15 @@ private: | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     // re-orthogonalization for numerical stability |     // re-orthogonalization for numerical stability | ||||||
|     //    Glog << "Gram Schmidt"<< std::endl; |     Glog << "Gram Schmidt"<< std::endl; | ||||||
|     orthogonalize(w,Nu,evec,R); |     orthogonalize(w,Nu,evec,R); | ||||||
|     // QR part |     // QR part | ||||||
|     for (int u=1; u<Nu; ++u) { |     for (int u=1; u<Nu; ++u) { | ||||||
|       orthogonalize(w[u],w,u); |       orthogonalize(w[u],w,u); | ||||||
|     } |     } | ||||||
|     //    Glog << "Gram Schmidt done "<< std::endl; |     Glog << "Gram Schmidt done "<< std::endl; | ||||||
|      |      | ||||||
|     //    Glog << "LinAlg "<< std::endl; |     Glog << "LinAlg "<< std::endl; | ||||||
|     for (int u=0; u<Nu; ++u) { |     for (int u=0; u<Nu; ++u) { | ||||||
|       //for (int v=0; v<Nu; ++v) { |       //for (int v=0; v<Nu; ++v) { | ||||||
|       for (int v=u; v<Nu; ++v) { |       for (int v=u; v<Nu; ++v) { | ||||||
| @@ -714,7 +716,7 @@ private: | |||||||
| 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; | 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     //    Glog << "LinAlg done "<< std::endl; |     Glog << "LinAlg done "<< std::endl; | ||||||
|  |  | ||||||
|     if (b < Nm/Nu-1) { |     if (b < Nm/Nu-1) { | ||||||
|       for (int u=0; u<Nu; ++u) { |       for (int u=0; u<Nu; ++u) { | ||||||
| @@ -933,7 +935,7 @@ if (1){ | |||||||
|          int Nu, int Nb, int Nk, int Nm, |          int Nu, int Nb, int Nk, int Nm, | ||||||
|          Eigen::MatrixXcd& M) |          Eigen::MatrixXcd& M) | ||||||
|   { |   { | ||||||
|     //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';  |     Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';  | ||||||
|     assert( Nk%Nu == 0 && Nm%Nu == 0 ); |     assert( Nk%Nu == 0 && Nm%Nu == 0 ); | ||||||
|     assert( Nk <= Nm ); |     assert( Nk <= Nm ); | ||||||
|     M = Eigen::MatrixXcd::Zero(Nk,Nk); |     M = Eigen::MatrixXcd::Zero(Nk,Nk); | ||||||
| @@ -951,7 +953,7 @@ if (1){ | |||||||
|         M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; |         M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;  |     Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;  | ||||||
|   } |   } | ||||||
|   |   | ||||||
|  |  | ||||||
| @@ -961,7 +963,7 @@ if (1){ | |||||||
|          int Nu, int Nb, int Nk, int Nm, |          int Nu, int Nb, int Nk, int Nm, | ||||||
|          Eigen::MatrixXcd& M) |          Eigen::MatrixXcd& M) | ||||||
|   { |   { | ||||||
|     //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';  |     Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';  | ||||||
|     assert( Nk%Nu == 0 && Nm%Nu == 0 ); |     assert( Nk%Nu == 0 && Nm%Nu == 0 ); | ||||||
|     assert( Nk <= Nm ); |     assert( Nk <= Nm ); | ||||||
|      |      | ||||||
| @@ -977,7 +979,7 @@ if (1){ | |||||||
|         lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); |         lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;  |     Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -986,7 +988,7 @@ if (1){ | |||||||
| 		            RealD Dsh, | 		            RealD Dsh, | ||||||
| 		            Eigen::MatrixXcd& Qprod) | 		            Eigen::MatrixXcd& Qprod) | ||||||
|   { |   { | ||||||
|     //Glog << "shiftedQRDecompEigen() begin" << '\n';  |     Glog << "shiftedQRDecompEigen() begin" << '\n';  | ||||||
|     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); |     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||||
|     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); |     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||||
|     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); |     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||||
| @@ -1002,6 +1004,7 @@ if (1){ | |||||||
|                         // lower triangular part used to represent series |                         // lower triangular part used to represent series | ||||||
|                         // of Q sequence. |                         // of Q sequence. | ||||||
|  |  | ||||||
|  |     Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';  | ||||||
|     // equivalent operation of Qprod *= Q |     // equivalent operation of Qprod *= Q | ||||||
|     //M = Eigen::MatrixXcd::Zero(Nm,Nm); |     //M = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||||
|      |      | ||||||
| @@ -1022,6 +1025,7 @@ if (1){ | |||||||
|      |      | ||||||
|     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); |     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||||
|  |  | ||||||
|  |     Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';  | ||||||
|     for (int i=0; i<Nm; ++i) { |     for (int i=0; i<Nm; ++i) { | ||||||
|       for (int j=0; j<Nm-(Nu+1); ++j) { |       for (int j=0; j<Nm-(Nu+1); ++j) { | ||||||
|         for (int k=0; k<Nu+1+j; ++k) { |         for (int k=0; k<Nu+1+j; ++k) { | ||||||
| @@ -1029,6 +1033,7 @@ if (1){ | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';  | ||||||
|     for (int i=0; i<Nm; ++i) { |     for (int i=0; i<Nm; ++i) { | ||||||
|       for (int j=Nm-(Nu+1); j<Nm; ++j) { |       for (int j=Nm-(Nu+1); j<Nm; ++j) { | ||||||
|         for (int k=0; k<Nm; ++k) { |         for (int k=0; k<Nm; ++k) { | ||||||
| @@ -1036,6 +1041,7 @@ if (1){ | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';  | ||||||
|      |      | ||||||
|     //static int ntimes = 2; |     //static int ntimes = 2; | ||||||
|     //for (int j=0; j<Nm-(ntimes*Nu); ++j) { |     //for (int j=0; j<Nm-(ntimes*Nu); ++j) { | ||||||
| @@ -1061,11 +1067,13 @@ if (1){ | |||||||
|         Mtmp(j,i) = conj(Mtmp(i,j)); |         Mtmp(j,i) = conj(Mtmp(i,j)); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';  | ||||||
|  |  | ||||||
|     for (int i=0; i<Nm; ++i) { |     for (int i=0; i<Nm; ++i) { | ||||||
|       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; |       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; | ||||||
|     } |     } | ||||||
|      |      | ||||||
|  |     Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';  | ||||||
|     M = Mtmp; |     M = Mtmp; | ||||||
|  |  | ||||||
|     //M = Q.adjoint()*(M*Q); |     //M = Q.adjoint()*(M*Q); | ||||||
| @@ -1077,7 +1085,7 @@ if (1){ | |||||||
|     //  } |     //  } | ||||||
|     //} |     //} | ||||||
|      |      | ||||||
|     //Glog << "shiftedQRDecompEigen() end" << endl;  |     Glog << "shiftedQRDecompEigen() end" <<std::endl;  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void exampleQRDecompEigen(void) |   void exampleQRDecompEigen(void) | ||||||
|   | |||||||
| @@ -499,6 +499,87 @@ namespace Grid { | |||||||
|       } |       } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Site diagonal is identity, left preconditioned by Mee^inv | ||||||
|  |   // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta | ||||||
|  |   // | ||||||
|  |   // Solve: | ||||||
|  |   // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta | ||||||
|  |   // | ||||||
|  |   // Old notation e<->o | ||||||
|  |   // | ||||||
|  |   // Left precon by Moo^-1 | ||||||
|  |   //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o | ||||||
|  |   //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e) | ||||||
|  |   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> { | ||||||
|  |   public: | ||||||
|  |     typedef CheckerBoardedSparseMatrixBase<Field> Matrix; | ||||||
|  |  | ||||||
|  |     ///////////////////////////////////////////////////// | ||||||
|  |     // Wrap the usual normal equations Schur trick | ||||||
|  |     ///////////////////////////////////////////////////// | ||||||
|  |   SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false, | ||||||
|  |       const bool _solnAsInitGuess = false)   | ||||||
|  |     : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}; | ||||||
|  |  | ||||||
|  |     virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) | ||||||
|  |     { | ||||||
|  |       GridBase *grid = _Matrix.RedBlackGrid(); | ||||||
|  |       GridBase *fgrid= _Matrix.Grid(); | ||||||
|  |  | ||||||
|  |       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||||
|  |        | ||||||
|  |       Field   tmp(grid); | ||||||
|  |       Field  Mtmp(grid); | ||||||
|  |  | ||||||
|  |       pickCheckerboard(Even,src_e,src); | ||||||
|  |       pickCheckerboard(Odd ,src_o,src); | ||||||
|  |      | ||||||
|  |       ///////////////////////////////////////////////////// | ||||||
|  |       // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) | ||||||
|  |       ///////////////////////////////////////////////////// | ||||||
|  |       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); | ||||||
|  |       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      | ||||||
|  |       Mtmp=src_o-Mtmp;                  | ||||||
|  |       _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);      | ||||||
|  |        | ||||||
|  |       // get the right MpcDag | ||||||
|  |       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);        | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) | ||||||
|  |     { | ||||||
|  |       GridBase *grid = _Matrix.RedBlackGrid(); | ||||||
|  |       GridBase *fgrid= _Matrix.Grid(); | ||||||
|  |  | ||||||
|  |       Field   tmp(grid); | ||||||
|  |       Field   sol_e(grid); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       /////////////////////////////////////////////////// | ||||||
|  |       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||||
|  |       /////////////////////////////////////////////////// | ||||||
|  |       _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even); | ||||||
|  |       tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even); | ||||||
|  |       _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even); | ||||||
|  |       | ||||||
|  |       setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even); | ||||||
|  |       setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd ); | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) | ||||||
|  |     { | ||||||
|  |       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||||
|  |       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); | ||||||
|  |     }; | ||||||
|  |     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) | ||||||
|  |     { | ||||||
|  |       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||||
|  |       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Site diagonal is identity, right preconditioned by Mee^inv |   // Site diagonal is identity, right preconditioned by Mee^inv | ||||||
|   // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta |   // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta | ||||||
|   | |||||||
| @@ -54,6 +54,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
| @@ -100,6 +103,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
| @@ -145,6 +151,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid); | |||||||
| uint64_t total_shared; | uint64_t total_shared; | ||||||
| uint64_t total_device; | uint64_t total_device; | ||||||
| uint64_t total_host;; | uint64_t total_host;; | ||||||
|  |  | ||||||
|  | #if defined(__has_feature) | ||||||
|  | #if __has_feature(leak_sanitizer) | ||||||
|  | #define ASAN_LEAK_CHECK | ||||||
|  | #endif | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #ifdef ASAN_LEAK_CHECK | ||||||
|  | #include <sanitizer/asan_interface.h> | ||||||
|  | #include <sanitizer/common_interface_defs.h> | ||||||
|  | #include <sanitizer/lsan_interface.h> | ||||||
|  | #define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } | ||||||
|  | #else | ||||||
|  | #define LEAK_CHECK(A) { } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | void MemoryManager::DisplayMallinfo(void) | ||||||
|  | { | ||||||
|  | #ifdef __linux__ | ||||||
|  |   struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform | ||||||
|  |    | ||||||
|  |   mi = mallinfo(); | ||||||
|  |  | ||||||
|  |   std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl; | ||||||
|  | #endif | ||||||
|  |   LEAK_CHECK(); | ||||||
|  |   | ||||||
|  | } | ||||||
|  |  | ||||||
| void MemoryManager::PrintBytes(void) | void MemoryManager::PrintBytes(void) | ||||||
| { | { | ||||||
|   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; |   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; | ||||||
| @@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void) | |||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
|   cuda_mem(); |   cuda_mem(); | ||||||
| #endif | #endif | ||||||
|    |   DisplayMallinfo(); | ||||||
| } | } | ||||||
|  |  | ||||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||||
|   | |||||||
| @@ -211,6 +211,7 @@ private: | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  public: |  public: | ||||||
|  |   static void DisplayMallinfo(void); | ||||||
|   static void NotifyDeletion(void * CpuPtr); |   static void NotifyDeletion(void * CpuPtr); | ||||||
|   static void Print(void); |   static void Print(void); | ||||||
|   static void PrintAll(void); |   static void PrintAll(void); | ||||||
|   | |||||||
| @@ -91,6 +91,7 @@ public: | |||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   virtual int CheckerBoarded(int dim)=0; |   virtual int CheckerBoarded(int dim)=0; | ||||||
|   virtual int CheckerBoard(const Coordinate &site)=0; |   virtual int CheckerBoard(const Coordinate &site)=0; | ||||||
|  |   virtual int CheckerDim(void){ return 0; }; | ||||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; |   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; |   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; |   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||||
|   | |||||||
| @@ -60,6 +60,7 @@ public: | |||||||
|   int              _checker_dim; |   int              _checker_dim; | ||||||
|   std::vector<int> _checker_board; |   std::vector<int> _checker_board; | ||||||
|  |  | ||||||
|  |   virtual int CheckerDim(void){ return _checker_dim; }; | ||||||
|   virtual int CheckerBoarded(int dim){ |   virtual int CheckerBoarded(int dim){ | ||||||
|     if( dim==_checker_dim) return 1; |     if( dim==_checker_dim) return 1; | ||||||
|     else return 0; |     else return 0; | ||||||
|   | |||||||
| @@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   const uint64_t sites = grid->oSites(); |   const uint64_t sites = grid->oSites(); | ||||||
|    |    | ||||||
|   // Might make all code paths go this way. |   // Might make all code paths go this way. | ||||||
| #if 0 |  | ||||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; |  | ||||||
|   Vector<inner_t> inner_tmp(sites); |  | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |  | ||||||
|   { |  | ||||||
|     autoView( left_v , left, AcceleratorRead); |  | ||||||
|     autoView( right_v,right, AcceleratorRead); |  | ||||||
|     // This code could read coalesce |  | ||||||
|     // GPU - SIMT lane compliance... |  | ||||||
|     accelerator_for( ss, sites, nsimd,{ |  | ||||||
| 	auto x_l = left_v(ss); |  | ||||||
| 	auto y_l = right_v(ss); |  | ||||||
| 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); |  | ||||||
|     }); |  | ||||||
|   } |  | ||||||
| #else |  | ||||||
|   typedef decltype(innerProduct(vobj(),vobj())) inner_t; |   typedef decltype(innerProduct(vobj(),vobj())) inner_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|      |      | ||||||
|   { |   { | ||||||
| @@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
| 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| #endif |  | ||||||
|   // This is in single precision and fails some tests |   // This is in single precision and fails some tests | ||||||
|   auto anrm = sumD(inner_tmp_v,sites);   |   auto anrm = sumD(inner_tmp_v,sites);   | ||||||
|   nrm = anrm; |   nrm = anrm; | ||||||
| @@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | |||||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); |   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); | ||||||
| #else | #else | ||||||
|   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; |   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp; | ||||||
|  |   inner_tmp.resize(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|  |  | ||||||
|   accelerator_for( ss, sites, nsimd,{ |   accelerator_for( ss, sites, nsimd,{ | ||||||
|   | |||||||
| @@ -43,49 +43,20 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | |||||||
|   } |   } | ||||||
| } | } | ||||||
|   |   | ||||||
|   |  | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // remove and insert a half checkerboard | // remove and insert a half checkerboard | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | ||||||
| { | { | ||||||
|   half.Checkerboard() = cb; |   acceleratorPickCheckerboard(cb,half,full); | ||||||
|  |  | ||||||
|   autoView( half_v, half, CpuWrite); |  | ||||||
|   autoView( full_v, full, CpuRead); |  | ||||||
|   thread_for(ss, full.Grid()->oSites(),{ |  | ||||||
|     int cbos; |  | ||||||
|     Coordinate coor; |  | ||||||
|     full.Grid()->oCoorFromOindex(coor,ss); |  | ||||||
|     cbos=half.Grid()->CheckerBoard(coor); |  | ||||||
|  |  | ||||||
|     if (cbos==cb) { |  | ||||||
|       int ssh=half.Grid()->oIndex(coor); |  | ||||||
|       half_v[ssh] = full_v[ss]; |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } | } | ||||||
| template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | ||||||
| { | { | ||||||
|   int cb = half.Checkerboard(); |   acceleratorSetCheckerboard(full,half); | ||||||
|   autoView( half_v , half, CpuRead); |  | ||||||
|   autoView( full_v , full, CpuWrite); |  | ||||||
|   thread_for(ss,full.Grid()->oSites(),{ |  | ||||||
|  |  | ||||||
|     Coordinate coor; |  | ||||||
|     int cbos; |  | ||||||
|  |  | ||||||
|     full.Grid()->oCoorFromOindex(coor,ss); |  | ||||||
|     cbos=half.Grid()->CheckerBoard(coor); |  | ||||||
|        |  | ||||||
|     if (cbos==cb) { |  | ||||||
|       int ssh=half.Grid()->oIndex(coor); |  | ||||||
|       full_v[ss]=half_v[ssh]; |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0) | template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0) | ||||||
| { | { | ||||||
|   half.Checkerboard() = cb; |   half.Checkerboard() = cb; | ||||||
|   autoView(half_v, half, AcceleratorWrite); |   autoView(half_v, half, AcceleratorWrite); | ||||||
| @@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | |||||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; |   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; |   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||||
|   Coordinate ostride_half          = half.Grid()->_ostride; |   Coordinate ostride_half          = half.Grid()->_ostride; | ||||||
|  |   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||||
|   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ |   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||||
|      |      | ||||||
|     Coordinate coor; |     Coordinate coor; | ||||||
| @@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | |||||||
|     } |     } | ||||||
|   }); |   }); | ||||||
| } | } | ||||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0) | template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0) | ||||||
| { | { | ||||||
|   int cb = half.Checkerboard(); |   int cb = half.Checkerboard(); | ||||||
|   autoView(half_v , half, AcceleratorRead); |   autoView(half_v , half, AcceleratorRead); | ||||||
| @@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full, | |||||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; |   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; |   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||||
|   Coordinate ostride_half          = half.Grid()->_ostride; |   Coordinate ostride_half          = half.Grid()->_ostride; | ||||||
|  |   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||||
|   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ |   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||||
|  |  | ||||||
|     Coordinate coor; |     Coordinate coor; | ||||||
|   | |||||||
| @@ -90,6 +90,7 @@ public: | |||||||
|         exit(1); |         exit(1); | ||||||
|       } |       } | ||||||
|       Parameters.StartingType = arg; |       Parameters.StartingType = arg; | ||||||
|  |       std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { |     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { | ||||||
| @@ -97,6 +98,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.StartTrajectory = ivec[0]; |       Parameters.StartTrajectory = ivec[0]; | ||||||
|  |       std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { |     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { | ||||||
| @@ -104,6 +106,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.Trajectories = ivec[0]; |       Parameters.Trajectories = ivec[0]; | ||||||
|  |       std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { |     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { | ||||||
| @@ -111,6 +114,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.NoMetropolisUntil = ivec[0]; |       Parameters.NoMetropolisUntil = ivec[0]; | ||||||
|  |       std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { |     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { | ||||||
|       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); |       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); | ||||||
|   | |||||||
| @@ -137,9 +137,11 @@ public: | |||||||
|  |  | ||||||
|       double start_force = usecond(); |       double start_force = usecond(); | ||||||
|  |  | ||||||
|  |       MemoryManager::Print(); | ||||||
|       as[level].actions.at(a)->deriv_timer_start(); |       as[level].actions.at(a)->deriv_timer_start(); | ||||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta |       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||||
|       as[level].actions.at(a)->deriv_timer_stop(); |       as[level].actions.at(a)->deriv_timer_stop(); | ||||||
|  |       MemoryManager::Print(); | ||||||
|  |  | ||||||
|       auto name = as[level].actions.at(a)->action_name(); |       auto name = as[level].actions.at(a)->action_name(); | ||||||
|  |  | ||||||
| @@ -246,7 +248,11 @@ public: | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   virtual ~Integrator() {} |   virtual ~Integrator() | ||||||
|  |   { | ||||||
|  |     // Pain in the ass to clean up the Level pointers | ||||||
|  |     // Guido's design is at fault as per comment above in constructor | ||||||
|  |   } | ||||||
|  |  | ||||||
|   virtual std::string integrator_name() = 0; |   virtual std::string integrator_name() = 0; | ||||||
|    |    | ||||||
| @@ -460,6 +466,7 @@ public: | |||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
|  |  | ||||||
|  | 	MemoryManager::Print(); | ||||||
|         // get gauge field from the SmearingPolicy and |         // get gauge field from the SmearingPolicy and | ||||||
|         // based on the boolean is_smeared in actionID |         // based on the boolean is_smeared in actionID | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||||
| @@ -468,6 +475,7 @@ public: | |||||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); |    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||||
|         H += Hterm; |         H += Hterm; | ||||||
|  | 	MemoryManager::Print(); | ||||||
|  |  | ||||||
|       } |       } | ||||||
|       as[level].apply(S_hireps, Representations, level, H); |       as[level].apply(S_hireps, Representations, level, H); | ||||||
|   | |||||||
| @@ -32,7 +32,9 @@ private: | |||||||
|   //  Smear_Stout<Gimpl> *StoutSmearing; |   //  Smear_Stout<Gimpl> *StoutSmearing; | ||||||
|   //  std::vector<GaugeField> SmearedSet; |   //  std::vector<GaugeField> SmearedSet; | ||||||
|    |    | ||||||
|  |   GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object | ||||||
|   std::vector<LatticeLorentzComplex> masks; |   std::vector<LatticeLorentzComplex> masks; | ||||||
|  |   std::vector<int> cbs; | ||||||
|  |  | ||||||
|   typedef typename SU3Adjoint::AMatrix AdjMatrix; |   typedef typename SU3Adjoint::AMatrix AdjMatrix; | ||||||
|   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; |   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; | ||||||
| @@ -147,6 +149,25 @@ private: | |||||||
|     } |     } | ||||||
|     pokeLorentz(Fdet, Fdet_pol, nu); |     pokeLorentz(Fdet, Fdet_pol, nu); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   void Compute_MpInvJx_dNxxdSy(int cb, | ||||||
|  | 			       const GaugeLinkField &PlaqL, | ||||||
|  | 			       const GaugeLinkField &PlaqR, | ||||||
|  | 			       AdjMatrixField MpInvJx, | ||||||
|  | 			       AdjVectorField &Fdet2 ) | ||||||
|  |   { | ||||||
|  |     GaugeLinkField PlaqLeo(UrbGrid); | ||||||
|  |     GaugeLinkField PlaqReo(UrbGrid); | ||||||
|  |     AdjMatrixField MpInvJxeo(UrbGrid); | ||||||
|  |     AdjVectorField Fdet2eo(UrbGrid); | ||||||
|  |     pickCheckerboard(cb,PlaqLeo,PlaqL); | ||||||
|  |     pickCheckerboard(cb,PlaqReo,PlaqR); | ||||||
|  |     pickCheckerboard(cb,MpInvJxeo,MpInvJx); | ||||||
|  |     Fdet2eo.Checkerboard()=cb; | ||||||
|  |     Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo); | ||||||
|  |     setCheckerboard(Fdet2,Fdet2eo); | ||||||
|  |   } | ||||||
|  |    | ||||||
|   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) |   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) | ||||||
|   { |   { | ||||||
|     GaugeLinkField UtaU(PlaqL.Grid()); |     GaugeLinkField UtaU(PlaqL.Grid()); | ||||||
| @@ -278,6 +299,7 @@ public: | |||||||
|     //////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////// | ||||||
|     // Mask the gauge field |     // Mask the gauge field | ||||||
|     //////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     int cb = cbs[smr]; | ||||||
|     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask |     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask | ||||||
|      |      | ||||||
|     Umsk = U; |     Umsk = U; | ||||||
| @@ -442,7 +464,7 @@ public: | |||||||
|     AdjMatrixField MpInvJx_nu(grid); |     AdjMatrixField MpInvJx_nu(grid); | ||||||
|     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor |     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor | ||||||
|  |  | ||||||
|     Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); |     Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||||
|     Fdet2_mu=FdetV; |     Fdet2_mu=FdetV; | ||||||
|     Fdet1_mu=Zero(); |     Fdet1_mu=Zero(); | ||||||
|      |      | ||||||
| @@ -499,7 +521,7 @@ public: | |||||||
|  |  | ||||||
| 	time=-usecond(); | 	time=-usecond(); | ||||||
| 	PlaqR=(-1.0)*PlaqR; | 	PlaqR=(-1.0)*PlaqR; | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||||
| 	Fdet2_nu = FdetV; | 	Fdet2_nu = FdetV; | ||||||
| 	time+=usecond(); | 	time+=usecond(); | ||||||
| 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | ||||||
| @@ -520,7 +542,7 @@ public: | |||||||
| 	 | 	 | ||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
| 	 | 	 | ||||||
| 	///////////////// -ve nu ///////////////// | 	///////////////// -ve nu ///////////////// | ||||||
| @@ -539,7 +561,7 @@ public: | |||||||
| 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | ||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
| 	 | 	 | ||||||
| 	// x== | 	// x== | ||||||
| @@ -560,7 +582,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
|  |  | ||||||
| 	///////////////////////////////////////////////////////////////////// | 	///////////////////////////////////////////////////////////////////// | ||||||
| @@ -589,7 +611,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | ||||||
|  |  | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | 	Fdet2_mu = Fdet2_mu+FdetV; | ||||||
|  |  | ||||||
| 	//  __ | 	//  __ | ||||||
| @@ -609,7 +631,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||||
|  |  | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | 	Fdet2_mu = Fdet2_mu+FdetV; | ||||||
| 	 | 	 | ||||||
|       } |       } | ||||||
| @@ -931,6 +953,10 @@ private: | |||||||
| public: | public: | ||||||
|  |  | ||||||
|   /* Standard constructor */ |   /* Standard constructor */ | ||||||
|  |   virtual ~SmearedConfigurationMasked() | ||||||
|  |   { | ||||||
|  |     delete UrbGrid; | ||||||
|  |   } | ||||||
|   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) |   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) | ||||||
|     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) |     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) | ||||||
|   { |   { | ||||||
| @@ -939,7 +965,6 @@ public: | |||||||
|     // was resized in base class |     // was resized in base class | ||||||
|     assert(this->SmearedSet.size()==Nsmear); |     assert(this->SmearedSet.size()==Nsmear); | ||||||
|      |      | ||||||
|     GridRedBlackCartesian * UrbGrid; |  | ||||||
|     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); |     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); | ||||||
|     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); |     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); | ||||||
|     LatticeComplex tmp(_UGrid); |     LatticeComplex tmp(_UGrid); | ||||||
| @@ -947,11 +972,12 @@ public: | |||||||
|     for (unsigned int i = 0; i < this->smearingLevels; ++i) { |     for (unsigned int i = 0; i < this->smearingLevels; ++i) { | ||||||
|  |  | ||||||
|       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); |       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); | ||||||
|  |  | ||||||
|       int mu= (i/2) %Nd; |       int mu= (i/2) %Nd; | ||||||
|       int cb= (i%2); |       int cb= (i%2); | ||||||
|       LatticeComplex tmpcb(UrbGrid); |       LatticeComplex tmpcb(UrbGrid); | ||||||
|  |  | ||||||
|  |       cbs.push_back(cb); | ||||||
|  | 	 | ||||||
|       masks[i]=Zero(); |       masks[i]=Zero(); | ||||||
|       //////////////////// |       //////////////////// | ||||||
|       // Setup the mask |       // Setup the mask | ||||||
| @@ -962,7 +988,6 @@ public: | |||||||
|       PokeIndex<LorentzIndex>(masks[i],tmp, mu); |       PokeIndex<LorentzIndex>(masks[i],tmp, mu); | ||||||
| 	 | 	 | ||||||
|     } |     } | ||||||
|     delete UrbGrid; |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   virtual void smeared_force(GaugeField &SigmaTilde)  |   virtual void smeared_force(GaugeField &SigmaTilde)  | ||||||
|   | |||||||
| @@ -418,33 +418,33 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, | |||||||
|   int hNNm1= NNm1/2; |   int hNNm1= NNm1/2; | ||||||
|   RealD sqrt_2 = sqrt(2.0); |   RealD sqrt_2 = sqrt(2.0); | ||||||
|   Complex ci(0.0,1.0); |   Complex ci(0.0,1.0); | ||||||
|  |  | ||||||
|  |   const int nsimd=  Matrix::Nsimd(); | ||||||
|  |   accelerator_for(ss,grid->oSites(),nsimd,{ | ||||||
|       for(int su2Index=0;su2Index<hNNm1;su2Index++){ |       for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||||
| 	int i1, i2; | 	int i1, i2; | ||||||
| 	su2SubGroupIndex(i1, i2, su2Index); | 	su2SubGroupIndex(i1, i2, su2Index); | ||||||
| 	int ax = su2Index*2; | 	int ax = su2Index*2; | ||||||
| 	int ay = su2Index*2+1; | 	int ay = su2Index*2+1; | ||||||
|     accelerator_for(ss,grid->oSites(),1,{ |  | ||||||
| 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | ||||||
| 	// trace( Ta x Ci in) | 	// trace( Ta x Ci in) | ||||||
| 	// Bet I need to move to real part with mult by -i | 	// Bet I need to move to real part with mult by -i | ||||||
| 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); | 	coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2)))); | ||||||
| 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); | 	coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1)))); | ||||||
|       }); |  | ||||||
|       } |       } | ||||||
|       for(int diagIndex=0;diagIndex<N-1;diagIndex++){ |       for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||||
| 	int k = diagIndex + 1; // diagIndex starts from 0 | 	int k = diagIndex + 1; // diagIndex starts from 0 | ||||||
| 	int a = NNm1+diagIndex; | 	int a = NNm1+diagIndex; | ||||||
| 	RealD scale = 1.0/sqrt(2.0*k*(k+1)); | 	RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||||
|     accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ | 	auto tmp = in_v(ss)()()(0,0); | ||||||
| 	auto tmp = in_v[ss]()()(0,0); |  | ||||||
| 	for(int i=1;i<k;i++){ | 	for(int i=1;i<k;i++){ | ||||||
| 	  tmp=tmp+in_v[ss]()()(i,i); | 	  tmp=tmp+in_v(ss)()()(i,i); | ||||||
|  | 	} | ||||||
|  | 	tmp = tmp - in_v(ss)()()(k,k)*k; | ||||||
|  | 	coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale); | ||||||
|       } |       } | ||||||
| 	tmp = tmp - in_v[ss]()()(k,k)*k; |  | ||||||
| 	out_v[ss]()()(a,b) =imag(tmp) * scale; |  | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
| } |  | ||||||
|  |  | ||||||
|    |    | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) { | |||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| // Map a su2 subgroup number to the pair of rows that are non zero | // Map a su2 subgroup number to the pair of rows that are non zero | ||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||||
|   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); |   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); | ||||||
|  |  | ||||||
|   int spare = su2_index; |   int spare = su2_index; | ||||||
|   | |||||||
| @@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) { | |||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  | template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {}; | ||||||
|  | template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {}; | ||||||
|  | template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {}; | ||||||
|  | #endif | ||||||
|   | |||||||
| @@ -58,7 +58,7 @@ int main(int argc, char **argv) { | |||||||
|   HMCparameters HMCparams; |   HMCparameters HMCparams; | ||||||
|   HMCparams.StartTrajectory  = 0; |   HMCparams.StartTrajectory  = 0; | ||||||
|   HMCparams.Trajectories     = 200; |   HMCparams.Trajectories     = 200; | ||||||
|   HMCparams.NoMetropolisUntil=  20; |   HMCparams.NoMetropolisUntil=  0; | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||||
|   HMCparams.StartingType     =std::string("ColdStart"); |   HMCparams.StartingType     =std::string("ColdStart"); | ||||||
|   HMCparams.MD = MD; |   HMCparams.MD = MD; | ||||||
| @@ -70,7 +70,7 @@ int main(int argc, char **argv) { | |||||||
|   CheckpointerParameters CPparams; |   CheckpointerParameters CPparams; | ||||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; |   CPparams.config_prefix = "ckpoint_EODWF_lat"; | ||||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; |   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; | ||||||
|   CPparams.saveInterval  = 10; |   CPparams.saveInterval  = 1; | ||||||
|   CPparams.format        = "IEEE64BIG"; |   CPparams.format        = "IEEE64BIG"; | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||||
|  |  | ||||||
| @@ -186,6 +186,8 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   // HMC parameters are serialisable |   // HMC parameters are serialisable | ||||||
|  |   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file                                                                      | ||||||
|  |   TheHMC.initializeGaugeFieldAndRNGs(U); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; |   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||||
|   TheHMC.Run();  // no smearing |   TheHMC.Run();  // no smearing | ||||||
|   | |||||||
| @@ -261,23 +261,25 @@ public: | |||||||
|     fprintf(FP,"\n\n"); |     fprintf(FP,"\n\n"); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   template<class CComplex> | ||||||
|   static void BLAS(void) |   static void BLAS(void) | ||||||
|   { |   { | ||||||
|     //int nbasis, int nrhs, int coarseVol |     //int nbasis, int nrhs, int coarseVol | ||||||
|     int  basis[] = { 16,32,64 }; |     int  basis[] = { 16,32,64 }; | ||||||
|     int  rhs[]   = { 8,16,32 }; |     int  rhs[]   = { 8,12,16 }; | ||||||
|     int  vol  = 4*4*4*4; |     int  vol  = 8*8*8*8; | ||||||
|  |     int  blk  = 4*4*4*4; | ||||||
|  |  | ||||||
|     GridBLAS blas; |     GridBLAS blas; | ||||||
|  |  | ||||||
|  |     int fpbits = sizeof(CComplex)*4; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl; |     std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; |     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|    |    | ||||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n"); |     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits); | ||||||
|  |  | ||||||
|     for(int b=0;b<3;b++){ |     for(int b=0;b<3;b++){ | ||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
| @@ -285,7 +287,7 @@ public: | |||||||
|       int N=rhs[r]; |       int N=rhs[r]; | ||||||
|       int K=basis[b]; |       int K=basis[b]; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|        |        | ||||||
| @@ -299,9 +301,9 @@ public: | |||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
|       int M=basis[b]; |       int M=basis[b]; | ||||||
|       int N=rhs[r]; |       int N=rhs[r]; | ||||||
|       int K=vol; |       int K=blk; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
| @@ -313,10 +315,10 @@ public: | |||||||
|     for(int b=0;b<3;b++){ |     for(int b=0;b<3;b++){ | ||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
|       int M=rhs[r]; |       int M=rhs[r]; | ||||||
|       int N=vol; |       int N=blk; | ||||||
|       int K=basis[b]; |       int K=basis[b]; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
| @@ -867,6 +869,7 @@ int main (int argc, char ** argv) | |||||||
|   int do_memory=1; |   int do_memory=1; | ||||||
|   int do_comms =1; |   int do_comms =1; | ||||||
|   int do_blas  =1; |   int do_blas  =1; | ||||||
|  |   int do_dslash=1; | ||||||
|  |  | ||||||
|   int sel=4; |   int sel=4; | ||||||
|   std::vector<int> L_list({8,12,16,24,32}); |   std::vector<int> L_list({8,12,16,24,32}); | ||||||
| @@ -877,6 +880,7 @@ int main (int argc, char ** argv) | |||||||
|   std::vector<double> staggered; |   std::vector<double> staggered; | ||||||
|  |  | ||||||
|   int Ls=1; |   int Ls=1; | ||||||
|  |   if (do_dslash){ | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; |   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -901,6 +905,7 @@ int main (int argc, char ** argv) | |||||||
|     staggered.push_back(result); |     staggered.push_back(result); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; |   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -909,8 +914,33 @@ int main (int argc, char ** argv) | |||||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; | ||||||
|   } |   } | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   int NN=NN_global; |   int NN=NN_global; | ||||||
|  |   if(do_dslash){ | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||||
|  |     fprintf(FP,"Per node summary table\n"); | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||||
|  |       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||||
|  |     } | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||||
|  |     std::cout<<std::setprecision(3); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |    | ||||||
|   if ( do_memory ) { |   if ( do_memory ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; |     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||||
| @@ -918,15 +948,6 @@ int main (int argc, char ** argv) | |||||||
|     Benchmark::Memory(); |     Benchmark::Memory(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( do_blas ) { |  | ||||||
| #if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)    |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::BLAS(); |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if ( do_su4 ) { |   if ( do_su4 ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; |     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; | ||||||
| @@ -941,27 +962,13 @@ int main (int argc, char ** argv) | |||||||
|     Benchmark::Comms(); |     Benchmark::Comms(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   if ( do_blas ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; |     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; |     Benchmark::BLAS<ComplexD>(); | ||||||
|     fprintf(FP,"Per node summary table\n"); |     Benchmark::BLAS<ComplexF>(); | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); |  | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|     for(int l=0;l<L_list.size();l++){ |  | ||||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; |  | ||||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); |  | ||||||
|   } |   } | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; |  | ||||||
|     std::cout<<std::setprecision(3); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|    |    | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   fclose(FP); |   fclose(FP); | ||||||
|   | |||||||
| @@ -1,16 +1,18 @@ | |||||||
|  |  | ||||||
|  | export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "  | ||||||
|  | export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| 	--enable-comms=mpi-auto \ | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--enable-debug \ | ||||||
| 	--disable-gparity \ | 	--disable-gparity \ | ||||||
| 	--disable-fermion-reps \ | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
| 	--enable-shm=nvlink \ | 	--enable-shm=nvlink \ | ||||||
| 	--enable-accelerator=sycl \ | 	--enable-accelerator=sycl \ | ||||||
| 	--enable-accelerator-aware-mpi=yes\ | 	--enable-accelerator-aware-mpi=yes\ | ||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx  | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ |  | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										23
									
								
								systems/Aurora/config-command-leak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								systems/Aurora/config-command-leak
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | |||||||
|  | source ~/spack/share/spack/setup-env.sh  | ||||||
|  | spack load c-lime | ||||||
|  | export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||||
|  | export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||||
|  | export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
|  | ../../configure \ | ||||||
|  | 	--enable-debug \ | ||||||
|  | 	--enable-simd=GPU \ | ||||||
|  | 	--enable-gen-simd-width=64 \ | ||||||
|  | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--disable-gparity \ | ||||||
|  | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
|  | 	--enable-shm=nvlink \ | ||||||
|  | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=yes\ | ||||||
|  | 	--enable-unified=no \ | ||||||
|  | 	MPICXX=mpicxx \ | ||||||
|  | 	CXX=icpx \ | ||||||
|  | 	LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl  -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \ | ||||||
|  | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host  -fsycl -fsanitize=leak " | ||||||
|  |  | ||||||
							
								
								
									
										22
									
								
								systems/Aurora/config-command-sanitize
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								systems/Aurora/config-command-sanitize
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | |||||||
|  | # -fsycl-targets=spir64_gen -Xs\" -device pvc \" | ||||||
|  | # -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc | ||||||
|  | # -fsycl-targets=intel_gpu_pvc | ||||||
|  |  | ||||||
|  | unset DEVICE | ||||||
|  | export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address"  | ||||||
|  | export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||||
|  | ../../configure \ | ||||||
|  | 	--enable-simd=GPU \ | ||||||
|  | 	--enable-gen-simd-width=64 \ | ||||||
|  | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--enable-debug \ | ||||||
|  | 	--disable-gparity \ | ||||||
|  | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
|  | 	--enable-shm=nvlink \ | ||||||
|  | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=yes\ | ||||||
|  | 	--enable-unified=no \ | ||||||
|  | 	MPICXX=mpicxx \ | ||||||
|  | 	CXX=icpx  | ||||||
|  |  | ||||||
| @@ -1,14 +1,22 @@ | |||||||
|  | source ~/spack/share/spack/setup-env.sh  | ||||||
|  | spack load c-lime | ||||||
|  | export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||||
|  | #spack load libefence | ||||||
|  | #export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` | ||||||
|  | #export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH | ||||||
|  | #spack load gperftools | ||||||
|  | export TCMALLOC=/home/paboyle/gperftools/install | ||||||
|  | export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  | export INTELGT_AUTO_ATTACH_DISABLE=1 | ||||||
|  |  | ||||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | ||||||
|  | #module load oneapi/release/2023.12.15.001 | ||||||
| module load oneapi/release/2023.12.15.001 |  | ||||||
|  |  | ||||||
| #module use /soft/modulefiles | #module use /soft/modulefiles | ||||||
| #module load intel_compute_runtime/release/agama-devel-682.22 | #module load intel_compute_runtime/release/agama-devel-682.22 | ||||||
|  |  | ||||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 | #export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||||
| export FI_CXI_CQ_FILL_PERCENT=20 | #export FI_CXI_CQ_FILL_PERCENT=20 | ||||||
|  | #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" |  | ||||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | ||||||
|  |  | ||||||
| # | # | ||||||
| @@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | |||||||
| # -ftarget-register-alloc-mode=pvc:small | # -ftarget-register-alloc-mode=pvc:small | ||||||
| # -ftarget-register-alloc-mode=pvc:large | # -ftarget-register-alloc-mode=pvc:large | ||||||
| # -ftarget-register-alloc-mode=pvc:auto | # -ftarget-register-alloc-mode=pvc:auto | ||||||
| # | #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||||
|  |  | ||||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | ||||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | ||||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 | export http_proxy=http://proxy.alcf.anl.gov:3128 | ||||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 | export https_proxy=http://proxy.alcf.anl.gov:3128 | ||||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 |  | ||||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 | git config --global http.proxy http://proxy.alcf.anl.gov:3128 | ||||||
|  |  | ||||||
|  | #source ~/spack/share/spack/setup-env.sh | ||||||
|  | #spack load gperftools | ||||||
|  | #export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||||
|  | #export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|   | |||||||
| @@ -3,7 +3,7 @@ spack load c-lime | |||||||
| module load emacs  | module load emacs  | ||||||
| module load PrgEnv-gnu | module load PrgEnv-gnu | ||||||
| module load rocm | module load rocm | ||||||
| module load cray-mpich/8.1.23 | module load cray-mpich | ||||||
| module load gmp | module load gmp | ||||||
| module load cray-fftw | module load cray-fftw | ||||||
| module load craype-accel-amd-gfx90a | module load craype-accel-amd-gfx90a | ||||||
|   | |||||||
							
								
								
									
										118
									
								
								tests/debug/Test_8888.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										118
									
								
								tests/debug/Test_8888.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,118 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2023 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <pboyle@bnl.gov> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
|  | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   const int Ls=8; | ||||||
|  |   const int nbasis = 40; | ||||||
|  |   const int cb = 0 ; | ||||||
|  |   RealD mass=0.01; | ||||||
|  |   RealD M5=1.8; | ||||||
|  |   RealD b=1.0; | ||||||
|  |   RealD c=0.0; | ||||||
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||||
|  | 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 								   GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   ///////////////////////// RNGs ///////////////////////////////// | ||||||
|  |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |   std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |   std::vector<int> cseeds({5,6,7,8}); | ||||||
|  |  | ||||||
|  |   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |  | ||||||
|  |   ///////////////////////// Configuration ///////////////////////////////// | ||||||
|  |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|  |   FieldMetaData header; | ||||||
|  |   std::string file("ckpoint_EODWF_lat.125"); | ||||||
|  |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|  |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
|  |   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||||
|  |  | ||||||
|  |   MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf); | ||||||
|  |  | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Fine Power method            "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   LatticeFermionD pm_src(FGrid); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<LatticeFermionD>       fPM; | ||||||
|  |   fPM(HermOp,pm_src); | ||||||
|  |  | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Fine Lanczos  (poly, low)    "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |    | ||||||
|  |   int Nk=80; | ||||||
|  |   int Nm=Nk*3; | ||||||
|  |   int Nstop=8; | ||||||
|  |   int Nconv_test_interval=1; | ||||||
|  |    | ||||||
|  |   //  Chebyshev<LatticeFermionD>      IRLChebyLo(0.2,64.0,201);  // 1 iter | ||||||
|  |   Chebyshev<LatticeFermionD>      IRLChebyLo(0.0,55.0,101);  // 1 iter | ||||||
|  |   FunctionHermOp<LatticeFermionD>    PolyOp(IRLChebyLo,HermOp); | ||||||
|  |   PlainHermOp<LatticeFermionD>          Op(HermOp); | ||||||
|  |  | ||||||
|  |   ImplicitlyRestartedLanczos IRL(PolyOp, | ||||||
|  | 				 Op, | ||||||
|  | 				 Nk, // sought vecs | ||||||
|  | 				 Nk, // sought vecs | ||||||
|  | 				 Nm, // spare vecs | ||||||
|  | 				 1.0e-8, | ||||||
|  | 				 10 // Max iterations | ||||||
|  | 				 ); | ||||||
|  |  | ||||||
|  |   int Nconv; | ||||||
|  |   std::vector<RealD>            eval(Nm); | ||||||
|  |   std::vector<LatticeFermionD>     evec(Nm,FGrid); | ||||||
|  |   LatticeFermionD     irl_src(FGrid); | ||||||
|  |  | ||||||
|  |   IRL.calc(eval,evec,irl_src,Nconv); | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
| @@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf, | |||||||
| 		   GridParallelRNG *RNG5) | 		   GridParallelRNG *RNG5) | ||||||
| { | { | ||||||
|   LatticeFermion src   (FGrid); random(*RNG5,src); |   LatticeFermion src   (FGrid); random(*RNG5,src); | ||||||
|   LatticeFermion result(FGrid); result=Zero(); |   LatticeFermion result1(FGrid); result1=Zero(); | ||||||
|  |   LatticeFermion result2(FGrid); result2=Zero(); | ||||||
|  |   LatticeFermion result3(FGrid); result3=Zero(); | ||||||
|  |  | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); | ||||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); |   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); | ||||||
|   SchurSolver(Ddwf,src,result); |   SchurSolver(Ddwf,src,result1); | ||||||
|  |  | ||||||
|  |   SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG); | ||||||
|  |   SchurSolverSymm1(Ddwf,src,result2); | ||||||
|  |  | ||||||
|  |   SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG); | ||||||
|  |   SchurSolverSymm2(Ddwf,src,result3); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl;  | ||||||
|  |   result2=result2-result1; | ||||||
|  |   std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;  | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl;  | ||||||
|  |   result3=result3-result1; | ||||||
|  |   std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -244,7 +244,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |  | ||||||
|    | #if 0   | ||||||
|   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); |   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); | ||||||
|   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; |   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; | ||||||
|    |    | ||||||
| @@ -308,6 +308,7 @@ int main (int argc, char ** argv) | |||||||
|     mrhsCG(MrhsCoarseOp,rh_src,rh_res); |     mrhsCG(MrhsCoarseOp,rh_src,rh_res); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | #endif | ||||||
|   std::cout<<GridLogMessage<<std::endl; |   std::cout<<GridLogMessage<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<std::endl; |   std::cout<<GridLogMessage<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; |   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; | ||||||
|   | |||||||
| @@ -145,7 +145,7 @@ int main (int argc, char ** argv) | |||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   const int Ls=24; |   const int Ls=24; | ||||||
|   const int nbasis = 60; |   const int nbasis = 62; | ||||||
|   const int cb = 0 ; |   const int cb = 0 ; | ||||||
|   RealD mass=0.00078; |   RealD mass=0.00078; | ||||||
|   RealD M5=1.8; |   RealD M5=1.8; | ||||||
| @@ -160,7 +160,7 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|   // Construct a coarsened grid with 4^4 cell |   // Construct a coarsened grid with 4^4 cell | ||||||
|   Coordinate Block({4,4,4,4}); |   Coordinate Block({4,4,6,4}); | ||||||
|   Coordinate clatt = GridDefaultLatt(); |   Coordinate clatt = GridDefaultLatt(); | ||||||
|   for(int d=0;d<clatt.size();d++){ |   for(int d=0;d<clatt.size();d++){ | ||||||
|     clatt[d] = clatt[d]/Block[d]; |     clatt[d] = clatt[d]/Block[d]; | ||||||
|   | |||||||
| @@ -160,7 +160,8 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|   // Construct a coarsened grid with 4^4 cell |   // Construct a coarsened grid with 4^4 cell | ||||||
|   Coordinate Block({4,4,6,6}); |   //  Coordinate Block({4,4,6,4}); | ||||||
|  |   Coordinate Block({4,4,4,4}); | ||||||
|   Coordinate clatt = GridDefaultLatt(); |   Coordinate clatt = GridDefaultLatt(); | ||||||
|   for(int d=0;d<clatt.size();d++){ |   for(int d=0;d<clatt.size();d++){ | ||||||
|     clatt[d] = clatt[d]/Block[d]; |     clatt[d] = clatt[d]/Block[d]; | ||||||
| @@ -217,7 +218,7 @@ int main (int argc, char ** argv) | |||||||
|   std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); |   std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); | ||||||
|   std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); |   std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); | ||||||
|   bool load_agg=true; |   bool load_agg=true; | ||||||
|   bool load_refine=false; |   bool load_refine=true; | ||||||
|   bool load_mat=false; |   bool load_mat=false; | ||||||
|   bool load_evec=false; |   bool load_evec=false; | ||||||
|  |  | ||||||
| @@ -276,17 +277,25 @@ int main (int argc, char ** argv) | |||||||
|   std::cout << "**************************************"<<std::endl; |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; |   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||||
|   Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 1 iter |   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 4.4.6.4 | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,501);  // for 4.4.4.4 blocking 350 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0014,42.0,501);  // for 4.4.4.4 blocking 700 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.002,42.0,501);  // for 4.4.4.4 blocking 1226 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0025,42.0,501);  // for 4.4.4.4 blocking 1059 evs | ||||||
|  | 							  //							  3e-4,2); | ||||||
|  |   Chebyshev<CoarseVector>      IRLCheby(0.0018,42.0,301);  // for 4.4.4.4 blocking  // 790 evs | ||||||
|  |    | ||||||
|   MrhsHermMatrix MrhsCoarseOp     (mrhs); |   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||||
|  |  | ||||||
|   CoarseVector pm_src(CoarseMrhs); |   CoarseVector pm_src(CoarseMrhs); | ||||||
|   pm_src = ComplexD(1.0); |   pm_src = ComplexD(1.0); | ||||||
|   PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src); |   PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src); | ||||||
|  |  | ||||||
|   int Nk=nrhs*30; |   //  int Nk=nrhs*30; // 4.4.6.4 | ||||||
|   //  int Nk=nrhs*80; |   //  int Nk=nrhs*80; | ||||||
|   int Nm=Nk*4; |   int Nk=nrhs*60; // 720 | ||||||
|   int Nstop=Nk; |   int Nm=Nk*4;    // 2880 ; generally finishes at 1440 | ||||||
|  |   int Nstop=512; | ||||||
|   int Nconv_test_interval=1; |   int Nconv_test_interval=1; | ||||||
|    |    | ||||||
|   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, |   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||||
| @@ -299,7 +308,7 @@ int main (int argc, char ** argv) | |||||||
| 							  nrhs, | 							  nrhs, | ||||||
| 							  Nk, | 							  Nk, | ||||||
| 							  Nm, | 							  Nm, | ||||||
| 							  1e-4,20); | 							  3e-4,2); | ||||||
|  |  | ||||||
|   std::vector<RealD>            eval(Nm); |   std::vector<RealD>            eval(Nm); | ||||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); |   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||||
| @@ -331,7 +340,7 @@ int main (int argc, char ** argv) | |||||||
|   // Extra HDCG parameters |   // Extra HDCG parameters | ||||||
|   ////////////////////////// |   ////////////////////////// | ||||||
|   int maxit=3000; |   int maxit=3000; | ||||||
|   ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false); |   ConjugateGradient<CoarseVector>  CG(7.5e-2,maxit,false); | ||||||
|   RealD lo=2.0; |   RealD lo=2.0; | ||||||
|   int ord = 7; |   int ord = 7; | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user