Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-30 19:44:32 +00:00)

Compare commits: 9a1ad6a5eb ... feature/ft (66 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | bffd30abec |  |
|  | da919949f9 |  |
|  | b12b4fdaff |  |
|  | 557fa483ff |  |
|  | fc15d55df6 |  |
|  | 53573d7d94 |  |
|  | bb3c177000 |  |
|  | a3322b470f |  |
|  | f8f408e7a9 |  |
|  | baac1127d0 |  |
|  | 6f1328160c |  |
|  | 04cf902791 |  |
|  | 7a5b1c1a19 |  |
|  | 18d2d7da4a |  |
|  | b461184797 |  |
|  | 4563b39305 |  |
|  | c9d5674d5b |  |
|  | 486412635a |  |
|  | 8b23a1546a |  |
|  | a901e4e369 |  |
|  | 804d9367d4 |  |
|  | 41d8adca95 |  |
|  | 059e8e5bb0 |  |
|  | b3ee8ded96 |  |
|  | cf3584ad15 |  |
|  | a66973163f |  |
|  | 4502a8c8a1 |  |
|  | 9c902e4c2d |  |
|  | f3eb36adcf |  |
|  | 7c246606c1 |  |
|  | 172c75029e |  |
|  | 6ae52da571 |  |
|  | 4ee9c68053 |  |
|  | a15b4378a3 |  |
|  | 89fdd7f8dd |  |
|  | c328be24b7 |  |
|  | a73dc6dbf4 |  |
|  | eee2a2657f |  |
|  | 12b8be7cb9 |  |
|  | 63c223ea5d |  |
|  | 2877fb4a2c |  |
|  | d299c86633 |  |
|  | 6ce52092e8 |  |
|  | b5926c1d21 |  |
|  | 9563238e9b |  |
|  | fb9b1d76ca |  |
|  | 1739146599 |  |
|  | ed20b39ab3 |  |
|  | 284fc05f15 |  |
|  | 07a07b6fa3 |  |
|  | dc80b08969 |  |
|  | a49a161f8d |  |
|  | a6479ca50f |  |
|  | 0e607a55e7 |  |
|  | c4b9f71357 |  |
|  | 394e506aea |  |
|  | e19b26341b |  |
|  | cfe1b13225 |  |
|  | 890c5ea1cd |  |
|  | a87378d3b6 |  |
|  | 832fc08809 |  |
|  | 461cd045c6 |  |
|  | fee65d7a75 |  |
|  | 31f9971dbf |  |
|  | d87296f3e8 |  |
|  | be94cf1c6f |  |

BLAS_benchmark/BatchBlasBench.cc | 1052 | Normal file (file diff suppressed because it is too large)

BLAS_benchmark/compile-command | 2 | Normal file
							| @@ -0,0 +1,2 @@ | ||||
|  | ||||
| mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench | ||||
| @@ -30,9 +30,14 @@ directory | ||||
|  | ||||
| #include <type_traits> | ||||
| #include <cassert> | ||||
| #include <exception> | ||||
|  | ||||
| #define NAMESPACE_BEGIN(A) namespace A { | ||||
| #define NAMESPACE_END(A)   } | ||||
| #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) | ||||
| #define GRID_NAMESPACE_END   NAMESPACE_END(Grid) | ||||
| #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  );  | ||||
|  | ||||
| #define EXCEPTION_CHECK_BEGIN(A) try { | ||||
| #define EXCEPTION_CHECK_END(A)   } catch ( std::exception &e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; } | ||||
|  | ||||
|   | ||||
| @@ -89,9 +89,10 @@ public: | ||||
|       gridblasHandle = theGridAccelerator; | ||||
| #endif | ||||
| #ifdef GRID_ONE_MKL | ||||
|       cl::sycl::cpu_selector selector; | ||||
|       cl::sycl::gpu_selector selector; | ||||
|       cl::sycl::device selectedDevice { selector }; | ||||
|       gridblasHandle =new sycl::queue (selectedDevice); | ||||
|       cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; | ||||
|       gridblasHandle =new sycl::queue (selectedDevice,q_prop); | ||||
| #endif | ||||
|       gridblasInit=1; | ||||
|     } | ||||
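The GRID_ONE_MKL hunk above switches the fallback selector from CPU to GPU and constructs the queue with an in-order property. For reference, a minimal standalone sketch of that queue construction, using the older cl::sycl spellings the surrounding code assumes (not part of the commits):

```cpp
// Sketch only: mirrors the GRID_ONE_MKL path above.
#include <CL/sycl.hpp>

int main()
{
  cl::sycl::gpu_selector selector;                                   // pick a GPU device
  cl::sycl::device selectedDevice{selector};
  cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
  cl::sycl::queue q(selectedDevice, q_prop);                         // kernels execute in submission order
  // q can now be handed to oneMKL entry points such as gemm_batch.
  return 0;
}
```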
| @@ -207,6 +208,9 @@ public: | ||||
|     assert(Bkn.size()==batchCount); | ||||
|     assert(Cmn.size()==batchCount); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose | ||||
|     assert(OpB!=GridBLAS_OP_T); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -266,26 +270,130 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     // Need a default/reference implementation | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  ComplexD c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (ComplexD *) &alpha_p[0], | ||||
| 						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (ComplexD *) &beta_p[0], | ||||
| 						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|       synchronise(); | ||||
| #if 0 | ||||
|       // This code was used to check the mat mul on Sunspot/OneMKL | ||||
|       std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl; | ||||
|       std::vector<ComplexD> A(m*k);  // pointer list to matrices | ||||
|       std::vector<ComplexD> B(k*n); | ||||
|       std::vector<ComplexD> C(m*n); | ||||
|       //      int sda = lda*k; | ||||
|       //      int sdb = ldb*k; | ||||
|       //      int sdc = ldc*n; | ||||
|       std::cerr << " Checking the GEMM results "<<std::endl; | ||||
|       for (int p = 0; p < 1; ++p) { | ||||
| 	ComplexD * Amk_p;  // pointer list to matrices | ||||
| 	ComplexD * Bkn_p;  // pointer list to matrices | ||||
| 	ComplexD * Cmn_p;  // pointer list to matrices | ||||
| 	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*)); | ||||
| 	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*)); | ||||
| 	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*)); | ||||
| 	std::cerr << " p " << p << " copied pointers "<<std::endl; | ||||
| 	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD)); | ||||
| 	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD)); | ||||
| 	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD)); | ||||
| 	std::cerr << " p " << p << " copied matrices "<<std::endl; | ||||
| 	std::cerr << " C[0] "<<C[0]<<std::endl; | ||||
| 	std::cerr << " A[0] "<<A[0]<<std::endl; | ||||
| 	std::cerr << " B[0] "<<B[0]<<std::endl; | ||||
| 	std::cerr << " m "<<m<<std::endl; | ||||
| 	std::cerr << " n "<<n<<std::endl; | ||||
| 	std::cerr << " k "<<k<<std::endl; | ||||
| 	for (int mm = 0; mm < m; ++mm) { | ||||
| 	  for (int nn = 0; nn < n; ++nn) { | ||||
| 	    ComplexD c_mn(0.0); | ||||
| 	    for (int kk = 0; kk < k; ++kk) { | ||||
| 	      int idx_a, idx_b; | ||||
| 	      //    int lda = m; // m x k column major | ||||
| 	      //    int ldb = k; // k x n column major | ||||
| 	      //    int ldc = m; // m x b column major | ||||
| 	      if(OpA!=GridBLAS_OP_N) { | ||||
| 		idx_a =kk + mm*lda; | ||||
| 	      } else { | ||||
| 		idx_a =mm + kk*lda; | ||||
| 	      } | ||||
| 	      if(OpB!=GridBLAS_OP_N) { | ||||
| 		idx_b =nn + kk*ldb; | ||||
| 	      } else { | ||||
| 		idx_b =kk + nn*ldb; | ||||
| 	      } | ||||
| 	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl; | ||||
|  | ||||
| 	      ComplexD Ac = A[idx_a]; | ||||
| 	      ComplexD Bc = B[idx_b]; | ||||
| 	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac); | ||||
| 	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc); | ||||
| 	       | ||||
| 	      c_mn += Ac*Bc; | ||||
| 	    } | ||||
| 	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl; | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|     //    synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
|         }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; | ||||
| 	  } ); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 8.0*m*n*k*batchCount; | ||||
|      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; | ||||
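For reference, the cost model behind the flops and bytes lines above, as I read the code (not part of the commits): one complex multiply-add costs 6 + 2 = 8 real flops, and each batch element touches the three matrices once:

```latex
\mathrm{flops} = 8\, m\, n\, k \times \mathrm{batchCount},
\qquad
\mathrm{bytes} = \operatorname{sizeof}(\texttt{ComplexD}) \times (mk + kn + mn) \times \mathrm{batchCount}.
```

The real-precision variants further down count 2 m n k flops for the same reason.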
| @@ -306,6 +414,9 @@ public: | ||||
|     RealD t2=usecond(); | ||||
|     int32_t batchCount = Amk.size(); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose | ||||
|     assert(OpB!=GridBLAS_OP_T); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -366,26 +477,69 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (ComplexF *) &alpha_p[0], | ||||
| 						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (ComplexF *) &beta_p[0], | ||||
| 						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|     synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     ComplexF alphaf(real(alpha),imag(alpha)); | ||||
|     ComplexF betaf(real(beta),imag(beta)); | ||||
|     // Need a default/reference implementation | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  ComplexF c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; | ||||
| 	} | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; | ||||
| 	  } ); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 8.0*m*n*k*batchCount; | ||||
| @@ -408,6 +562,9 @@ public: | ||||
|     RealD t2=usecond(); | ||||
|     int32_t batchCount = Amk.size(); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_C); // Real case no conjugate | ||||
|     assert(OpB!=GridBLAS_OP_C); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -467,24 +624,69 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (float *) &alpha_p[0], | ||||
| 						  (const float **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const float **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (float *) &beta_p[0], | ||||
| 						  (float **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|       synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     // Need a default/reference implementation | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  RealD c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | ||||
| 	} | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; | ||||
| 	  } ); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 2.0*m*n*k*batchCount; | ||||
| @@ -495,7 +697,6 @@ public: | ||||
|   /////////////////////////////////////////////////////////////////////////// | ||||
|   // Double precision real GEMM | ||||
|   /////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|   void gemmBatched(GridBLASOperation_t OpA, | ||||
| 		   GridBLASOperation_t OpB, | ||||
| 		   int m,int n, int k, | ||||
| @@ -508,6 +709,9 @@ public: | ||||
|     RealD t2=usecond(); | ||||
|     int32_t batchCount = Amk.size(); | ||||
|  | ||||
|     assert(OpA!=GridBLAS_OP_C); // Real case no conjugate | ||||
|     assert(OpB!=GridBLAS_OP_C); | ||||
|  | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
| @@ -568,160 +772,124 @@ public: | ||||
|     assert(err==CUBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_SYCL | ||||
|     /* | ||||
|       int64_t m64=m; | ||||
|       int64_t n64=n; | ||||
|       int64_t k64=k; | ||||
|       int64_t lda64=lda; | ||||
|       int64_t ldb64=ldb; | ||||
|       int64_t ldc64=ldc; | ||||
|       int64_t batchCount64=batchCount; | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, | ||||
|       onemkl::transpose::N, | ||||
|       onemkl::transpose::N, | ||||
|       &m64,&n64,&k64, | ||||
|       (double *) &alpha_p[0], | ||||
|       (double **)&Amk[0], lda, | ||||
|       (double **)&Bkn[0], ldb, | ||||
|       (double *) &beta_p[0], | ||||
|       (double **)&Cmn[0], ldc, | ||||
|       1,&batchCount64); | ||||
|      */ | ||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI | ||||
| #warning "oneMKL implementation not built " | ||||
|  | ||||
|       oneapi::mkl::transpose iOpA; | ||||
|       oneapi::mkl::transpose iOpB; | ||||
|        | ||||
|       if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N; | ||||
|       if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T; | ||||
|       if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C; | ||||
|       if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N; | ||||
|       if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T; | ||||
|       if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C; | ||||
|  | ||||
|       oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						  &iOpA, | ||||
| 						  &iOpB, | ||||
| 						  &m64,&n64,&k64, | ||||
| 						  (double *) &alpha_p[0], | ||||
| 						  (const double **)&Amk[0], (const int64_t *)&lda64, | ||||
| 						  (const double **)&Bkn[0], (const int64_t *)&ldb64, | ||||
| 						  (double *) &beta_p[0], | ||||
| 						  (double **)&Cmn[0], (const int64_t *)&ldc64, | ||||
| 						  (int64_t)1,&batchCount64,std::vector<sycl::event>()); | ||||
|       synchronise(); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) | ||||
|     int sda = lda*k; | ||||
|     int sdb = ldb*k; | ||||
|     int sdc = ldc*n; | ||||
|     // Need a default/reference implementation | ||||
|     for (int p = 0; p < batchCount; ++p) { | ||||
|       for (int mm = 0; mm < m; ++mm) { | ||||
| 	for (int nn = 0; nn < n; ++nn) { | ||||
| 	  RealD c_mn(0.0); | ||||
| 	  for (int kk = 0; kk < k; ++kk) | ||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; | ||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; | ||||
| 	} | ||||
|     // Need a default/reference implementation; use Eigen | ||||
|       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; | ||||
| 	  }); | ||||
|       } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { | ||||
| 	thread_for (p, batchCount, { | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); | ||||
| 	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); | ||||
| 	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; | ||||
| 	  }); | ||||
|       } else {  | ||||
| 	assert(0); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|      RealD t1=usecond(); | ||||
|      RealD flops = 2.0*m*n*k*batchCount; | ||||
|      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; | ||||
|   } | ||||
|    | ||||
|  | ||||
|    | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Strided case used by benchmark, but generally unused in Grid | ||||
|   // Keep a code example in double complex, but don't generate the single and real variants for now | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|    | ||||
|   void gemmStridedBatched(int m,int n, int k, | ||||
| 			  ComplexD alpha, | ||||
| 			  ComplexD* Amk,  // pointer list to matrices | ||||
| 			  ComplexD* Bkn, | ||||
| 			  ComplexD beta, | ||||
| 			  ComplexD* Cmn, | ||||
| 			  int batchCount) | ||||
|   { | ||||
|     // Use C-row major storage, so transpose calls | ||||
|     int lda = m; // m x k column major | ||||
|     int ldb = k; // k x n column major | ||||
|     int ldc = m; // m x b column major | ||||
|     int sda = m*k; | ||||
|     int sdb = k*n; | ||||
|     int sdc = m*n; | ||||
|     deviceVector<ComplexD> alpha_p(1); | ||||
|     deviceVector<ComplexD> beta_p(1); | ||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); | ||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); | ||||
|     //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl; | ||||
|     //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl; | ||||
|     //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl; | ||||
| #ifdef GRID_HIP | ||||
|     auto err = hipblasZgemmStridedBatched(gridblasHandle, | ||||
| 					  HIPBLAS_OP_N, | ||||
| 					  HIPBLAS_OP_N, | ||||
| 					  m,n,k, | ||||
| 					  (hipblasDoubleComplex *) &alpha_p[0], | ||||
| 					  (hipblasDoubleComplex *) Amk, lda, sda, | ||||
| 					  (hipblasDoubleComplex *) Bkn, ldb, sdb, | ||||
| 					  (hipblasDoubleComplex *) &beta_p[0], | ||||
| 					  (hipblasDoubleComplex *) Cmn, ldc, sdc, | ||||
| 					  batchCount); | ||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); | ||||
| #endif | ||||
| #ifdef GRID_CUDA | ||||
|     cublasZgemmStridedBatched(gridblasHandle, | ||||
| 			      CUBLAS_OP_N, | ||||
| 			      CUBLAS_OP_N, | ||||
| 			      m,n,k, | ||||
| 			      (cuDoubleComplex *) &alpha_p[0], | ||||
| 			      (cuDoubleComplex *) Amk, lda, sda, | ||||
| 			      (cuDoubleComplex *) Bkn, ldb, sdb, | ||||
| 			      (cuDoubleComplex *) &beta_p[0], | ||||
| 			      (cuDoubleComplex *) Cmn, ldc, sdc, | ||||
| 			      batchCount); | ||||
| #endif | ||||
| #if defined(GRID_SYCL) || defined(GRID_ONE_MKL) | ||||
|     oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, | ||||
| 						oneapi::mkl::transpose::N, | ||||
| 						oneapi::mkl::transpose::N, | ||||
| 						m,n,k, | ||||
| 						alpha, | ||||
| 						(const ComplexD *)Amk,lda,sda, | ||||
| 						(const ComplexD *)Bkn,ldb,sdb, | ||||
| 						beta, | ||||
| 						(ComplexD *)Cmn,ldc,sdc, | ||||
| 						batchCount); | ||||
| #endif | ||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) | ||||
|      // Need a default/reference implementation | ||||
|      for (int p = 0; p < batchCount; ++p) { | ||||
|        for (int mm = 0; mm < m; ++mm) { | ||||
| 	 for (int nn = 0; nn < n; ++nn) { | ||||
| 	   ComplexD c_mn(0.0); | ||||
| 	   for (int kk = 0; kk < k; ++kk) | ||||
| 	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; | ||||
| 	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc]; | ||||
| 	 } | ||||
|        } | ||||
|      } | ||||
| #endif | ||||
|   } | ||||
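A hypothetical host-side use of the strided interface above, for orientation only; the enclosing class name GridBLAS and the helper calls mirror the benchmark() routine below and are assumptions on my part, not part of the commits:

```cpp
// Illustrative sizes only; matrices are zero-initialised for brevity.
int m = 16, n = 16, k = 16, batch = 8;
GridBLAS blas;                                   // assumed class name for the surrounding class
deviceVector<ComplexD> A(m*k*batch); acceleratorMemSet(&A[0], 0, m*k*batch*sizeof(ComplexD));
deviceVector<ComplexD> B(k*n*batch); acceleratorMemSet(&B[0], 0, k*n*batch*sizeof(ComplexD));
deviceVector<ComplexD> C(m*n*batch); acceleratorMemSet(&C[0], 0, m*n*batch*sizeof(ComplexD));

blas.gemmStridedBatched(m, n, k,
                        ComplexD(1.0), &A[0], &B[0],   // A: m x k, B: k x n, column major
                        ComplexD(0.0), &C[0],          // C: m x n
                        batch);
blas.synchronise();                                    // block until the batched GEMM completes
```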
|  | ||||
|   template<class CComplex> | ||||
|   double benchmark(int M, int N, int K, int BATCH) | ||||
|   { | ||||
|     int32_t N_A = M*K*BATCH; | ||||
|     int32_t N_B = K*N*BATCH; | ||||
|     int32_t N_C = M*N*BATCH; | ||||
|     deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); | ||||
|     deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); | ||||
|     deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); | ||||
|     ComplexD alpha(1.0); | ||||
|     ComplexD beta (1.0); | ||||
|     deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex)); | ||||
|     deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex)); | ||||
|     deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex)); | ||||
|     CComplex alpha(1.0); | ||||
|     CComplex beta (1.0); | ||||
|     RealD flops = 8.0*M*N*K*BATCH; | ||||
|     int ncall=10; | ||||
|     int ncall=1000; | ||||
|     deviceVector<CComplex *> As(BATCH); | ||||
|     deviceVector<CComplex *> Bs(BATCH); | ||||
|     deviceVector<CComplex *> Cs(BATCH); | ||||
|     for(int b = 0 ; b < BATCH;b++) { | ||||
|       CComplex *ptr; | ||||
|       ptr = &A[b*M*K];      acceleratorPut(As[b],ptr); | ||||
|       ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr); | ||||
|       ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr); | ||||
|     } | ||||
|  | ||||
|     // Warm up call | ||||
|     gemmBatched(M,N,K, | ||||
| 		alpha, | ||||
| 		As, // m x k  | ||||
| 		Bs, // k x n | ||||
| 		beta,  | ||||
| 		Cs); | ||||
|     synchronise(); | ||||
|  | ||||
|     RealD t0 = usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       gemmStridedBatched(M,N,K, | ||||
| 			 alpha, | ||||
| 			 &A[0], // m x k  | ||||
| 			 &B[0], // k x n | ||||
| 			 beta,  | ||||
| 			 &C[0], // m x n | ||||
| 			 BATCH); | ||||
|       gemmBatched(M,N,K, | ||||
| 		  alpha, | ||||
| 		  As, // m x k  | ||||
| 		  Bs, // k x n | ||||
| 		  beta,  | ||||
| 		  Cs); | ||||
|       synchronise(); | ||||
|     } | ||||
|     synchronise(); | ||||
|     RealD t1 = usecond(); | ||||
|     RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH; | ||||
|     RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH; | ||||
|     flops = 8.0*M*N*K*BATCH*ncall; | ||||
|     flops = flops/(t1-t0)/1.e3; | ||||
|     return flops; // Returns gigaflops | ||||
|   } | ||||
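Since usecond() returns microseconds, flops/(t1-t0) is flops per microsecond, i.e. MFLOP/s, and the final division by 1.e3 converts to GFLOP/s. A hypothetical driver for the templated benchmark above; the class name GridBLAS and the ComplexD/ComplexF types are taken from the surrounding file, and the problem sizes are illustrative only, not part of the commits:

```cpp
GridBLAS blas;
double zgemm_gflops = blas.benchmark<ComplexD>(64, 64, 64, 256);  // double-precision complex
double cgemm_gflops = blas.benchmark<ComplexF>(64, 64, 64, 256);  // single-precision complex
std::cout << "ZGEMM " << zgemm_gflops << " GF/s,  CGEMM " << cgemm_gflops << " GF/s" << std::endl;
```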
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -279,11 +279,11 @@ public: | ||||
|       Qt = Eigen::MatrixXcd::Identity(Nm,Nm); | ||||
|       diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); | ||||
|       _sort.push(eval2,Nm); | ||||
|       //      Glog << "#Ritz value before shift: "<< std::endl; | ||||
|       Glog << "#Ritz value before shift: "<< std::endl; | ||||
|       for(int i=0; i<Nm; ++i){ | ||||
| 	//        std::cout.precision(13); | ||||
| 	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
| 	std::cout.precision(13); | ||||
| 	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
|       } | ||||
|        | ||||
|       //---------------------------------------------------------------------- | ||||
| @@ -297,7 +297,8 @@ public: | ||||
|          | ||||
|         unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM); | ||||
|  | ||||
|         for(int ip=Nk; ip<Nm; ++ip){  | ||||
|         for(int ip=Nk; ip<Nm; ++ip){ | ||||
| 	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl; | ||||
|           shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q); | ||||
|         } | ||||
|          | ||||
| @@ -325,7 +326,7 @@ public: | ||||
|         Qt = Eigen::MatrixXcd::Identity(Nm,Nm); | ||||
|         diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); | ||||
|         _sort.push(eval2,Nk); | ||||
| 	//        Glog << "#Ritz value after shift: "<< std::endl; | ||||
| 	Glog << "#Ritz value after shift: "<< std::endl; | ||||
|         for(int i=0; i<Nk; ++i){ | ||||
| 	  //          std::cout.precision(13); | ||||
| 	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| @@ -467,10 +468,10 @@ public: | ||||
|    | ||||
|     // set initial vector | ||||
|     for (int i=0; i<Nu; ++i) { | ||||
|       //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; | ||||
|       Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; | ||||
|       evec[i] = src[i]; | ||||
|       orthogonalize(evec[i],evec,i); | ||||
|       //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; | ||||
|       Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; | ||||
|     } | ||||
| //    exit(-43); | ||||
|      | ||||
| @@ -506,11 +507,11 @@ public: | ||||
|       Qt = Eigen::MatrixXcd::Identity(Nr,Nr); | ||||
|       diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid); | ||||
|       _sort.push(eval2,Nr); | ||||
|       //      Glog << "#Ritz value: "<< std::endl; | ||||
|       Glog << "#Ritz value: "<< std::endl; | ||||
|       for(int i=0; i<Nr; ++i){ | ||||
| 	//        std::cout.precision(13); | ||||
| 	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
| 	std::cout.precision(13); | ||||
| 	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||
| 	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; | ||||
|       } | ||||
|        | ||||
|       // Convergence test | ||||
| @@ -570,6 +571,7 @@ public: | ||||
|       Glog << fname + " NOT converged ; Summary :\n"; | ||||
|     } else { | ||||
|       Glog << fname + " CONVERGED ; Summary :\n"; | ||||
|       Nstop = Nconv_guess; // Just take them all | ||||
|       // Sort converged eigenpairs. | ||||
|       std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating | ||||
|  | ||||
| @@ -642,7 +644,7 @@ private: | ||||
|       //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; | ||||
|       k_start +=mrhs; | ||||
|     } | ||||
|     //    Glog << "LinAlg "<< std::endl; | ||||
|     Glog << "LinAlg "<< std::endl; | ||||
|      | ||||
|     if (b>0) { | ||||
|       for (int u=0; u<Nu; ++u) { | ||||
| @@ -676,7 +678,7 @@ private: | ||||
|       } | ||||
|       w_copy[u] = w[u]; | ||||
|     } | ||||
|     //    Glog << "LinAlg done"<< std::endl; | ||||
|     Glog << "LinAlg done"<< std::endl; | ||||
|      | ||||
|     // In block version, the steps 6 and 7 in Lanczos construction is | ||||
|     // replaced by the QR decomposition of new basis block. | ||||
| @@ -689,15 +691,15 @@ private: | ||||
|     } | ||||
|  | ||||
|     // re-orthogonalization for numerical stability | ||||
|     //    Glog << "Gram Schmidt"<< std::endl; | ||||
|     Glog << "Gram Schmidt"<< std::endl; | ||||
|     orthogonalize(w,Nu,evec,R); | ||||
|     // QR part | ||||
|     for (int u=1; u<Nu; ++u) { | ||||
|       orthogonalize(w[u],w,u); | ||||
|     } | ||||
|     //    Glog << "Gram Schmidt done "<< std::endl; | ||||
|     Glog << "Gram Schmidt done "<< std::endl; | ||||
|      | ||||
|     //    Glog << "LinAlg "<< std::endl; | ||||
|     Glog << "LinAlg "<< std::endl; | ||||
|     for (int u=0; u<Nu; ++u) { | ||||
|       //for (int v=0; v<Nu; ++v) { | ||||
|       for (int v=u; v<Nu; ++v) { | ||||
| @@ -714,7 +716,7 @@ private: | ||||
| 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; | ||||
|       } | ||||
|     } | ||||
|     //    Glog << "LinAlg done "<< std::endl; | ||||
|     Glog << "LinAlg done "<< std::endl; | ||||
|  | ||||
|     if (b < Nm/Nu-1) { | ||||
|       for (int u=0; u<Nu; ++u) { | ||||
| @@ -779,7 +781,7 @@ private: | ||||
|      | ||||
|     for ( int u=0; u<Nu; ++u ) { | ||||
|       for (int k=0; k<Nk; ++k ) { | ||||
| //        Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl; | ||||
| 	//	Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl; | ||||
|         BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k]; | ||||
|       } | ||||
|     } | ||||
| @@ -933,7 +935,7 @@ if (1){ | ||||
|          int Nu, int Nb, int Nk, int Nm, | ||||
|          Eigen::MatrixXcd& M) | ||||
|   { | ||||
|     //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';  | ||||
|     Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';  | ||||
|     assert( Nk%Nu == 0 && Nm%Nu == 0 ); | ||||
|     assert( Nk <= Nm ); | ||||
|     M = Eigen::MatrixXcd::Zero(Nk,Nk); | ||||
| @@ -951,7 +953,7 @@ if (1){ | ||||
|         M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; | ||||
|       } | ||||
|     } | ||||
|     //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;  | ||||
|     Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;  | ||||
|   } | ||||
|   | ||||
|  | ||||
| @@ -961,7 +963,7 @@ if (1){ | ||||
|          int Nu, int Nb, int Nk, int Nm, | ||||
|          Eigen::MatrixXcd& M) | ||||
|   { | ||||
|     //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';  | ||||
|     Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';  | ||||
|     assert( Nk%Nu == 0 && Nm%Nu == 0 ); | ||||
|     assert( Nk <= Nm ); | ||||
|      | ||||
| @@ -977,7 +979,7 @@ if (1){ | ||||
|         lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); | ||||
|       } | ||||
|     } | ||||
|     //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;  | ||||
|     Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;  | ||||
|   } | ||||
|  | ||||
|  | ||||
| @@ -986,7 +988,7 @@ if (1){ | ||||
| 		            RealD Dsh, | ||||
| 		            Eigen::MatrixXcd& Qprod) | ||||
|   { | ||||
|     //Glog << "shiftedQRDecompEigen() begin" << '\n';  | ||||
|     Glog << "shiftedQRDecompEigen() begin" << '\n';  | ||||
|     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
| @@ -1002,6 +1004,7 @@ if (1){ | ||||
|                         // lower triangular part used to represent series | ||||
|                         // of Q sequence. | ||||
|  | ||||
|     Glog << "shiftedQRDecompEigen() Householder & QR" << '\n';  | ||||
|     // equivalent operation of Qprod *= Q | ||||
|     //M = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|      | ||||
| @@ -1022,6 +1025,7 @@ if (1){ | ||||
|      | ||||
|     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); | ||||
|  | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';  | ||||
|     for (int i=0; i<Nm; ++i) { | ||||
|       for (int j=0; j<Nm-(Nu+1); ++j) { | ||||
|         for (int k=0; k<Nu+1+j; ++k) { | ||||
| @@ -1029,6 +1033,7 @@ if (1){ | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';  | ||||
|     for (int i=0; i<Nm; ++i) { | ||||
|       for (int j=Nm-(Nu+1); j<Nm; ++j) { | ||||
|         for (int k=0; k<Nm; ++k) { | ||||
| @@ -1036,6 +1041,7 @@ if (1){ | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';  | ||||
|      | ||||
|     //static int ntimes = 2; | ||||
|     //for (int j=0; j<Nm-(ntimes*Nu); ++j) { | ||||
| @@ -1061,11 +1067,13 @@ if (1){ | ||||
|         Mtmp(j,i) = conj(Mtmp(i,j)); | ||||
|       } | ||||
|     } | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';  | ||||
|  | ||||
|     for (int i=0; i<Nm; ++i) { | ||||
|       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; | ||||
|     } | ||||
|      | ||||
|     Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';  | ||||
|     M = Mtmp; | ||||
|  | ||||
|     //M = Q.adjoint()*(M*Q); | ||||
| @@ -1077,7 +1085,7 @@ if (1){ | ||||
|     //  } | ||||
|     //} | ||||
|      | ||||
|     //Glog << "shiftedQRDecompEigen() end" << endl;  | ||||
|     Glog << "shiftedQRDecompEigen() end" <<std::endl;  | ||||
|   } | ||||
|  | ||||
|   void exampleQRDecompEigen(void) | ||||
|   | ||||
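For orientation (not part of the commits): the restart step that shiftedQRDecompEigen performs in the hunks above is the standard shifted QR iteration. With T the block tridiagonal matrix and theta the shift Dsh,

```latex
T - \theta I = Q R, \qquad
T' \;=\; Q^{\dagger} (T - \theta I)\, Q + \theta I \;=\; R\, Q + \theta I \;=\; Q^{\dagger} T\, Q ,
```

so T' is unitarily similar to T while components along the unwanted Ritz value theta are damped out of the restarted Krylov space; Qprod accumulates the product of the successive Q factors.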
| @@ -499,6 +499,87 @@ namespace Grid { | ||||
|       } | ||||
|   }; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Site diagonal is identity, left preconditioned by Mee^inv | ||||
|   // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta | ||||
|   // | ||||
|   // Solve: | ||||
|   // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta | ||||
|   // | ||||
|   // Old notation e<->o | ||||
|   // | ||||
|   // Left precon by Moo^-1 | ||||
|   //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o | ||||
|   //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e) | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> { | ||||
|   public: | ||||
|     typedef CheckerBoardedSparseMatrixBase<Field> Matrix; | ||||
|  | ||||
|     ///////////////////////////////////////////////////// | ||||
|     // Wrap the usual normal equations Schur trick | ||||
|     ///////////////////////////////////////////////////// | ||||
|   SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false, | ||||
|       const bool _solnAsInitGuess = false)   | ||||
|     : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}; | ||||
|  | ||||
|     virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) | ||||
|     { | ||||
|       GridBase *grid = _Matrix.RedBlackGrid(); | ||||
|       GridBase *fgrid= _Matrix.Grid(); | ||||
|  | ||||
|       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|        | ||||
|       Field   tmp(grid); | ||||
|       Field  Mtmp(grid); | ||||
|  | ||||
|       pickCheckerboard(Even,src_e,src); | ||||
|       pickCheckerboard(Odd ,src_o,src); | ||||
|      | ||||
|       ///////////////////////////////////////////////////// | ||||
|       // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) | ||||
|       ///////////////////////////////////////////////////// | ||||
|       _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even); | ||||
|       _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);      | ||||
|       Mtmp=src_o-Mtmp;                  | ||||
|       _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);      | ||||
|        | ||||
|       // get the right MpcDag | ||||
|       _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);        | ||||
|     } | ||||
|  | ||||
|     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) | ||||
|     { | ||||
|       GridBase *grid = _Matrix.RedBlackGrid(); | ||||
|       GridBase *fgrid= _Matrix.Grid(); | ||||
|  | ||||
|       Field   tmp(grid); | ||||
|       Field   sol_e(grid); | ||||
|  | ||||
|  | ||||
|       /////////////////////////////////////////////////// | ||||
|       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... | ||||
|       /////////////////////////////////////////////////// | ||||
|       _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even); | ||||
|       tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even); | ||||
|       _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even); | ||||
|       | ||||
|       setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even); | ||||
|       setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd ); | ||||
|     }; | ||||
|  | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) | ||||
|     { | ||||
|       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); | ||||
|     }; | ||||
|     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) | ||||
|     { | ||||
|       SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix); | ||||
|       this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  | ||||
|     } | ||||
|   }; | ||||
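Restating the preconditioning that SchurRedBlackDiagOneSolve above implements, as a sketch based on the class comment and the RedBlackSource/RedBlackSolution bodies (the even/odd labelling follows the "old notation e<->o" caveat in the comment):

```latex
D_{oo} = M_{oo} - M_{oe} M_{ee}^{-1} M_{eo}, \qquad
M_{pc} = M_{oo}^{-1} D_{oo},
\\[4pt]
M_{pc}^{\dagger} M_{pc}\, \psi_o
  \;=\; M_{pc}^{\dagger}\, M_{oo}^{-1}\!\left(\eta_o - M_{oe} M_{ee}^{-1} \eta_e\right)
  \;\equiv\; \eta_o' ,
\qquad
\psi_e = M_{ee}^{-1}\!\left(\eta_e - M_{eo}\,\psi_o\right).
```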
|  | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Site diagonal is identity, right preconditioned by Mee^inv | ||||
|   // ( 1 - Meo Moo^inv Moe Mee^inv ) phi = ( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta | ||||
|   | ||||
| @@ -54,6 +54,9 @@ public: | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||
|     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||
|       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||
|     } | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
| @@ -100,6 +103,9 @@ public: | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||
|     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||
|       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||
|     } | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
| @@ -145,6 +151,9 @@ public: | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); | ||||
|     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||
|       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||
|     } | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
|   | ||||
| @@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid); | ||||
| uint64_t total_shared; | ||||
| uint64_t total_device; | ||||
| uint64_t total_host;; | ||||
|  | ||||
| #if defined(__has_feature) | ||||
| #if __has_feature(leak_sanitizer) | ||||
| #define ASAN_LEAK_CHECK | ||||
| #endif | ||||
| #endif | ||||
|  | ||||
| #ifdef ASAN_LEAK_CHECK | ||||
| #include <sanitizer/asan_interface.h> | ||||
| #include <sanitizer/common_interface_defs.h> | ||||
| #include <sanitizer/lsan_interface.h> | ||||
| #define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } | ||||
| #else | ||||
| #define LEAK_CHECK(A) { } | ||||
| #endif | ||||
|  | ||||
| void MemoryManager::DisplayMallinfo(void) | ||||
| { | ||||
| #ifdef __linux__ | ||||
|   struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform | ||||
|    | ||||
|   mi = mallinfo(); | ||||
|  | ||||
|   std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl; | ||||
|   std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl; | ||||
|   std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl; | ||||
|   std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl; | ||||
|   std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl; | ||||
|   std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl; | ||||
| #endif | ||||
|   LEAK_CHECK(); | ||||
|   | ||||
| } | ||||
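The comment above notes that mallinfo2 would be preferable but glibc versions are not uniform. A hypothetical guarded variant (not in the commits; the GRID_HAVE_MALLINFO2 macro and the function name are mine), assuming a glibc-based Linux system:

```cpp
// Prefer mallinfo2() on glibc >= 2.33, since mallinfo()'s int fields overflow past 2 GB.
#include <malloc.h>
#include <cstdio>

#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2,33)
#define GRID_HAVE_MALLINFO2 1
#endif
#endif

void DisplayMallinfoSketch(void)
{
#ifdef GRID_HAVE_MALLINFO2
  struct mallinfo2 mi = mallinfo2();   // size_t fields, no 2 GB overflow
#else
  struct mallinfo  mi = mallinfo();    // legacy int fields
#endif
  std::printf("arena %zu  uordblks %zu  fordblks %zu  hblkhd %zu\n",
              (size_t)mi.arena, (size_t)mi.uordblks,
              (size_t)mi.fordblks, (size_t)mi.hblkhd);
}
```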
|  | ||||
| void MemoryManager::PrintBytes(void) | ||||
| { | ||||
|   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; | ||||
| @@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void) | ||||
| #ifdef GRID_CUDA | ||||
|   cuda_mem(); | ||||
| #endif | ||||
|    | ||||
|   DisplayMallinfo(); | ||||
| } | ||||
|  | ||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||
|   | ||||
| @@ -211,6 +211,7 @@ private: | ||||
| #endif | ||||
|  | ||||
|  public: | ||||
|   static void DisplayMallinfo(void); | ||||
|   static void NotifyDeletion(void * CpuPtr); | ||||
|   static void Print(void); | ||||
|   static void PrintAll(void); | ||||
|   | ||||
| @@ -91,6 +91,7 @@ public: | ||||
|   //////////////////////////////////////////////////////////////// | ||||
|   virtual int CheckerBoarded(int dim)=0; | ||||
|   virtual int CheckerBoard(const Coordinate &site)=0; | ||||
|   virtual int CheckerDim(void){ return 0; }; | ||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||
|   | ||||
| @@ -60,6 +60,7 @@ public: | ||||
|   int              _checker_dim; | ||||
|   std::vector<int> _checker_board; | ||||
|  | ||||
|   virtual int CheckerDim(void){ return _checker_dim; }; | ||||
|   virtual int CheckerBoarded(int dim){ | ||||
|     if( dim==_checker_dim) return 1; | ||||
|     else return 0; | ||||
|   | ||||
| @@ -236,11 +236,18 @@ public: | ||||
|   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ | ||||
|     vobj vtmp; | ||||
|     vtmp = r; | ||||
| #if 1 | ||||
|     auto me  = View(CpuWrite); | ||||
|     thread_for(ss,me.size(),{ | ||||
|        me[ss]= r; | ||||
|       }); | ||||
| #else     | ||||
|     auto me  = View(AcceleratorWrite); | ||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||
| 	auto stmp=coalescedRead(vtmp); | ||||
| 	coalescedWrite(me[ss],stmp); | ||||
|     }); | ||||
| #endif     | ||||
|     me.ViewClose(); | ||||
|     return *this; | ||||
|   } | ||||
|   | ||||
| @@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
|   const uint64_t sites = grid->oSites(); | ||||
|    | ||||
|   // Might make all code paths go this way. | ||||
| #if 0 | ||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|   { | ||||
|     autoView( left_v , left, AcceleratorRead); | ||||
|     autoView( right_v,right, AcceleratorRead); | ||||
|     // This code could read coalesce | ||||
|     // GPU - SIMT lane compliance... | ||||
|     accelerator_for( ss, sites, nsimd,{ | ||||
| 	auto x_l = left_v(ss); | ||||
| 	auto y_l = right_v(ss); | ||||
| 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); | ||||
|     }); | ||||
|   } | ||||
| #else | ||||
|   typedef decltype(innerProduct(vobj(),vobj())) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   deviceVector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|      | ||||
|   { | ||||
| @@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
| 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||
|     }); | ||||
|   } | ||||
| #endif | ||||
|   // This is in single precision and fails some tests | ||||
|   auto anrm = sumD(inner_tmp_v,sites);   | ||||
|   nrm = anrm; | ||||
| @@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | ||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); | ||||
| #else | ||||
|   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   deviceVector<inner_t> inner_tmp; | ||||
|   inner_tmp.resize(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|  | ||||
|   accelerator_for( ss, sites, nsimd,{ | ||||
|   | ||||
| @@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_objectD sobjD; | ||||
|   sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); | ||||
|   static Vector<sobj> mysum; | ||||
|   mysum.resize(1); | ||||
|   sobj *mysum_p = & mysum[0]; | ||||
|   sobj identity; zeroit(identity); | ||||
|   mysum[0] = identity; | ||||
|   sobj ret ;  | ||||
|  | ||||
|   Integer nsimd= vobj::Nsimd(); | ||||
|    | ||||
|  | ||||
|   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|      auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); | ||||
|     auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); | ||||
|      cgh.parallel_for(cl::sycl::range<1>{osites}, | ||||
| 		      Reduction, | ||||
| 		      [=] (cl::sycl::id<1> item, auto &sum) { | ||||
| @@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | ||||
|    }); | ||||
|   theGridAccelerator->wait(); | ||||
|   ret = mysum[0]; | ||||
|   free(mysum,*theGridAccelerator); | ||||
|   //  free(mysum,*theGridAccelerator); | ||||
|   sobjD dret; convertType(dret,ret); | ||||
|   return dret; | ||||
| } | ||||
| @@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | ||||
| template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||
| { | ||||
|   Word xorResult; xorResult = 0; | ||||
|   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); | ||||
|   static Vector<Word> d_sum; | ||||
|   d_sum.resize(1); | ||||
|   Word *d_sum_p=&d_sum[0]; | ||||
|   Word identity;  identity=0; | ||||
|   d_sum[0] = identity; | ||||
|   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); | ||||
|     auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); | ||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||
| 		      Reduction, | ||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||
| 	 sum ^=vec[index]; | ||||
| 	 sum^=vec[index]; | ||||
|      }); | ||||
|    }); | ||||
|   theGridAccelerator->wait(); | ||||
|   Word ret = d_sum[0]; | ||||
|   free(d_sum,*theGridAccelerator); | ||||
|   //  free(d_sum,*theGridAccelerator); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
|   | ||||
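Editor's note on the two reduction hunks above: they replace a per-call malloc_shared result with a persistent allocation and pass initialize_to_identity so the reduction slot is zeroed on the device instead of being reset from the host. A minimal, self-contained sketch of that SYCL 2020 pattern (illustrative only: plain sycl:: namespace and a toy integer sum, not Grid code):

#include <sycl/sycl.hpp>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  sycl::queue q;
  const size_t N = 1024;

  std::vector<int> host(N, 1);
  int *data = sycl::malloc_device<int>(N, q);
  int *sum  = sycl::malloc_shared<int>(1, q);   // persistent result slot, reusable across calls
  q.memcpy(data, host.data(), N * sizeof(int)).wait();

  int identity = 0;
  const sycl::property_list props{sycl::property::reduction::initialize_to_identity()};

  q.submit([&](sycl::handler &cgh) {
    // initialize_to_identity makes the runtime zero *sum, so no host-side reset is needed
    auto red = sycl::reduction(sum, identity, std::plus<int>(), props);
    cgh.parallel_for(sycl::range<1>{N}, red,
                     [=](sycl::id<1> i, auto &s) { s += data[i]; });
  });
  q.wait();

  std::cout << "sum = " << *sum << std::endl;   // expect 1024
  sycl::free(data, q);
  sycl::free(sum, q);
  return 0;
}

Keeping the result in a long-lived allocation, as the diff does with a static Vector, also avoids a shared-memory allocate/free on every reduction call.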
| @@ -1,5 +1,5 @@ | ||||
| #pragma once | ||||
| #include <type_traits> | ||||
|  | ||||
| #if defined(GRID_CUDA) | ||||
|  | ||||
| #include <cub/cub.cuh> | ||||
| @@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | ||||
|    | ||||
|  | ||||
| } | ||||
| #endif  | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||
|  | ||||
| #if defined(GRID_SYCL) | ||||
| template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||
| { | ||||
|   size_t subvol_size = e1*e2; | ||||
|  | ||||
|   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); | ||||
|   vobj vobj_zero; | ||||
|   zeroit(vobj_zero); | ||||
|   for (int r = 0; r<rd; r++) {  | ||||
|     mysum[r] = vobj_zero;  | ||||
|   } | ||||
|  | ||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||
|  | ||||
|   auto rb_p = &reduction_buffer[0]; | ||||
|  | ||||
|   // autoView(Data_v, Data, AcceleratorRead); | ||||
|  | ||||
|   //prepare reduction buffer  | ||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||
|    | ||||
|       int n = s / e2; | ||||
|       int b = s % e2; | ||||
|       int so=r*ostride; // base offset for start of plane  | ||||
|       int ss= so+n*stride+b; | ||||
|  | ||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||
|  | ||||
|   }); | ||||
|  | ||||
|   for (int r = 0; r < rd; r++) { | ||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); | ||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||
|           Reduction, | ||||
|           [=](cl::sycl::id<1> item, auto &sum) { | ||||
|               auto s = item[0]; | ||||
|               sum += rb_p[r*subvol_size+s]; | ||||
|           }); | ||||
|       }); | ||||
|        | ||||
|       | ||||
|   } | ||||
|   theGridAccelerator->wait(); | ||||
|   for (int r = 0; r < rd; r++) { | ||||
|     lvSum[r] = mysum[r]; | ||||
|   } | ||||
|   free(mysum,*theGridAccelerator); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||
|   typedef typename vobj::vector_type vector; | ||||
|   const int words = sizeof(vobj)/sizeof(vector); | ||||
|   const int osites = rd*e1*e2; | ||||
| @@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | ||||
| 	    buf[ss] = dat[ss*words+w]; | ||||
|     }); | ||||
|  | ||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|        | ||||
|     #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|     #elif defined(GRID_SYCL) | ||||
|       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||
|     #endif | ||||
|  | ||||
|     for (int r = 0; r < rd; r++) { | ||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||
|     } | ||||
| @@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | ||||
|    | ||||
| } | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||
| template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||
| { | ||||
|   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. | ||||
|   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||
|     if constexpr (sizeof(vobj) <= 256) {  | ||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|  | ||||
|       #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|       #elif defined (GRID_SYCL) | ||||
|         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|       #endif | ||||
|  | ||||
|     } | ||||
|     else { | ||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|     } | ||||
| } | ||||
| #endif | ||||
|  | ||||
|  | ||||
| #if defined(GRID_SYCL) | ||||
| template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   size_t subvol_size = e1*e2; | ||||
|  | ||||
|   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); | ||||
|   vobj vobj_zero; | ||||
|   zeroit(vobj_zero); | ||||
|      | ||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||
|  | ||||
|   auto rb_p = &reduction_buffer[0]; | ||||
|  | ||||
|   autoView(Data_v, Data, AcceleratorRead); | ||||
|  | ||||
|   //prepare reduction buffer  | ||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||
|    | ||||
|       int n = s / e2; | ||||
|       int b = s % e2; | ||||
|       int so=r*ostride; // base offset for start of plane  | ||||
|       int ss= so+n*stride+b; | ||||
|  | ||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); | ||||
|  | ||||
|   }); | ||||
|  | ||||
|   for (int r = 0; r < rd; r++) { | ||||
|       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable | ||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); | ||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||
|           Reduction, | ||||
|           [=](cl::sycl::id<1> item, auto &sum) { | ||||
|               auto s = item[0]; | ||||
|               sum += rb_p[r*subvol_size+s]; | ||||
|           }); | ||||
|       }); | ||||
|       theGridAccelerator->wait(); | ||||
|       lvSum[r] = mysum[0]; | ||||
|   } | ||||
|    | ||||
|   free(mysum,*theGridAccelerator); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||
| { | ||||
|   // sum over reduced dimension planes, breaking out orthog dir | ||||
| @@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | ||||
|  | ||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||
| { | ||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||
|    | ||||
|   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|    | ||||
|   #elif defined(GRID_SYCL) | ||||
|    | ||||
|   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|    | ||||
|   #else | ||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||
|   | ||||
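Editor's note: the dispatch above is what Grid's public sliceSum interface lands on for each accelerator target. A minimal caller could look like the sketch below (standard Grid initialisation assumed; illustrative, not part of the diff):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian Grid4(GridDefaultLatt(),
                      GridDefaultSimd(Nd, vComplex::Nsimd()),
                      GridDefaultMpi());

  LatticeComplex field(&Grid4);
  field = ComplexD(1.0, 0.0);            // unit value on every site

  std::vector<TComplex> slices;
  sliceSum(field, slices, Tdir);         // one partial sum per time slice

  std::cout << GridLogMessage << "t=0 slice sum = " << slices[0] << std::endl;

  Grid_finalize();
  return 0;
}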
| @@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | ||||
|     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);  | ||||
|   } | ||||
| } | ||||
|  | ||||
|   | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // remove and insert a half checkerboard | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | ||||
| { | ||||
|   half.Checkerboard() = cb; | ||||
|  | ||||
|   autoView( half_v, half, CpuWrite); | ||||
|   autoView( full_v, full, CpuRead); | ||||
|   thread_for(ss, full.Grid()->oSites(),{ | ||||
|     int cbos; | ||||
|     Coordinate coor; | ||||
|     full.Grid()->oCoorFromOindex(coor,ss); | ||||
|     cbos=half.Grid()->CheckerBoard(coor); | ||||
|  | ||||
|     if (cbos==cb) { | ||||
|       int ssh=half.Grid()->oIndex(coor); | ||||
|       half_v[ssh] = full_v[ss]; | ||||
|     } | ||||
|   }); | ||||
|   acceleratorPickCheckerboard(cb,half,full); | ||||
| } | ||||
| template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | ||||
| { | ||||
|   int cb = half.Checkerboard(); | ||||
|   autoView( half_v , half, CpuRead); | ||||
|   autoView( full_v , full, CpuWrite); | ||||
|   thread_for(ss,full.Grid()->oSites(),{ | ||||
|  | ||||
|     Coordinate coor; | ||||
|     int cbos; | ||||
|  | ||||
|     full.Grid()->oCoorFromOindex(coor,ss); | ||||
|     cbos=half.Grid()->CheckerBoard(coor); | ||||
|        | ||||
|     if (cbos==cb) { | ||||
|       int ssh=half.Grid()->oIndex(coor); | ||||
|       full_v[ss]=half_v[ssh]; | ||||
|     } | ||||
|   }); | ||||
|   acceleratorSetCheckerboard(full,half); | ||||
| } | ||||
|  | ||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0) | ||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0) | ||||
| { | ||||
|   half.Checkerboard() = cb; | ||||
|   autoView(half_v, half, AcceleratorWrite); | ||||
| @@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | ||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||
|   Coordinate ostride_half          = half.Grid()->_ostride; | ||||
|   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||
|   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||
|      | ||||
|     Coordinate coor; | ||||
| @@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | ||||
|     } | ||||
|   }); | ||||
| } | ||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0) | ||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0) | ||||
| { | ||||
|   int cb = half.Checkerboard(); | ||||
|   autoView(half_v , half, AcceleratorRead); | ||||
| @@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full, | ||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||
|   Coordinate ostride_half          = half.Grid()->_ostride; | ||||
|   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||
|   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||
|  | ||||
|     Coordinate coor; | ||||
|   | ||||
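Editor's note: since pickCheckerboard and setCheckerboard now forward to the accelerator kernels above, a quick round trip is a natural exercise of the new path. A sketch using standard Grid API (illustrative, not part of the diff):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian         *FGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()),
                                     GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);

  LatticeComplex full(FGrid), check(FGrid);
  LatticeComplex even(FrbGrid), odd(FrbGrid);

  GridParallelRNG RNG(FGrid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  random(RNG, full);

  pickCheckerboard(Even, even, full);    // half-lattice extraction (accelerator path)
  pickCheckerboard(Odd,  odd,  full);
  setCheckerboard(check, even);          // reassemble the full lattice
  setCheckerboard(check, odd);

  check = check - full;
  std::cout << GridLogMessage << "round-trip norm2 = " << norm2(check) << std::endl; // expect 0

  Grid_finalize();
  return 0;
}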
| @@ -90,6 +90,7 @@ public: | ||||
|         exit(1); | ||||
|       } | ||||
|       Parameters.StartingType = arg; | ||||
|       std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { | ||||
| @@ -97,6 +98,7 @@ public: | ||||
|       std::vector<int> ivec(0); | ||||
|       GridCmdOptionIntVector(arg, ivec); | ||||
|       Parameters.StartTrajectory = ivec[0]; | ||||
|       std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { | ||||
| @@ -104,6 +106,7 @@ public: | ||||
|       std::vector<int> ivec(0); | ||||
|       GridCmdOptionIntVector(arg, ivec); | ||||
|       Parameters.Trajectories = ivec[0]; | ||||
|       std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl; | ||||
|     } | ||||
|  | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { | ||||
| @@ -111,6 +114,7 @@ public: | ||||
|       std::vector<int> ivec(0); | ||||
|       GridCmdOptionIntVector(arg, ivec); | ||||
|       Parameters.NoMetropolisUntil = ivec[0]; | ||||
|       std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl; | ||||
|     } | ||||
|     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { | ||||
|       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); | ||||
|   | ||||
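Editor's note: with the extra echoes added above, the effect of a command line such as --StartingType CheckpointStart --StartingTrajectory 3000 --Trajectories 10 --Thermalizations 0 (values illustrative) is now visible directly in the job log rather than only in the parameter dump.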
| @@ -137,9 +137,11 @@ public: | ||||
|  | ||||
|       double start_force = usecond(); | ||||
|  | ||||
|       MemoryManager::Print(); | ||||
|       as[level].actions.at(a)->deriv_timer_start(); | ||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||
|       as[level].actions.at(a)->deriv_timer_stop(); | ||||
|       MemoryManager::Print(); | ||||
|  | ||||
|       auto name = as[level].actions.at(a)->action_name(); | ||||
|  | ||||
| @@ -246,7 +248,11 @@ public: | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   virtual ~Integrator() {} | ||||
|   virtual ~Integrator() | ||||
|   { | ||||
|     // Pain in the ass to clean up the Level pointers | ||||
|     // Guido's design is at fault as per comment above in constructor | ||||
|   } | ||||
|  | ||||
|   virtual std::string integrator_name() = 0; | ||||
|    | ||||
| @@ -460,6 +466,7 @@ public: | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||
|  | ||||
| 	MemoryManager::Print(); | ||||
|         // get gauge field from the SmearingPolicy and | ||||
|         // based on the boolean is_smeared in actionID | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||
| @@ -468,6 +475,7 @@ public: | ||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
| 	MemoryManager::Print(); | ||||
|  | ||||
|       } | ||||
|       as[level].apply(S_hireps, Representations, level, H); | ||||
|   | ||||
| @@ -32,7 +32,9 @@ private: | ||||
|   //  Smear_Stout<Gimpl> *StoutSmearing; | ||||
|   //  std::vector<GaugeField> SmearedSet; | ||||
|    | ||||
|   GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object | ||||
|   std::vector<LatticeLorentzComplex> masks; | ||||
|   std::vector<int> cbs; | ||||
|  | ||||
|   typedef typename SU3Adjoint::AMatrix AdjMatrix; | ||||
|   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; | ||||
| @@ -147,6 +149,25 @@ private: | ||||
|     } | ||||
|     pokeLorentz(Fdet, Fdet_pol, nu); | ||||
|   } | ||||
|  | ||||
|   void Compute_MpInvJx_dNxxdSy(int cb, | ||||
| 			       const GaugeLinkField &PlaqL, | ||||
| 			       const GaugeLinkField &PlaqR, | ||||
| 			       AdjMatrixField MpInvJx, | ||||
| 			       AdjVectorField &Fdet2 ) | ||||
|   { | ||||
|     GaugeLinkField PlaqLeo(UrbGrid); | ||||
|     GaugeLinkField PlaqReo(UrbGrid); | ||||
|     AdjMatrixField MpInvJxeo(UrbGrid); | ||||
|     AdjVectorField Fdet2eo(UrbGrid); | ||||
|     pickCheckerboard(cb,PlaqLeo,PlaqL); | ||||
|     pickCheckerboard(cb,PlaqReo,PlaqR); | ||||
|     pickCheckerboard(cb,MpInvJxeo,MpInvJx); | ||||
|     Fdet2eo.Checkerboard()=cb; | ||||
|     Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo); | ||||
|     setCheckerboard(Fdet2,Fdet2eo); | ||||
|   } | ||||
|    | ||||
|   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) | ||||
|   { | ||||
|     GaugeLinkField UtaU(PlaqL.Grid()); | ||||
| @@ -278,8 +299,9 @@ public: | ||||
|     //////////////////////////////////////////////////////////////////////////////// | ||||
|     // Mask the gauge field | ||||
|     //////////////////////////////////////////////////////////////////////////////// | ||||
|     int cb = cbs[smr]; | ||||
|     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask | ||||
|  | ||||
|      | ||||
|     Umsk = U; | ||||
|     ApplyMask(Umsk,smr); | ||||
|     Utmp = peekLorentz(Umsk,mu); | ||||
| @@ -442,7 +464,7 @@ public: | ||||
|     AdjMatrixField MpInvJx_nu(grid); | ||||
|     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor | ||||
|  | ||||
|     Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | ||||
|     Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||
|     Fdet2_mu=FdetV; | ||||
|     Fdet1_mu=Zero(); | ||||
|      | ||||
| @@ -499,7 +521,7 @@ public: | ||||
|  | ||||
| 	time=-usecond(); | ||||
| 	PlaqR=(-1.0)*PlaqR; | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||
| 	Fdet2_nu = FdetV; | ||||
| 	time+=usecond(); | ||||
| 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | ||||
| @@ -520,7 +542,7 @@ public: | ||||
| 	 | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | ||||
| 	 | ||||
| 	///////////////// -ve nu ///////////////// | ||||
| @@ -539,7 +561,7 @@ public: | ||||
| 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | ||||
| 	 | ||||
| 	// x== | ||||
| @@ -560,7 +582,7 @@ public: | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||
| 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | ||||
|  | ||||
| 	///////////////////////////////////////////////////////////////////// | ||||
| @@ -589,7 +611,7 @@ public: | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | ||||
|  | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | ||||
|  | ||||
| 	//  __ | ||||
| @@ -609,7 +631,7 @@ public: | ||||
|  | ||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||
|  | ||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | ||||
| 	 | ||||
|       } | ||||
| @@ -931,6 +953,10 @@ private: | ||||
| public: | ||||
|  | ||||
|   /* Standard constructor */ | ||||
|   virtual ~SmearedConfigurationMasked() | ||||
|   { | ||||
|     delete UrbGrid; | ||||
|   } | ||||
|   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) | ||||
|     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) | ||||
|   { | ||||
| @@ -939,7 +965,6 @@ public: | ||||
|     // was resized in base class | ||||
|     assert(this->SmearedSet.size()==Nsmear); | ||||
|      | ||||
|     GridRedBlackCartesian * UrbGrid; | ||||
|     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); | ||||
|     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); | ||||
|     LatticeComplex tmp(_UGrid); | ||||
| @@ -947,10 +972,11 @@ public: | ||||
|     for (unsigned int i = 0; i < this->smearingLevels; ++i) { | ||||
|  | ||||
|       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); | ||||
|  | ||||
|       int mu= (i/2) %Nd; | ||||
|       int cb= (i%2); | ||||
|       LatticeComplex tmpcb(UrbGrid); | ||||
|  | ||||
|       cbs.push_back(cb); | ||||
| 	 | ||||
|       masks[i]=Zero(); | ||||
|       //////////////////// | ||||
| @@ -962,7 +988,6 @@ public: | ||||
|       PokeIndex<LorentzIndex>(masks[i],tmp, mu); | ||||
| 	 | ||||
|     } | ||||
|     delete UrbGrid; | ||||
|   } | ||||
|    | ||||
|   virtual void smeared_force(GaugeField &SigmaTilde)  | ||||
|   | ||||
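Editor's note, a worked example of the masking schedule set up in the constructor above: with mu = (i/2) % Nd and cb = i % 2, smearing level i = 0 masks direction 0 on checkerboard 0, i = 1 masks direction 0 on checkerboard 1, i = 2 masks direction 1 on checkerboard 0, and so on through all 2*Nd direction/parity combinations before the pattern repeats. The new cbs vector simply records that parity so the force code can pass it to the checkerboarded Compute_MpInvJx_dNxxdSy.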
| @@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, | ||||
|   int hNNm1= NNm1/2; | ||||
|   RealD sqrt_2 = sqrt(2.0); | ||||
|   Complex ci(0.0,1.0); | ||||
|   for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||
|     int i1, i2; | ||||
|     su2SubGroupIndex(i1, i2, su2Index); | ||||
|     int ax = su2Index*2; | ||||
|     int ay = su2Index*2+1; | ||||
|     accelerator_for(ss,grid->oSites(),1,{ | ||||
|  | ||||
|   const int nsimd=  Matrix::Nsimd(); | ||||
|   accelerator_for(ss,grid->oSites(),nsimd,{ | ||||
|       for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||
| 	int i1, i2; | ||||
| 	su2SubGroupIndex(i1, i2, su2Index); | ||||
| 	int ax = su2Index*2; | ||||
| 	int ay = su2Index*2+1; | ||||
| 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | ||||
| 	// trace( Ta x Ci in) | ||||
| 	// Bet I need to move to real part with mult by -i | ||||
| 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); | ||||
| 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); | ||||
|       }); | ||||
|   } | ||||
|   for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||
|     int k = diagIndex + 1; // diagIndex starts from 0 | ||||
|     int a = NNm1+diagIndex; | ||||
|     RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||
|     accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ | ||||
| 	auto tmp = in_v[ss]()()(0,0); | ||||
| 	coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2)))); | ||||
| 	coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1)))); | ||||
|       } | ||||
|       for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||
| 	int k = diagIndex + 1; // diagIndex starts from 0 | ||||
| 	int a = NNm1+diagIndex; | ||||
| 	RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||
| 	auto tmp = in_v(ss)()()(0,0); | ||||
| 	for(int i=1;i<k;i++){ | ||||
| 	  tmp=tmp+in_v[ss]()()(i,i); | ||||
| 	  tmp=tmp+in_v(ss)()()(i,i); | ||||
| 	} | ||||
| 	tmp = tmp - in_v[ss]()()(k,k)*k; | ||||
| 	out_v[ss]()()(a,b) =imag(tmp) * scale; | ||||
|       }); | ||||
|     } | ||||
| 	tmp = tmp - in_v(ss)()()(k,k)*k; | ||||
| 	coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale); | ||||
|       } | ||||
|     }); | ||||
| } | ||||
|  | ||||
|    | ||||
|   | ||||
| @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) { | ||||
| //////////////////////////////////////////////////////////////////////// | ||||
| // Map a su2 subgroup number to the pair of rows that are non zero | ||||
| //////////////////////////////////////////////////////////////////////// | ||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||
| static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||
|   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); | ||||
|  | ||||
|   int spare = su2_index; | ||||
|   | ||||
| @@ -99,6 +99,8 @@ using std::log; | ||||
| using std::exp; | ||||
| using std::sin; | ||||
| using std::cos; | ||||
| using std::asin; | ||||
| using std::acos; | ||||
|  | ||||
|  | ||||
| accelerator_inline RealF    conjugate(const RealF  & r){ return r; } | ||||
|   | ||||
| @@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) { | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {}; | ||||
| template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {}; | ||||
| template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {}; | ||||
| #endif | ||||
|   | ||||
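Editor's note: the specialisations above tell SYCL that Grid's tensor wrappers may be captured by value in kernels even when the compiler cannot deduce that on its own. A stand-alone sketch of the same mechanism with a toy type (illustrative only, not Grid code):

#include <sycl/sycl.hpp>
#include <type_traits>
#include <iostream>

struct cplx {
  float re, im;
  cplx(float r = 0, float i = 0) : re(r), im(i) {}
  cplx(const cplx &o) : re(o.re), im(o.im) {}   // user-provided copy ctor: not trivially copyable
};

// Declare that copying cplx to the device byte-wise yields a valid object.
template <> struct sycl::is_device_copyable<cplx> : std::true_type {};

int main() {
  sycl::queue q;
  cplx c(1.0f, 2.0f);

  float *out = sycl::malloc_shared<float>(1, q);
  q.single_task([=] { *out = c.re + c.im; }).wait();   // by-value capture requires the trait

  std::cout << *out << std::endl;                      // expect 3
  sycl::free(out, q);
  return 0;
}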
| @@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize | ||||
|  | ||||
| #endif | ||||
|  | ||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||
| { | ||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); | ||||
|   acceleratorCopySynchronise(); | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////// | ||||
| // CPU Target - No accelerator just thread instead | ||||
| ////////////////////////////////////////////// | ||||
| @@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||
|  | ||||
| #undef GRID_SIMT | ||||
|  | ||||
|  | ||||
| inline void acceleratorMem(void) | ||||
| { | ||||
|   /* | ||||
| @@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void) | ||||
|   return; | ||||
| } | ||||
|  | ||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||
| { | ||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); | ||||
|   acceleratorCopySynchronise(); | ||||
| } | ||||
|  | ||||
| template<class T> void acceleratorPut(T& dev,T&host) | ||||
| { | ||||
|   acceleratorCopyToDevice(&host,&dev,sizeof(T)); | ||||
|   | ||||
							
								
								
									
238 HMC/ComputeWilsonFlow.cc Normal file
							| @@ -0,0 +1,238 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: HMC/ComputeWilsonFlow.cc | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||
| Author: Shuhei Yamamoto <syamamoto@bnl.gov> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution | ||||
| directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <string> | ||||
|  | ||||
| namespace Grid{ | ||||
|   struct WFParameters: Serializable { | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters, | ||||
|             int, steps, | ||||
|             double, step_size, | ||||
|             int, meas_interval, | ||||
| 	    double, maxTau, // for the adaptive algorithm | ||||
| 	    int, meas_interval_density, | ||||
| 	    std::string, path);  | ||||
|         | ||||
|  | ||||
|     template <class ReaderClass > | ||||
|     WFParameters(Reader<ReaderClass>& Reader){ | ||||
|       read(Reader, "WilsonFlow", *this); | ||||
|     } | ||||
|  | ||||
|   }; | ||||
|  | ||||
|   struct ConfParameters: Serializable { | ||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters, | ||||
| 	   std::string, conf_path, | ||||
|            std::string, conf_prefix, | ||||
| 	   std::string, conf_smr_prefix, | ||||
|            std::string, rng_prefix, | ||||
| 	   int, StartConfiguration, | ||||
| 	   int, EndConfiguration, | ||||
|            int, Skip); | ||||
|    | ||||
|     template <class ReaderClass > | ||||
|     ConfParameters(Reader<ReaderClass>& Reader){ | ||||
|       read(Reader, "Configurations", *this); | ||||
|     } | ||||
|  | ||||
|   }; | ||||
| } | ||||
|  | ||||
| template <class T> void writeFile(T& in, std::string const fname){   | ||||
| #ifdef HAVE_LIME | ||||
|   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111 | ||||
|   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl; | ||||
|   Grid::emptyUserRecord record; | ||||
|   Grid::ScidacWriter WR(in.Grid()->IsBoss()); | ||||
|   WR.open(fname); | ||||
|   WR.writeScidacFieldRecord(in,record,0); | ||||
|   WR.close(); | ||||
| #endif | ||||
|   // What is the appropriate way to throw error? | ||||
| } | ||||
|  | ||||
|  | ||||
| int main(int argc, char **argv) { | ||||
|   using namespace Grid; | ||||
|    | ||||
|   Grid_init(&argc, &argv); | ||||
|   GridLogLayout(); | ||||
|  | ||||
|   auto latt_size   = GridDefaultLatt(); | ||||
|   auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); | ||||
|   auto mpi_layout  = GridDefaultMpi(); | ||||
|   GridCartesian               Grid(latt_size, simd_layout, mpi_layout); | ||||
|    | ||||
|   std::vector<int> seeds({1, 2, 3, 4, 5}); | ||||
|   GridSerialRNG sRNG; | ||||
|   GridParallelRNG pRNG(&Grid); | ||||
|   pRNG.SeedFixedIntegers(seeds); | ||||
|  | ||||
|   LatticeGaugeField Umu(&Grid), Uflow(&Grid); | ||||
|    | ||||
|   typedef Grid::XmlReader       Serialiser; | ||||
|   Serialiser Reader("input.xml", false, "root"); | ||||
|   WFParameters WFPar(Reader); | ||||
|   ConfParameters CPar(Reader); | ||||
|   CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix); | ||||
|   NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar); | ||||
|  | ||||
|   for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ | ||||
|  | ||||
|   CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG); | ||||
|  | ||||
|   std::cout << std::setprecision(15); | ||||
|   std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl; | ||||
|    | ||||
|   std::string file_pre  = WFPar.path; | ||||
|   std::string file_post = CPar.conf_prefix + "." + std::to_string(conf); | ||||
|  | ||||
|   WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval); | ||||
|   WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){ | ||||
|      | ||||
|     typedef typename PeriodicGimplR::GaugeLinkField GaugeMat; | ||||
|     typedef typename PeriodicGimplR::ComplexField ComplexField; | ||||
|      | ||||
|     assert(Nd == 4); | ||||
|  | ||||
|     // NOTE: | ||||
|     // Ideally, turn the following into methods of the appropriate class | ||||
|     /////////////   Compute Energy Density via Clover Leaf    ///////////////////////////////////////////////// | ||||
|     ///// Taken from qcd/smearing/WilsonFlow.h | ||||
|     //         For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3 | ||||
|     //E = 1/2 tr( F_munu F_munu ) | ||||
|     //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths: | ||||
|     //F_01 F_02 F_03   F_12 F_13  F_23 | ||||
|     GaugeMat F(U.Grid()); | ||||
|     //LatticeComplexD R(U.Grid()); | ||||
|     ComplexField R(U.Grid()); | ||||
|     R = Zero(); | ||||
|    | ||||
|     for(int mu=0;mu<3;mu++){ | ||||
|       for(int nu=mu+1;nu<4;nu++){ | ||||
| 	WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu); | ||||
| 	R = R + trace(F*F); | ||||
|       } | ||||
|     } | ||||
|     R = (-1.0) * R; | ||||
|      | ||||
|     //// Taken from qcd/utils/WilsonLoops.h | ||||
|      | ||||
|     // Bx = -iF(y,z), By = -iF(z,x), Bz = -iF(x,y) | ||||
|     GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid()); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir); | ||||
|  | ||||
|     // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z) | ||||
|     GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid()); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir); | ||||
|     WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir); | ||||
|  | ||||
|     double coeff = 8.0/(32.0*M_PI*M_PI); | ||||
|     ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez); | ||||
|     //ComplexField qfield Plq(U.Grid()); | ||||
|     //WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U); | ||||
|     //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0; | ||||
|     //Plq = coeff * Plq; | ||||
|  | ||||
|     int tau = std::round(t); | ||||
|     std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||
|     writeFile(R,efile); | ||||
|     std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||
|     writeFile(qfield,tfile); | ||||
|  | ||||
|     RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); | ||||
|     RealD T = real( sum(qfield) ); | ||||
|     Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; | ||||
|     RealD E0 = real(peekSite(R,scoor)); | ||||
|     RealD T0 = real(peekSite(qfield,scoor)); | ||||
|     std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  " | ||||
| 	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl; | ||||
|      | ||||
|   }); | ||||
|    | ||||
|   int t=WFPar.maxTau; | ||||
|   WF.smear(Uflow, Umu); | ||||
|  | ||||
|   RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow); | ||||
|   RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow); | ||||
|   RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // t | ||||
|   RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow); | ||||
|   std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl; | ||||
|   std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl; | ||||
|   std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl; | ||||
|   std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl; | ||||
|  | ||||
|   std::cout<< GridLogMessage << " Admissibility check:\n"; | ||||
|   const double sp_adm = 0.067;                // admissible threshold | ||||
|   const double pl_adm = 1.0-sp_adm/Nc; | ||||
|   std::cout << GridLogMessage << "   (pl_adm =" << pl_adm << ")\n"; | ||||
|  | ||||
|   // Need min and reduce min for this function | ||||
|   //double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm)); | ||||
|   double sp_ave = Nc*(1.0-WFlow_plaq); | ||||
|  | ||||
|   //std::cout<< GridLogMessage << "   sp_max = "        << sp_max <<"\n"; | ||||
|   std::cout<< GridLogMessage << "   sp_ave = "        << sp_ave <<"\n"; | ||||
|   std::cout<< GridLogMessage << "   (sp_admissible = "<< sp_adm <<")\n"; | ||||
|   //std::cout<< GridLogMessage << "   sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n"; | ||||
|   std::cout<< GridLogMessage << "   sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n"; | ||||
|   } | ||||
|   Grid_finalize(); | ||||
| }  // main | ||||
|  | ||||
|  | ||||
| /* | ||||
| Input file example | ||||
|  | ||||
|  | ||||
| JSON | ||||
|  | ||||
| { | ||||
|     "WilsonFlow":{ | ||||
| 	"steps": 200, | ||||
| 	"step_size": 0.01, | ||||
| 	"meas_interval": 50, | ||||
|   "maxTau": 2.0 | ||||
|     }, | ||||
|     "Configurations":{ | ||||
| 	"conf_prefix": "ckpoint_lat", | ||||
| 	"rng_prefix": "ckpoint_rng", | ||||
| 	"StartConfiguration": 3000, | ||||
| 	"EndConfiguration": 3000, | ||||
| 	"Skip": 5 | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| */ | ||||
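Editor's note: in continuum notation, the two densities written out by the measurement callback above are (up to Grid's field-strength normalisation, cf. the -i factors in the comments)

  E(x) = \frac{1}{2}\sum_{\mu,\nu} \mathrm{tr}\,[F_{\mu\nu}(x)F_{\mu\nu}(x)]
       = \sum_{\mu<\nu} \mathrm{tr}\,[F_{\mu\nu}(x)F_{\mu\nu}(x)]   (six clover-leaf terms, since F_{\nu\mu} = -F_{\mu\nu}),

  q(x) = \frac{1}{32\pi^2}\,\epsilon_{\mu\nu\rho\sigma}\,\mathrm{tr}\,[F_{\mu\nu}(x)F_{\rho\sigma}(x)]
       = \frac{8}{32\pi^2}\,\mathrm{tr}\,[F_{tx}F_{yz} + F_{ty}F_{zx} + F_{tz}F_{xy}],

which is why the code multiplies trace(Bx*Ex + By*Ey + Bz*Ez) by coeff = 8/(32*M_PI*M_PI); the reported T is then the lattice sum of q(x).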
| @@ -58,7 +58,7 @@ int main(int argc, char **argv) { | ||||
|   HMCparameters HMCparams; | ||||
|   HMCparams.StartTrajectory  = 0; | ||||
|   HMCparams.Trajectories     = 200; | ||||
|   HMCparams.NoMetropolisUntil=  20; | ||||
|   HMCparams.NoMetropolisUntil=  0; | ||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||
|   HMCparams.StartingType     =std::string("ColdStart"); | ||||
|   HMCparams.MD = MD; | ||||
| @@ -70,7 +70,7 @@ int main(int argc, char **argv) { | ||||
|   CheckpointerParameters CPparams; | ||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; | ||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; | ||||
|   CPparams.saveInterval  = 10; | ||||
|   CPparams.saveInterval  = 1; | ||||
|   CPparams.format        = "IEEE64BIG"; | ||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||
|  | ||||
| @@ -186,6 +186,8 @@ int main(int argc, char **argv) { | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////// | ||||
|   // HMC parameters are serialisable | ||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file                                                                      | ||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); | ||||
|  | ||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||
|   TheHMC.Run();  // no smearing | ||||
|   | ||||
| @@ -30,11 +30,13 @@ directory | ||||
| #include <string> | ||||
|  | ||||
| template <class T> void readFile(T& out, std::string const fname){ | ||||
| #ifdef HAVE_LIME | ||||
|   Grid::emptyUserRecord record; | ||||
|   Grid::ScidacReader RD; | ||||
|   RD.open(fname); | ||||
|   RD.readScidacFieldRecord(out,record); | ||||
|   RD.close(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -31,11 +31,13 @@ directory | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
| template <class T> void writeFile(T& out, std::string const fname){ | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(out.Grid()->IsBoss()); | ||||
|   WR.open(fname); | ||||
|   WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| NAMESPACE_END(Grid); | ||||
| int main(int argc, char **argv) { | ||||
|   | ||||
| @@ -261,23 +261,25 @@ public: | ||||
|     fprintf(FP,"\n\n"); | ||||
|   }; | ||||
|  | ||||
|  | ||||
|   template<class CComplex> | ||||
|   static void BLAS(void) | ||||
|   { | ||||
|     //int nbasis, int nrhs, int coarseVol | ||||
|     int  basis[] = { 16,32,64 }; | ||||
|     int  rhs[]   = { 8,16,32 }; | ||||
|     int  vol  = 4*4*4*4; | ||||
|     int  rhs[]   = { 8,12,16 }; | ||||
|     int  vol  = 8*8*8*8; | ||||
|     int  blk  = 4*4*4*4; | ||||
|  | ||||
|     GridBLAS blas; | ||||
|      | ||||
|  | ||||
|     int fpbits = sizeof(CComplex)*4; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl; | ||||
|     std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|    | ||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n"); | ||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits); | ||||
|  | ||||
|     for(int b=0;b<3;b++){ | ||||
|     for(int r=0;r<3;r++){ | ||||
| @@ -285,7 +287,7 @@ public: | ||||
|       int N=rhs[r]; | ||||
|       int K=basis[b]; | ||||
|       int BATCH=vol; | ||||
|       double p=blas.benchmark(M,N,K,BATCH); | ||||
|       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||
|  | ||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||
|        | ||||
| @@ -299,9 +301,9 @@ public: | ||||
|     for(int r=0;r<3;r++){ | ||||
|       int M=basis[b]; | ||||
|       int N=rhs[r]; | ||||
|       int K=vol; | ||||
|       int K=blk; | ||||
|       int BATCH=vol; | ||||
|       double p=blas.benchmark(M,N,K,BATCH); | ||||
|       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||
|  | ||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||
| @@ -313,10 +315,10 @@ public: | ||||
|     for(int b=0;b<3;b++){ | ||||
|     for(int r=0;r<3;r++){ | ||||
|       int M=rhs[r]; | ||||
|       int N=vol; | ||||
|       int N=blk; | ||||
|       int K=basis[b]; | ||||
|       int BATCH=vol; | ||||
|       double p=blas.benchmark(M,N,K,BATCH); | ||||
|       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||
|  | ||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||
| @@ -867,6 +869,7 @@ int main (int argc, char ** argv) | ||||
|   int do_memory=1; | ||||
|   int do_comms =1; | ||||
|   int do_blas  =1; | ||||
|   int do_dslash=1; | ||||
|  | ||||
|   int sel=4; | ||||
|   std::vector<int> L_list({8,12,16,24,32}); | ||||
| @@ -877,6 +880,7 @@ int main (int argc, char ** argv) | ||||
|   std::vector<double> staggered; | ||||
|  | ||||
|   int Ls=1; | ||||
|   if (do_dslash){ | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
| @@ -901,6 +905,7 @@ int main (int argc, char ** argv) | ||||
|     staggered.push_back(result); | ||||
|   } | ||||
|  | ||||
|  | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
| @@ -909,8 +914,33 @@ int main (int argc, char ** argv) | ||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; | ||||
|   } | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   } | ||||
|  | ||||
|   int NN=NN_global; | ||||
|   if(do_dslash){ | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||
|     fprintf(FP,"Per node summary table\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     for(int l=0;l<L_list.size();l++){ | ||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||
|     } | ||||
|     fprintf(FP,"\n"); | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||
|     std::cout<<std::setprecision(3); | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   } | ||||
|  | ||||
|    | ||||
|   if ( do_memory ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||
| @@ -918,15 +948,6 @@ int main (int argc, char ** argv) | ||||
|     Benchmark::Memory(); | ||||
|   } | ||||
|  | ||||
|   if ( do_blas ) { | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)    | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     Benchmark::BLAS(); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   if ( do_su4 ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; | ||||
| @@ -941,28 +962,14 @@ int main (int argc, char ** argv) | ||||
|     Benchmark::Comms(); | ||||
|   } | ||||
|  | ||||
|   if ( do_blas ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||
|     fprintf(FP,"Per node summary table\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||
|     fprintf(FP,"\n"); | ||||
|     for(int l=0;l<L_list.size();l++){ | ||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||
|     } | ||||
|     fprintf(FP,"\n"); | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||
|     std::cout<<std::setprecision(3); | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
|     Benchmark::BLAS<ComplexD>(); | ||||
|     Benchmark::BLAS<ComplexF>(); | ||||
|   } | ||||
|    | ||||
|   Grid_finalize(); | ||||
|   fclose(FP); | ||||
| } | ||||
|   | ||||
| @@ -1,16 +1,18 @@ | ||||
|  | ||||
| export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "  | ||||
| export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||
| ../../configure \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--enable-debug \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--with-lime=$CLIME \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx \ | ||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ | ||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" | ||||
| 	CXX=icpx  | ||||
|  | ||||
|   | ||||
							
								
								
									
23 systems/Aurora/config-command-leak Normal file
							| @@ -0,0 +1,23 @@ | ||||
| source ~/spack/share/spack/setup-env.sh  | ||||
| spack load c-lime | ||||
| export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||
| export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||
| export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| ../../configure \ | ||||
| 	--enable-debug \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--with-lime=$CLIME \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx \ | ||||
| 	LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl  -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \ | ||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host  -fsycl -fsanitize=leak " | ||||
|  | ||||
							
								
								
									
22 systems/Aurora/config-command-sanitize Normal file
							| @@ -0,0 +1,22 @@ | ||||
| # -fsycl-targets=spir64_gen -Xs\" -device pvc \" | ||||
| # -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc | ||||
| # -fsycl-targets=intel_gpu_pvc | ||||
|  | ||||
| unset DEVICE | ||||
| export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address"  | ||||
| export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||
| ../../configure \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--enable-debug \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--with-lime=$CLIME \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-accelerator-aware-mpi=yes\ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=icpx  | ||||
|  | ||||
| @@ -1,14 +1,22 @@ | ||||
| source ~/spack/share/spack/setup-env.sh  | ||||
| spack load c-lime | ||||
| export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||
| #spack load libefence | ||||
| #export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` | ||||
| #export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH | ||||
| #spack load gperftools | ||||
| export TCMALLOC=/home/paboyle/gperftools/install | ||||
| export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||
| export INTELGT_AUTO_ATTACH_DISABLE=1 | ||||
|  | ||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | ||||
|  | ||||
| module load oneapi/release/2023.12.15.001 | ||||
|  | ||||
| #module load oneapi/release/2023.12.15.001 | ||||
| #module use /soft/modulefiles | ||||
| #module load intel_compute_runtime/release/agama-devel-682.22 | ||||
|  | ||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||
| export FI_CXI_CQ_FILL_PERCENT=20 | ||||
|  | ||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
| #export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||
| #export FI_CXI_CQ_FILL_PERCENT=20 | ||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | ||||
|  | ||||
| # | ||||
| @@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
| # -ftarget-register-alloc-mode=pvc:small | ||||
| # -ftarget-register-alloc-mode=pvc:large | ||||
| # -ftarget-register-alloc-mode=pvc:auto | ||||
| # | ||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||
|  | ||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | ||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | ||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 | ||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 | ||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 | ||||
|  | ||||
| #source ~/spack/share/spack/setup-env.sh | ||||
| #spack load gperftools | ||||
| #export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||
| #export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||
|   | ||||
							
								
								
									
76 systems/Frontier/benchmarks/Benchmark_usqcd.csv Normal file
							| @@ -0,0 +1,76 @@ | ||||
| Memory Bandwidth | ||||
|  | ||||
| Bytes, GB/s per node | ||||
| 6291456, 379.297050 | ||||
| 100663296, 3754.674992 | ||||
| 509607936, 6521.472413 | ||||
| 1610612736, 8513.456479 | ||||
| 3932160000, 9018.901766 | ||||
|  | ||||
|  | ||||
| GEMM | ||||
|  | ||||
|  M, N, K, BATCH, GF/s per rank | ||||
| 16, 8, 16, 256, 0.564958 | ||||
| 16, 16, 16, 256, 243.148058 | ||||
| 16, 32, 16, 256, 440.346877 | ||||
| 32, 8, 32, 256, 439.194136 | ||||
| 32, 16, 32, 256, 847.334141 | ||||
| 32, 32, 32, 256, 1430.892623 | ||||
| 64, 8, 64, 256, 1242.756741 | ||||
| 64, 16, 64, 256, 2196.689493 | ||||
| 64, 32, 64, 256, 3697.458072 | ||||
| 16, 8, 256, 256, 899.582627 | ||||
| 16, 16, 256, 256, 1673.537756 | ||||
| 16, 32, 256, 256, 2959.597089 | ||||
| 32, 8, 256, 256, 1558.858630 | ||||
| 32, 16, 256, 256, 2864.839445 | ||||
| 32, 32, 256, 256, 4810.671254 | ||||
| 64, 8, 256, 256, 2386.092942 | ||||
| 64, 16, 256, 256, 4451.665937 | ||||
| 64, 32, 256, 256, 5942.124095 | ||||
| 8, 256, 16, 256, 799.867271 | ||||
| 16, 256, 16, 256, 1584.624888 | ||||
| 32, 256, 16, 256, 1949.422338 | ||||
| 8, 256, 32, 256, 1389.417474 | ||||
| 16, 256, 32, 256, 2668.344493 | ||||
| 32, 256, 32, 256, 3234.162120 | ||||
| 8, 256, 64, 256, 2150.925128 | ||||
| 16, 256, 64, 256, 4012.488132 | ||||
| 32, 256, 64, 256, 5154.785521 | ||||
|  | ||||
|  | ||||
|  | ||||
| Communications | ||||
|  | ||||
| Packet bytes, direction, GB/s per node | ||||
| 4718592, 1, 245.026198 | ||||
| 4718592, 2, 251.180996 | ||||
| 4718592, 3, 361.110977 | ||||
| 4718592, 5, 247.898447 | ||||
| 4718592, 6, 249.867523 | ||||
| 4718592, 7, 359.033061 | ||||
| 15925248, 1, 255.030946 | ||||
| 15925248, 2, 264.453890 | ||||
| 15925248, 3, 392.949183 | ||||
| 15925248, 5, 256.040644 | ||||
| 15925248, 6, 264.681896 | ||||
| 15925248, 7, 392.102622 | ||||
| 37748736, 1, 258.823333 | ||||
| 37748736, 2, 268.181577 | ||||
| 37748736, 3, 401.478191 | ||||
| 37748736, 5, 258.995363 | ||||
| 37748736, 6, 268.206586 | ||||
| 37748736, 7, 400.397611 | ||||
|  | ||||
|  | ||||
| Per node summary table | ||||
|  | ||||
| L , Wilson, DWF4, Staggered, GF/s per node | ||||
|  | ||||
| 8 , 155, 1386, 50 | ||||
| 12 , 694, 4208, 230 | ||||
| 16 , 1841, 6675, 609 | ||||
| 24 , 3934, 8573, 1641 | ||||
| 32 , 5083, 9771, 3086 | ||||
|  | ||||
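A couple of consistency checks on the CSV above, offered as an aside rather than as part of the benchmark output itself: the Bytes column of the memory-bandwidth section is exactly 1536*L^4 for the local volumes L = 8, 16, 24, 32, 40 used in the run, and in the full log below the corresponding Gflop/s column is the GB/s figure divided by 12 (a double-precision a*x + y sweep moves 24 bytes and performs 2 flops per element, i.e. 12 bytes per flop). The snippet below is a minimal, hypothetical Python verification of the first relation; the hard-coded numbers are copied from the table and nothing here comes from the Grid sources.

    # Minimal sanity check (illustrative only, not part of Grid): the Bytes
    # column of the "Memory Bandwidth" section equals 1536 * L^4 for the
    # local volumes L = 8, 16, 24, 32, 40 reported in Benchmark_usqcd.log.
    volumes  = [8, 16, 24, 32, 40]
    reported = [6291456, 100663296, 509607936, 1610612736, 3932160000]

    for L, nbytes in zip(volumes, reported):
        assert nbytes == 1536 * L**4
        print(f"L={L:<2d}  bytes={nbytes:>11d}  (= 1536 * {L}^4)")
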
systems/Frontier/benchmarks/Benchmark_usqcd.log    (new file, 702 lines)
							| @@ -0,0 +1,702 @@ | ||||
| RANK 1 using GPU 1 | ||||
| RANK 5 using GPU 6 | ||||
| RANK 0 using GPU 0 | ||||
| RANK 2 using GPU 2 | ||||
| RANK 3 using GPU 3 | ||||
| RANK 6 using GPU 5 | ||||
| RANK 7 using GPU 4 | ||||
| RANK 4 using GPU 7 | ||||
| world_rank 0 has 1 devices | ||||
| AcceleratorHipInit: ======================== | ||||
| AcceleratorHipInit: Device Number    : 0 | ||||
| AcceleratorHipInit: ======================== | ||||
| AcceleratorHipInit: Device identifier: AMD Instinct MI250X | ||||
| AcceleratorHipInit:   totalGlobalMem: 68702699520  | ||||
| AcceleratorHipInit:   isMultiGpuBoard: 0  | ||||
| AcceleratorHipInit:   warpSize: 64  | ||||
| AcceleratorHipInit: using default device  | ||||
| AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding  | ||||
| AcceleratorHipInit: Configure options --enable-setdevice=no  | ||||
| local rank 0 device 0 bus id: 0000:c1:00.0 | ||||
| AcceleratorHipInit: ================================================ | ||||
| SharedMemoryMpi:  World communicator of size 8 | ||||
| SharedMemoryMpi:  Node  communicator of size 8 | ||||
| 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers  | ||||
| Setting up IPC | ||||
|  | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||
| __|_                                    _|__ | ||||
| __|_   GGGG    RRRR    III    DDDD      _|__ | ||||
| __|_  G        R   R    I     D   D     _|__ | ||||
| __|_  G        R   R    I     D    D    _|__ | ||||
| __|_  G  GG    RRRR     I     D    D    _|__ | ||||
| __|_  G   G    R  R     I     D   D     _|__ | ||||
| __|_   GGGG    R   R   III    DDDD      _|__ | ||||
| __|_                                    _|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
| __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||
|   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||
|  | ||||
|  | ||||
| Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
| Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean | ||||
|  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : MPI is initialised and logging filters activated  | ||||
| Grid : Message : ================================================  | ||||
| Grid : Message : This rank is running on host frontier01320 | ||||
| Grid : Message : Requested 4294967296 byte stencil comms buffers  | ||||
| Grid : Message : MemoryManager Cache 54962159616 bytes  | ||||
| Grid : Message : MemoryManager::Init() setting up | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0 | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0 | ||||
| Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0 | ||||
| Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||
| Grid : Message : MemoryManager::Init() Using hipMalloc | ||||
| Grid : Message : 0.293720 s : ================================================================================== | ||||
| Grid : Message : 0.293790 s : = Grid is setup to use 1 threads | ||||
| Grid : Message : 0.293800 s : ================================================================================== | ||||
| Grid : Message : 0.293810 s : Grid Default Decomposition patterns | ||||
| Grid : Message : 0.293810 s : 	OpenMP threads : 1 | ||||
| Grid : Message : 0.293820 s : 	MPI tasks      : 1 2 2 2  | ||||
| Grid : Message : 0.293870 s : 	vReal          : 512bits ; 1 2 2 2  | ||||
| Grid : Message : 0.293890 s : 	vRealF         : 512bits ; 2 2 2 2  | ||||
| Grid : Message : 0.293910 s : 	vRealD         : 512bits ; 1 2 2 2  | ||||
| Grid : Message : 0.293920 s : 	vComplex       : 512bits ; 1 1 2 2  | ||||
| Grid : Message : 0.293930 s : 	vComplexF      : 512bits ; 1 2 2 2  | ||||
| Grid : Message : 0.293960 s : 	vComplexD      : 512bits ; 1 1 2 2  | ||||
| Grid : Message : 0.293970 s : ================================================================================== | ||||
| Grid : Message : 0.293980 s : ================================================================================== | ||||
| Grid : Message : 0.293990 s :  Clover dslash 4D vectorised (temporarily Wilson) | ||||
| Grid : Message : 0.294000 s : ================================================================================== | ||||
| Grid : Message : 0.301330 s : ================================================================================== | ||||
| Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume  | ||||
| Grid : Message : 0.301370 s : * Nc             : 3 | ||||
| Grid : Message : 0.301380 s : * Global volume  : 8 16 16 16  | ||||
| Grid : Message : 0.301410 s : * Ls             : 1 | ||||
| Grid : Message : 0.301420 s : * ranks          : 8 | ||||
| Grid : Message : 0.301430 s : * nodes          : 1 | ||||
| Grid : Message : 0.301440 s : * ranks/node     : 8 | ||||
| Grid : Message : 0.301450 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 0.301460 s : * Using 1 threads | ||||
| Grid : Message : 0.301470 s : ================================================================================== | ||||
| Grid : Message : 0.345030 s : Initialised RNGs | ||||
| Grid : Message : 0.158302 s : ================================================================================== | ||||
| Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.158312 s : * SINGLE precision  | ||||
| Grid : Message : 0.158313 s : ================================================================================== | ||||
| Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344 | ||||
| Grid : Message : 0.240711 s : Deo mflop/s =   154914.0 (130.8) 139367.7-159565.9 | ||||
| Grid : Message : 0.240715 s : Deo mflop/s per rank   19364.3 | ||||
| Grid : Message : 0.240716 s : Deo mflop/s per node   154914.0 | ||||
| Grid : Message : 0.240718 s : ================================================================================== | ||||
| Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.240719 s : * SINGLE precision  | ||||
| Grid : Message : 0.240719 s : ================================================================================== | ||||
| Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 0.315033 s : Deo mflop/s =   151459.5 (142.0) 131856.9-157286.4 | ||||
| Grid : Message : 0.315036 s : Deo mflop/s per rank   18932.4 | ||||
| Grid : Message : 0.315037 s : Deo mflop/s per node   151459.5 | ||||
| Grid : Message : 0.315038 s : ================================================================================== | ||||
| Grid : Message : 0.315040 s : 8^4 x 1 Deo Best  mflop/s        =   154914.0 ; 154914.0 per node  | ||||
| Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s        =   151459.5 ; 151459.5 per node  | ||||
| Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ;  | ||||
| Grid : Message : 0.315044 s : ================================================================================== | ||||
| Grid : Message : 0.316507 s : ================================================================================== | ||||
| Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume  | ||||
| Grid : Message : 0.316511 s : * Nc             : 3 | ||||
| Grid : Message : 0.316512 s : * Global volume  : 12 24 24 24  | ||||
| Grid : Message : 0.316515 s : * Ls             : 1 | ||||
| Grid : Message : 0.316516 s : * ranks          : 8 | ||||
| Grid : Message : 0.316517 s : * nodes          : 1 | ||||
| Grid : Message : 0.316518 s : * ranks/node     : 8 | ||||
| Grid : Message : 0.316518 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 0.316519 s : * Using 1 threads | ||||
| Grid : Message : 0.316520 s : ================================================================================== | ||||
| Grid : Message : 0.327883 s : Initialised RNGs | ||||
| Grid : Message : 0.786395 s : ================================================================================== | ||||
| Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.786406 s : * SINGLE precision  | ||||
| Grid : Message : 0.786406 s : ================================================================================== | ||||
| Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 0.871659 s : Deo mflop/s =   684982.2 (632.4) 609162.5-714594.5 | ||||
| Grid : Message : 0.871663 s : Deo mflop/s per rank   85622.8 | ||||
| Grid : Message : 0.871664 s : Deo mflop/s per node   684982.2 | ||||
| Grid : Message : 0.871665 s : ================================================================================== | ||||
| Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 0.871665 s : * SINGLE precision  | ||||
| Grid : Message : 0.871665 s : ================================================================================== | ||||
| Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 0.953702 s : Deo mflop/s =   693556.6 (576.5) 663552.0-719204.7 | ||||
| Grid : Message : 0.953705 s : Deo mflop/s per rank   86694.6 | ||||
| Grid : Message : 0.953706 s : Deo mflop/s per node   693556.6 | ||||
| Grid : Message : 0.953707 s : ================================================================================== | ||||
| Grid : Message : 0.953708 s : 12^4 x 1 Deo Best  mflop/s        =   693556.6 ; 693556.6 per node  | ||||
| Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s        =   684982.2 ; 684982.2 per node  | ||||
| Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ;  | ||||
| Grid : Message : 0.953713 s : ================================================================================== | ||||
| Grid : Message : 0.957609 s : ================================================================================== | ||||
| Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume  | ||||
| Grid : Message : 0.957614 s : * Nc             : 3 | ||||
| Grid : Message : 0.957615 s : * Global volume  : 16 32 32 32  | ||||
| Grid : Message : 0.957620 s : * Ls             : 1 | ||||
| Grid : Message : 0.957621 s : * ranks          : 8 | ||||
| Grid : Message : 0.957622 s : * nodes          : 1 | ||||
| Grid : Message : 0.957623 s : * ranks/node     : 8 | ||||
| Grid : Message : 0.957623 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 0.957624 s : * Using 1 threads | ||||
| Grid : Message : 0.957625 s : ================================================================================== | ||||
| Grid : Message : 0.985828 s : Initialised RNGs | ||||
| Grid : Message : 2.379761 s : ================================================================================== | ||||
| Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 2.379774 s : * SINGLE precision  | ||||
| Grid : Message : 2.379775 s : ================================================================================== | ||||
| Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 2.486725 s : Deo mflop/s =   1803226.1 (1139.4) 1646362.3-1864135.1 | ||||
| Grid : Message : 2.486729 s : Deo mflop/s per rank   225403.3 | ||||
| Grid : Message : 2.486731 s : Deo mflop/s per node   1803226.1 | ||||
| Grid : Message : 2.486732 s : ================================================================================== | ||||
| Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 2.486732 s : * SINGLE precision  | ||||
| Grid : Message : 2.486732 s : ================================================================================== | ||||
| Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 2.584412 s : Deo mflop/s =   1840587.3 (1119.6) 1779401.7-1914791.0 | ||||
| Grid : Message : 2.584415 s : Deo mflop/s per rank   230073.4 | ||||
| Grid : Message : 2.584416 s : Deo mflop/s per node   1840587.3 | ||||
| Grid : Message : 2.584417 s : ================================================================================== | ||||
| Grid : Message : 2.584418 s : 16^4 x 1 Deo Best  mflop/s        =   1840587.3 ; 1840587.3 per node  | ||||
| Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s        =   1803226.1 ; 1803226.1 per node  | ||||
| Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ;  | ||||
| Grid : Message : 2.584423 s : ================================================================================== | ||||
| Grid : Message : 2.592858 s : ================================================================================== | ||||
| Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume  | ||||
| Grid : Message : 2.592863 s : * Nc             : 3 | ||||
| Grid : Message : 2.592864 s : * Global volume  : 24 48 48 48  | ||||
| Grid : Message : 2.592869 s : * Ls             : 1 | ||||
| Grid : Message : 2.592870 s : * ranks          : 8 | ||||
| Grid : Message : 2.592871 s : * nodes          : 1 | ||||
| Grid : Message : 2.592872 s : * ranks/node     : 8 | ||||
| Grid : Message : 2.592872 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 2.592873 s : * Using 1 threads | ||||
| Grid : Message : 2.592874 s : ================================================================================== | ||||
| Grid : Message : 2.715623 s : Initialised RNGs | ||||
| Grid : Message : 9.608838 s : ================================================================================== | ||||
| Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 9.608854 s : * SINGLE precision  | ||||
| Grid : Message : 9.608855 s : ================================================================================== | ||||
| Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 9.870309 s : Deo mflop/s =   3861903.3 (1708.9) 3511078.3-3937368.2 | ||||
| Grid : Message : 9.870313 s : Deo mflop/s per rank   482737.9 | ||||
| Grid : Message : 9.870314 s : Deo mflop/s per node   3861903.3 | ||||
| Grid : Message : 9.870315 s : ================================================================================== | ||||
| Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 9.870317 s : * SINGLE precision  | ||||
| Grid : Message : 9.870317 s : ================================================================================== | ||||
| Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 10.101624 s : Deo mflop/s =   3933599.5 (1412.7) 3835758.7-4008152.3 | ||||
| Grid : Message : 10.101627 s : Deo mflop/s per rank   491699.9 | ||||
| Grid : Message : 10.101628 s : Deo mflop/s per node   3933599.5 | ||||
| Grid : Message : 10.101629 s : ================================================================================== | ||||
| Grid : Message : 10.101629 s : 24^4 x 1 Deo Best  mflop/s        =   3933599.5 ; 3933599.5 per node  | ||||
| Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s        =   3861903.3 ; 3861903.3 per node  | ||||
| Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ;  | ||||
| Grid : Message : 10.101634 s : ================================================================================== | ||||
| Grid : Message : 10.139642 s : ================================================================================== | ||||
| Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume  | ||||
| Grid : Message : 10.139653 s : * Nc             : 3 | ||||
| Grid : Message : 10.139654 s : * Global volume  : 32 64 64 64  | ||||
| Grid : Message : 10.139661 s : * Ls             : 1 | ||||
| Grid : Message : 10.139661 s : * ranks          : 8 | ||||
| Grid : Message : 10.139662 s : * nodes          : 1 | ||||
| Grid : Message : 10.139662 s : * ranks/node     : 8 | ||||
| Grid : Message : 10.139662 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 10.139663 s : * Using 1 threads | ||||
| Grid : Message : 10.139663 s : ================================================================================== | ||||
| Grid : Message : 10.502161 s : Initialised RNGs | ||||
| Grid : Message : 32.211092 s : ================================================================================== | ||||
| Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 32.211109 s : * SINGLE precision  | ||||
| Grid : Message : 32.211110 s : ================================================================================== | ||||
| Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 32.841732 s : Deo mflop/s =   4988499.9 (2722.5) 4244837.8-5120022.3 | ||||
| Grid : Message : 32.841736 s : Deo mflop/s per rank   623562.5 | ||||
| Grid : Message : 32.841737 s : Deo mflop/s per node   4988499.9 | ||||
| Grid : Message : 32.841738 s : ================================================================================== | ||||
| Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 32.841740 s : * SINGLE precision  | ||||
| Grid : Message : 32.841740 s : ================================================================================== | ||||
| Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 33.407442 s : Deo mflop/s =   5082758.0 (1883.1) 4971027.0-5205119.6 | ||||
| Grid : Message : 33.407446 s : Deo mflop/s per rank   635344.7 | ||||
| Grid : Message : 33.407447 s : Deo mflop/s per node   5082758.0 | ||||
| Grid : Message : 33.407448 s : ================================================================================== | ||||
| Grid : Message : 33.407448 s : 32^4 x 1 Deo Best  mflop/s        =   5082758.0 ; 5082758.0 per node  | ||||
| Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s        =   4988499.9 ; 4988499.9 per node  | ||||
| Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ;  | ||||
| Grid : Message : 33.407453 s : ================================================================================== | ||||
| Grid : Message : 33.506785 s : ================================================================================== | ||||
| Grid : Message : 33.506798 s :  Domain wall dslash 4D vectorised | ||||
| Grid : Message : 33.506799 s : ================================================================================== | ||||
| Grid : Message : 33.530686 s : ================================================================================== | ||||
| Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume  | ||||
| Grid : Message : 33.530690 s : * Nc             : 3 | ||||
| Grid : Message : 33.530691 s : * Global volume  : 8 16 16 16  | ||||
| Grid : Message : 33.530698 s : * Ls             : 12 | ||||
| Grid : Message : 33.530699 s : * ranks          : 8 | ||||
| Grid : Message : 33.530700 s : * nodes          : 1 | ||||
| Grid : Message : 33.530701 s : * ranks/node     : 8 | ||||
| Grid : Message : 33.530702 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 33.530703 s : * Using 1 threads | ||||
| Grid : Message : 33.530704 s : ================================================================================== | ||||
| Grid : Message : 33.545465 s : Initialised RNGs | ||||
| Grid : Message : 33.752384 s : ================================================================================== | ||||
| Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 33.752399 s : * SINGLE precision  | ||||
| Grid : Message : 33.752400 s : ================================================================================== | ||||
| Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 33.851977 s : Deo mflop/s =   1383287.7 (849.8) 1321205.8-1420651.4 | ||||
| Grid : Message : 33.851981 s : Deo mflop/s per rank   172911.0 | ||||
| Grid : Message : 33.851983 s : Deo mflop/s per node   1383287.7 | ||||
| Grid : Message : 33.851984 s : ================================================================================== | ||||
| Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 33.851984 s : * SINGLE precision  | ||||
| Grid : Message : 33.851984 s : ================================================================================== | ||||
| Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 33.949240 s : Deo mflop/s =   1386335.8 (734.6) 1341325.6-1428330.6 | ||||
| Grid : Message : 33.949243 s : Deo mflop/s per rank   173292.0 | ||||
| Grid : Message : 33.949244 s : Deo mflop/s per node   1386335.8 | ||||
| Grid : Message : 33.949245 s : ================================================================================== | ||||
| Grid : Message : 33.949245 s : 8^4 x 12 Deo Best  mflop/s        =   1386335.8 ; 1386335.8 per node  | ||||
| Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s        =   1383287.7 ; 1383287.7 per node  | ||||
| Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ;  | ||||
| Grid : Message : 33.949250 s : ================================================================================== | ||||
| Grid : Message : 33.952789 s : ================================================================================== | ||||
| Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume  | ||||
| Grid : Message : 33.952794 s : * Nc             : 3 | ||||
| Grid : Message : 33.952795 s : * Global volume  : 12 24 24 24  | ||||
| Grid : Message : 33.952800 s : * Ls             : 12 | ||||
| Grid : Message : 33.952801 s : * ranks          : 8 | ||||
| Grid : Message : 33.952802 s : * nodes          : 1 | ||||
| Grid : Message : 33.952803 s : * ranks/node     : 8 | ||||
| Grid : Message : 33.952803 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 33.952804 s : * Using 1 threads | ||||
| Grid : Message : 33.952805 s : ================================================================================== | ||||
| Grid : Message : 34.362200 s : Initialised RNGs | ||||
| Grid : Message : 34.969821 s : ================================================================================== | ||||
| Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 34.969834 s : * SINGLE precision  | ||||
| Grid : Message : 34.969835 s : ================================================================================== | ||||
| Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 35.135558 s : Deo mflop/s =   4208495.6 (2165.0) 4053699.5-4315228.5 | ||||
| Grid : Message : 35.135562 s : Deo mflop/s per rank   526062.0 | ||||
| Grid : Message : 35.135563 s : Deo mflop/s per node   4208495.6 | ||||
| Grid : Message : 35.135564 s : ================================================================================== | ||||
| Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 35.135565 s : * SINGLE precision  | ||||
| Grid : Message : 35.135565 s : ================================================================================== | ||||
| Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 35.299715 s : Deo mflop/s =   4156968.7 (1450.2) 4053699.5-4219939.5 | ||||
| Grid : Message : 35.299718 s : Deo mflop/s per rank   519621.1 | ||||
| Grid : Message : 35.299719 s : Deo mflop/s per node   4156968.7 | ||||
| Grid : Message : 35.299721 s : ================================================================================== | ||||
| Grid : Message : 35.299721 s : 12^4 x 12 Deo Best  mflop/s        =   4208495.6 ; 4208495.6 per node  | ||||
| Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s        =   4156968.7 ; 4156968.7 per node  | ||||
| Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ;  | ||||
| Grid : Message : 35.299726 s : ================================================================================== | ||||
| Grid : Message : 35.309687 s : ================================================================================== | ||||
| Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume  | ||||
| Grid : Message : 35.309694 s : * Nc             : 3 | ||||
| Grid : Message : 35.309695 s : * Global volume  : 16 32 32 32  | ||||
| Grid : Message : 35.309701 s : * Ls             : 12 | ||||
| Grid : Message : 35.309702 s : * ranks          : 8 | ||||
| Grid : Message : 35.309703 s : * nodes          : 1 | ||||
| Grid : Message : 35.309704 s : * ranks/node     : 8 | ||||
| Grid : Message : 35.309704 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 35.309705 s : * Using 1 threads | ||||
| Grid : Message : 35.309706 s : ================================================================================== | ||||
| Grid : Message : 35.448780 s : Initialised RNGs | ||||
| Grid : Message : 38.468764 s : ================================================================================== | ||||
| Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 38.468779 s : * SINGLE precision  | ||||
| Grid : Message : 38.468780 s : ================================================================================== | ||||
| Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 38.801040 s : Deo mflop/s =   6674673.6 (2168.6) 6484445.4-6797200.1 | ||||
| Grid : Message : 38.801044 s : Deo mflop/s per rank   834334.2 | ||||
| Grid : Message : 38.801045 s : Deo mflop/s per node   6674673.6 | ||||
| Grid : Message : 38.801046 s : ================================================================================== | ||||
| Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 38.801049 s : * SINGLE precision  | ||||
| Grid : Message : 38.801049 s : ================================================================================== | ||||
| Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 39.129783 s : Deo mflop/s =   6560128.4 (2117.4) 6405846.1-6679081.3 | ||||
| Grid : Message : 39.129786 s : Deo mflop/s per rank   820016.1 | ||||
| Grid : Message : 39.129787 s : Deo mflop/s per node   6560128.4 | ||||
| Grid : Message : 39.129788 s : ================================================================================== | ||||
| Grid : Message : 39.129788 s : 16^4 x 12 Deo Best  mflop/s        =   6674673.6 ; 6674673.6 per node  | ||||
| Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s        =   6560128.4 ; 6560128.4 per node  | ||||
| Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ;  | ||||
| Grid : Message : 39.129795 s : ================================================================================== | ||||
| Grid : Message : 39.161251 s : ================================================================================== | ||||
| Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume  | ||||
| Grid : Message : 39.161266 s : * Nc             : 3 | ||||
| Grid : Message : 39.161267 s : * Global volume  : 24 48 48 48  | ||||
| Grid : Message : 39.161274 s : * Ls             : 12 | ||||
| Grid : Message : 39.161275 s : * ranks          : 8 | ||||
| Grid : Message : 39.161276 s : * nodes          : 1 | ||||
| Grid : Message : 39.161277 s : * ranks/node     : 8 | ||||
| Grid : Message : 39.161277 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 39.161278 s : * Using 1 threads | ||||
| Grid : Message : 39.161279 s : ================================================================================== | ||||
| Grid : Message : 39.911996 s : Initialised RNGs | ||||
| Grid : Message : 54.971914 s : ================================================================================== | ||||
| Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 54.971930 s : * SINGLE precision  | ||||
| Grid : Message : 54.971931 s : ================================================================================== | ||||
| Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 56.309462 s : Deo mflop/s =   8572660.7 (1374.9) 8483366.4-8644399.6 | ||||
| Grid : Message : 56.309467 s : Deo mflop/s per rank   1071582.6 | ||||
| Grid : Message : 56.309468 s : Deo mflop/s per node   8572660.7 | ||||
| Grid : Message : 56.309469 s : ================================================================================== | ||||
| Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 56.309473 s : * SINGLE precision  | ||||
| Grid : Message : 56.309474 s : ================================================================================== | ||||
| Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 57.640714 s : Deo mflop/s =   8200141.3 (1445.8) 8113545.6-8286307.9 | ||||
| Grid : Message : 57.640717 s : Deo mflop/s per rank   1025017.7 | ||||
| Grid : Message : 57.640718 s : Deo mflop/s per node   8200141.3 | ||||
| Grid : Message : 57.640719 s : ================================================================================== | ||||
| Grid : Message : 57.640720 s : 24^4 x 12 Deo Best  mflop/s        =   8572660.7 ; 8572660.7 per node  | ||||
| Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s        =   8200141.3 ; 8200141.3 per node  | ||||
| Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ;  | ||||
| Grid : Message : 57.640727 s : ================================================================================== | ||||
| Grid : Message : 57.806175 s : ================================================================================== | ||||
| Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume  | ||||
| Grid : Message : 57.806191 s : * Nc             : 3 | ||||
| Grid : Message : 57.806192 s : * Global volume  : 32 64 64 64  | ||||
| Grid : Message : 57.806200 s : * Ls             : 12 | ||||
| Grid : Message : 57.806200 s : * ranks          : 8 | ||||
| Grid : Message : 57.806200 s : * nodes          : 1 | ||||
| Grid : Message : 57.806200 s : * ranks/node     : 8 | ||||
| Grid : Message : 57.806200 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 57.806201 s : * Using 1 threads | ||||
| Grid : Message : 57.806201 s : ================================================================================== | ||||
| Grid : Message : 60.313153 s : Initialised RNGs | ||||
| Grid : Message : 107.830286 s : ================================================================================== | ||||
| Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels | ||||
| Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 107.830308 s : * SINGLE precision  | ||||
| Grid : Message : 107.830309 s : ================================================================================== | ||||
| Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 111.479625 s : Deo mflop/s =   9771387.8 (1000.8) 9688589.9-9830800.0 | ||||
| Grid : Message : 111.479629 s : Deo mflop/s per rank   1221423.5 | ||||
| Grid : Message : 111.479630 s : Deo mflop/s per node   9771387.8 | ||||
| Grid : Message : 111.479631 s : ================================================================================== | ||||
| Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels | ||||
| Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute | ||||
| Grid : Message : 111.479631 s : * SINGLE precision  | ||||
| Grid : Message : 111.479631 s : ================================================================================== | ||||
| Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0 | ||||
| Grid : Message : 115.406573 s : Deo mflop/s =   8785297.3 (1739.6) 8628282.5-8911307.5 | ||||
| Grid : Message : 115.406576 s : Deo mflop/s per rank   1098162.2 | ||||
| Grid : Message : 115.406577 s : Deo mflop/s per node   8785297.3 | ||||
| Grid : Message : 115.406578 s : ================================================================================== | ||||
| Grid : Message : 115.406578 s : 32^4 x 12 Deo Best  mflop/s        =   9771387.8 ; 9771387.8 per node  | ||||
| Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s        =   8785297.3 ; 8785297.3 per node  | ||||
| Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ;  | ||||
| Grid : Message : 115.406582 s : ================================================================================== | ||||
| Grid : Message : 115.918888 s : ================================================================================== | ||||
| Grid : Message : 115.918902 s :  Improved Staggered dslash 4D vectorised | ||||
| Grid : Message : 115.918903 s : ================================================================================== | ||||
| Grid : Message : 115.920344 s : ================================================================================== | ||||
| Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume  | ||||
| Grid : Message : 115.920347 s : * Global volume  : 8 16 16 16  | ||||
| Grid : Message : 115.920354 s : * ranks          : 8 | ||||
| Grid : Message : 115.920355 s : * nodes          : 1 | ||||
| Grid : Message : 115.920356 s : * ranks/node     : 8 | ||||
| Grid : Message : 115.920357 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 115.920376 s : * Using 1 threads | ||||
| Grid : Message : 115.920377 s : ================================================================================== | ||||
| Grid : Message : 115.923522 s : Initialised RNGs | ||||
| Grid : Message : 116.904870 s : ================================================================================== | ||||
| Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 116.904960 s : * SINGLE precision  | ||||
| Grid : Message : 116.904970 s : ================================================================================== | ||||
| Grid : Message : 116.288979 s : Deo mflop/s =   49708.9 (22.9) 44075.3-50609.3 | ||||
| Grid : Message : 116.289000 s : Deo mflop/s per rank   6213.6 | ||||
| Grid : Message : 116.289002 s : Deo mflop/s per node   49708.9 | ||||
| Grid : Message : 116.289003 s : ================================================================================== | ||||
| Grid : Message : 116.289004 s : * SINGLE precision  | ||||
| Grid : Message : 116.289005 s : ================================================================================== | ||||
| Grid : Message : 116.481632 s : Deo mflop/s =   49737.1 (13.5) 48517.0-50338.0 | ||||
| Grid : Message : 116.481639 s : Deo mflop/s per rank   6217.1 | ||||
| Grid : Message : 116.481640 s : Deo mflop/s per node   49737.1 | ||||
| Grid : Message : 116.481641 s : ================================================================================== | ||||
| Grid : Message : 116.481642 s : 8^4  Deo Best  mflop/s        =   49737.1 ; 49737.1 per node  | ||||
| Grid : Message : 116.481644 s : 8^4  Deo Worst mflop/s        =   49708.9 ; 49708.9 per node  | ||||
| Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ;  | ||||
| Grid : Message : 116.481647 s : ================================================================================== | ||||
| Grid : Message : 116.483458 s : ================================================================================== | ||||
| Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume  | ||||
| Grid : Message : 116.483462 s : * Global volume  : 12 24 24 24  | ||||
| Grid : Message : 116.483465 s : * ranks          : 8 | ||||
| Grid : Message : 116.483466 s : * nodes          : 1 | ||||
| Grid : Message : 116.483466 s : * ranks/node     : 8 | ||||
| Grid : Message : 116.483466 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 116.483467 s : * Using 1 threads | ||||
| Grid : Message : 116.483468 s : ================================================================================== | ||||
| Grid : Message : 116.489279 s : Initialised RNGs | ||||
| Grid : Message : 116.945016 s : ================================================================================== | ||||
| Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 116.945026 s : * SINGLE precision  | ||||
| Grid : Message : 116.945027 s : ================================================================================== | ||||
| Grid : Message : 117.159821 s : Deo mflop/s =   229778.4 (89.5) 223656.1-233547.5 | ||||
| Grid : Message : 117.159835 s : Deo mflop/s per rank   28722.3 | ||||
| Grid : Message : 117.159837 s : Deo mflop/s per node   229778.4 | ||||
| Grid : Message : 117.159838 s : ================================================================================== | ||||
| Grid : Message : 117.159838 s : * SINGLE precision  | ||||
| Grid : Message : 117.159838 s : ================================================================================== | ||||
| Grid : Message : 117.371102 s : Deo mflop/s =   229516.6 (61.8) 225781.1-233547.5 | ||||
| Grid : Message : 117.371109 s : Deo mflop/s per rank   28689.6 | ||||
| Grid : Message : 117.371110 s : Deo mflop/s per node   229516.6 | ||||
| Grid : Message : 117.371111 s : ================================================================================== | ||||
| Grid : Message : 117.371111 s : 12^4  Deo Best  mflop/s        =   229778.4 ; 229778.4 per node  | ||||
| Grid : Message : 117.371113 s : 12^4  Deo Worst mflop/s        =   229516.6 ; 229516.6 per node  | ||||
| Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ;  | ||||
| Grid : Message : 117.371116 s : ================================================================================== | ||||
| Grid : Message : 117.373669 s : ================================================================================== | ||||
| Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume  | ||||
| Grid : Message : 117.373674 s : * Global volume  : 16 32 32 32  | ||||
| Grid : Message : 117.373678 s : * ranks          : 8 | ||||
| Grid : Message : 117.373679 s : * nodes          : 1 | ||||
| Grid : Message : 117.373679 s : * ranks/node     : 8 | ||||
| Grid : Message : 117.373679 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 117.373680 s : * Using 1 threads | ||||
| Grid : Message : 117.373681 s : ================================================================================== | ||||
| Grid : Message : 117.386495 s : Initialised RNGs | ||||
| Grid : Message : 118.755695 s : ================================================================================== | ||||
| Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 118.755707 s : * SINGLE precision  | ||||
| Grid : Message : 118.755708 s : ================================================================================== | ||||
| Grid : Message : 119.178990 s : Deo mflop/s =   608844.0 (126.1) 596065.5-615608.7 | ||||
| Grid : Message : 119.179160 s : Deo mflop/s per rank   76105.5 | ||||
| Grid : Message : 119.179180 s : Deo mflop/s per node   608844.0 | ||||
| Grid : Message : 119.179190 s : ================================================================================== | ||||
| Grid : Message : 119.179200 s : * SINGLE precision  | ||||
| Grid : Message : 119.179200 s : ================================================================================== | ||||
| Grid : Message : 119.271093 s : Deo mflop/s =   605259.7 (188.7) 591372.1-614349.7 | ||||
| Grid : Message : 119.271101 s : Deo mflop/s per rank   75657.5 | ||||
| Grid : Message : 119.271103 s : Deo mflop/s per node   605259.7 | ||||
| Grid : Message : 119.271104 s : ================================================================================== | ||||
| Grid : Message : 119.271105 s : 16^4  Deo Best  mflop/s        =   608844.0 ; 608844.0 per node  | ||||
| Grid : Message : 119.271107 s : 16^4  Deo Worst mflop/s        =   605259.7 ; 605259.7 per node  | ||||
| Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ;  | ||||
| Grid : Message : 119.271110 s : ================================================================================== | ||||
| Grid : Message : 119.275303 s : ================================================================================== | ||||
| Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume  | ||||
| Grid : Message : 119.275309 s : * Global volume  : 24 48 48 48  | ||||
| Grid : Message : 119.275315 s : * ranks          : 8 | ||||
| Grid : Message : 119.275316 s : * nodes          : 1 | ||||
| Grid : Message : 119.275317 s : * ranks/node     : 8 | ||||
| Grid : Message : 119.275317 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 119.275318 s : * Using 1 threads | ||||
| Grid : Message : 119.275319 s : ================================================================================== | ||||
| Grid : Message : 119.328765 s : Initialised RNGs | ||||
| Grid : Message : 126.866160 s : ================================================================================== | ||||
| Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 126.866280 s : * SINGLE precision  | ||||
| Grid : Message : 126.866290 s : ================================================================================== | ||||
| Grid : Message : 126.604376 s : Deo mflop/s =   1641161.6 (335.5) 1619660.5-1663961.9 | ||||
| Grid : Message : 126.604392 s : Deo mflop/s per rank   205145.2 | ||||
| Grid : Message : 126.604394 s : Deo mflop/s per node   1641161.6 | ||||
| Grid : Message : 126.604395 s : ================================================================================== | ||||
| Grid : Message : 126.604396 s : * SINGLE precision  | ||||
| Grid : Message : 126.604396 s : ================================================================================== | ||||
| Grid : Message : 127.829420 s : Deo mflop/s =   1620972.4 (344.9) 1602593.4-1644174.3 | ||||
| Grid : Message : 127.829520 s : Deo mflop/s per rank   202621.6 | ||||
| Grid : Message : 127.829530 s : Deo mflop/s per node   1620972.4 | ||||
| Grid : Message : 127.829540 s : ================================================================================== | ||||
| Grid : Message : 127.829550 s : 24^4  Deo Best  mflop/s        =   1641161.6 ; 1641161.6 per node  | ||||
| Grid : Message : 127.829570 s : 24^4  Deo Worst mflop/s        =   1620972.4 ; 1620972.4 per node  | ||||
| Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ;  | ||||
| Grid : Message : 127.829600 s : ================================================================================== | ||||
| Grid : Message : 127.107891 s : ================================================================================== | ||||
| Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume  | ||||
| Grid : Message : 127.107904 s : * Global volume  : 32 64 64 64  | ||||
| Grid : Message : 127.107912 s : * ranks          : 8 | ||||
| Grid : Message : 127.107913 s : * nodes          : 1 | ||||
| Grid : Message : 127.107914 s : * ranks/node     : 8 | ||||
| Grid : Message : 127.107914 s : * ranks geom     : 1 2 2 2  | ||||
| Grid : Message : 127.107915 s : * Using 1 threads | ||||
| Grid : Message : 127.107916 s : ================================================================================== | ||||
| Grid : Message : 127.257116 s : Initialised RNGs | ||||
| Grid : Message : 148.527930 s : ================================================================================== | ||||
| Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels | ||||
| Grid : Message : 148.527942 s : * SINGLE precision  | ||||
| Grid : Message : 148.527943 s : ================================================================================== | ||||
| Grid : Message : 149.401625 s : Deo mflop/s =   3085543.7 (956.0) 2934476.4-3115147.4 | ||||
| Grid : Message : 149.401643 s : Deo mflop/s per rank   385693.0 | ||||
| Grid : Message : 149.401645 s : Deo mflop/s per node   3085543.7 | ||||
| Grid : Message : 149.401646 s : ================================================================================== | ||||
| Grid : Message : 149.401647 s : * SINGLE precision  | ||||
| Grid : Message : 149.401648 s : ================================================================================== | ||||
| Grid : Message : 150.204533 s : Deo mflop/s =   3053468.5 (343.9) 3030688.8-3077255.0 | ||||
| Grid : Message : 150.204540 s : Deo mflop/s per rank   381683.6 | ||||
| Grid : Message : 150.204541 s : Deo mflop/s per node   3053468.5 | ||||
| Grid : Message : 150.204542 s : ================================================================================== | ||||
| Grid : Message : 150.204543 s : 32^4  Deo Best  mflop/s        =   3085543.7 ; 3085543.7 per node  | ||||
| Grid : Message : 150.204545 s : 32^4  Deo Worst mflop/s        =   3053468.5 ; 3053468.5 per node  | ||||
| Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||
| Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ;  | ||||
| Grid : Message : 150.204548 s : ================================================================================== | ||||
| Grid : Message : 150.292848 s : ================================================================================== | ||||
| Grid : Message : 150.292864 s :  Summary table Ls=12 | ||||
| Grid : Message : 150.292866 s : ================================================================================== | ||||
| Grid : Message : 150.292866 s : L 		 Clover 		 DWF4 		 Staggered | ||||
| Grid : Message : 150.292867 s : 8 		 154914.0 		 1386335.8 		 49737.1 | ||||
| Grid : Message : 150.292880 s : 12 		 693556.6 		 4208495.6 		 229778.4 | ||||
| Grid : Message : 150.292882 s : 16 		 1840587.3 		 6674673.6 		 608844.0 | ||||
| Grid : Message : 150.292884 s : 24 		 3933599.5 		 8572660.7 		 1641161.6 | ||||
| Grid : Message : 150.292886 s : 32 		 5082758.0 		 9771387.8 		 3085543.7 | ||||
| Grid : Message : 150.292888 s : ================================================================================== | ||||
| Grid : Message : 150.292888 s : ================================================================================== | ||||
| Grid : Message : 150.292888 s :  Memory benchmark  | ||||
| Grid : Message : 150.292888 s : ================================================================================== | ||||
| Grid : Message : 150.295495 s : ================================================================================== | ||||
| Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth | ||||
| Grid : Message : 150.295498 s : ================================================================================== | ||||
| Grid : Message : 150.295499 s :   L  		bytes			GB/s		Gflop/s		 seconds		GB/s / node | ||||
| Grid : Message : 150.295500 s : ---------------------------------------------------------- | ||||
| Grid : Message : 160.682233 s : 8		6291456.000   		379.297		31.608		10.367		379.297 | ||||
| Grid : Message : 161.851979 s : 16		100663296.000   		3754.675		312.890		1.047		3754.675 | ||||
| Grid : Message : 162.458098 s : 24		509607936.000   		6521.472		543.456		0.603		6521.472 | ||||
| Grid : Message : 162.924116 s : 32		1610612736.000   		8513.456		709.455		0.462		8513.456 | ||||
| Grid : Message : 163.363877 s : 40		3932160000.000   		9018.902		751.575		0.436		9018.902 | ||||
| Grid : Message : 163.363976 s : ================================================================================== | ||||
| Grid : Message : 163.363978 s :  Batched BLAS benchmark  | ||||
| Grid : Message : 163.363979 s : ================================================================================== | ||||
| hipblasCreate | ||||
| Grid : Message : 163.364046 s : ================================================================================== | ||||
| Grid : Message : 163.364048 s : = batched GEMM (double precision)  | ||||
| Grid : Message : 163.364048 s : ================================================================================== | ||||
| Grid : Message : 163.364048 s :   M  		N			K		Gflop/s / rank (coarse mrhs) | ||||
| Grid : Message : 163.364049 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.438476 s : 16		8		16		256		0.565 | ||||
| Grid : Message : 163.438944 s : 16		16		16		256		243.148 | ||||
| Grid : Message : 163.439501 s : 16		32		16		256		440.347 | ||||
| Grid : Message : 163.440003 s : 32		8		32		256		439.194 | ||||
| Grid : Message : 163.440463 s : 32		16		32		256		847.334 | ||||
| Grid : Message : 163.441051 s : 32		32		32		256		1430.893 | ||||
| Grid : Message : 163.441679 s : 64		8		64		256		1242.757 | ||||
| Grid : Message : 163.442354 s : 64		16		64		256		2196.689 | ||||
| Grid : Message : 163.443196 s : 64		32		64		256		3697.458 | ||||
| Grid : Message : 163.443200 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.443201 s :   M  		N			K		Gflop/s / rank (block project) | ||||
| Grid : Message : 163.443202 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.444013 s : 16		8		256		256		899.583 | ||||
| Grid : Message : 163.444933 s : 16		16		256		256		1673.538 | ||||
| Grid : Message : 163.446013 s : 16		32		256		256		2959.597 | ||||
| Grid : Message : 163.446951 s : 32		8		256		256		1558.859 | ||||
| Grid : Message : 163.447970 s : 32		16		256		256		2864.839 | ||||
| Grid : Message : 163.449240 s : 32		32		256		256		4810.671 | ||||
| Grid : Message : 163.450524 s : 64		8		256		256		2386.093 | ||||
| Grid : Message : 163.451877 s : 64		16		256		256		4451.666 | ||||
| Grid : Message : 163.453806 s : 64		32		256		256		5942.124 | ||||
| Grid : Message : 163.453809 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.453810 s :   M  		N			K		Gflop/s / rank (block promote) | ||||
| Grid : Message : 163.453811 s : ---------------------------------------------------------- | ||||
| Grid : Message : 163.454716 s : 8		256		16		256		799.867 | ||||
| Grid : Message : 163.455690 s : 16		256		16		256		1584.625 | ||||
| Grid : Message : 163.457209 s : 32		256		16		256		1949.422 | ||||
| Grid : Message : 163.458254 s : 8		256		32		256		1389.417 | ||||
| Grid : Message : 163.459339 s : 16		256		32		256		2668.344 | ||||
| Grid : Message : 163.461158 s : 32		256		32		256		3234.162 | ||||
| Grid : Message : 163.462566 s : 8		256		64		256		2150.925 | ||||
| Grid : Message : 163.464066 s : 16		256		64		256		4012.488 | ||||
| Grid : Message : 163.466272 s : 32		256		64		256		5154.786 | ||||
| Grid : Message : 163.466276 s : ================================================================================== | ||||
| Grid : Message : 163.466277 s : ================================================================================== | ||||
| Grid : Message : 163.466278 s :  Communications benchmark  | ||||
| Grid : Message : 163.466279 s : ================================================================================== | ||||
| Grid : Message : 163.466280 s : ==================================================================================================== | ||||
| Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions | ||||
| Grid : Message : 163.466281 s : ==================================================================================================== | ||||
| Grid : Message : 163.466281 s :  L  	 Ls  	bytes	 MB/s uni  		 MB/s bidi  | ||||
| Grid : Message : 163.521339 s : 16	12	 4718592 	 122513.099		245026.198 | ||||
| Grid : Message : 163.551417 s : 16	12	 4718592 	 125590.498		251180.996 | ||||
| Grid : Message : 163.572339 s : 16	12	 4718592 	 180555.489		361110.977 | ||||
| Grid : Message : 163.602810 s : 16	12	 4718592 	 123949.223		247898.447 | ||||
| Grid : Message : 163.633041 s : 16	12	 4718592 	 124933.761		249867.523 | ||||
| Grid : Message : 163.654084 s : 16	12	 4718592 	 179516.530		359033.061 | ||||
| Grid : Message : 163.756280 s : 24	12	 15925248 	 127515.473		255030.946 | ||||
| Grid : Message : 163.852651 s : 24	12	 15925248 	 132226.945		264453.890 | ||||
| Grid : Message : 163.917510 s : 24	12	 15925248 	 196474.591		392949.183 | ||||
| Grid : Message : 164.170390 s : 24	12	 15925248 	 128020.322		256040.644 | ||||
| Grid : Message : 164.113321 s : 24	12	 15925248 	 132340.948		264681.896 | ||||
| Grid : Message : 164.178314 s : 24	12	 15925248 	 196051.311		392102.622 | ||||
| Grid : Message : 164.413983 s : 32	12	 37748736 	 129411.666		258823.333 | ||||
| Grid : Message : 164.639218 s : 32	12	 37748736 	 134090.789		268181.577 | ||||
| Grid : Message : 164.789675 s : 32	12	 37748736 	 200739.096		401478.191 | ||||
| Grid : Message : 165.228910 s : 32	12	 37748736 	 129497.681		258995.363 | ||||
| Grid : Message : 165.248096 s : 32	12	 37748736 	 134103.293		268206.586 | ||||
| Grid : Message : 165.398958 s : 32	12	 37748736 	 200198.805		400397.611 | ||||
| Grid : Message : 165.399411 s : ================================================================================== | ||||
| Grid : Message : 165.399413 s :  Per Node Summary table Ls=12 | ||||
| Grid : Message : 165.399414 s : ================================================================================== | ||||
| Grid : Message : 165.399414 s :  L 		 Clover		 DWF4		 Staggered (GF/s per node) | ||||
| Grid : Message : 165.399417 s : 8 		 154914.003 	 1386335.817 	 49737.127 | ||||
| Grid : Message : 165.399423 s : 12 		 693556.579 	 4208495.611 	 229778.435 | ||||
| Grid : Message : 165.399426 s : 16 		 1840587.280 	 6674673.647 	 608844.000 | ||||
| Grid : Message : 165.399429 s : 24 		 3933599.545 	 8572660.656 	 1641161.613 | ||||
| Grid : Message : 165.399432 s : 32 		 5082757.996 	 9771387.820 	 3085543.742 | ||||
| Grid : Message : 165.399435 s : ================================================================================== | ||||
| Grid : Message : 165.399435 s : ================================================================================== | ||||
| Grid : Message : 165.399435 s :  Comparison point     result: 9172024.238 Mflop/s per node | ||||
| Grid : Message : 165.399436 s :  Comparison point is 0.5*(9771387.820+8572660.656)  | ||||
| Grid : Message : 165.399438 s : ================================================================================== | ||||
| Grid : Message : 165.399438 s : ******************************************* | ||||
| Grid : Message : 165.399438 s : ******* Grid Finalize                ****** | ||||
| Grid : Message : 165.399438 s : ******************************************* | ||||
							
								
								
									
38	systems/Frontier/benchmarks/benchusqcd.slurm	Executable file
							| @@ -0,0 +1,38 @@ | ||||
| #!/bin/bash -l | ||||
| #SBATCH --job-name=bench | ||||
| ##SBATCH --partition=small-g | ||||
| ##SBATCH -q debug | ||||
| #SBATCH --nodes=1 | ||||
| #SBATCH --ntasks-per-node=8 | ||||
| #SBATCH --cpus-per-task=7 | ||||
| #SBATCH --gpus-per-node=8 | ||||
| #SBATCH --time=00:30:00 | ||||
| #SBATCH --account=phy157_dwf | ||||
| #SBATCH --gpu-bind=none | ||||
| #SBATCH --exclusive | ||||
| #SBATCH --mem=0 | ||||
|  | ||||
| cat << EOF > select_gpu | ||||
| #!/bin/bash | ||||
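| # One GCD (GPU) per local MPI rank; memory and cores are pinned to the NUMA domain attached to that GCD | ||||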
| export GPU_MAP=(0 1 2 3 7 6 5 4) | ||||
| export NUMA_MAP=(3 3 1 1 2 2 0 0) | ||||
| export GPU=\${GPU_MAP[\$SLURM_LOCALID]} | ||||
| export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} | ||||
| export HIP_VISIBLE_DEVICES=\$GPU | ||||
| unset ROCR_VISIBLE_DEVICES | ||||
| echo RANK \$SLURM_LOCALID using GPU \$GPU     | ||||
| exec numactl -m \$NUMA -N \$NUMA \$* | ||||
| EOF | ||||
|  | ||||
| chmod +x ./select_gpu | ||||
|  | ||||
| root=$HOME/Frontier/Grid/systems/Frontier/ | ||||
| source ${root}/sourceme.sh | ||||
|  | ||||
| export OMP_NUM_THREADS=7 | ||||
| export MPICH_GPU_SUPPORT_ENABLED=1 | ||||
| #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM | ||||
|  | ||||
| srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0  > Benchmark_usqcd.log | ||||
|  | ||||
|  | ||||
| @@ -3,7 +3,7 @@ spack load c-lime | ||||
| module load emacs  | ||||
| module load PrgEnv-gnu | ||||
| module load rocm | ||||
| module load cray-mpich/8.1.23 | ||||
| module load cray-mpich | ||||
| module load gmp | ||||
| module load cray-fftw | ||||
| module load craype-accel-amd-gfx90a | ||||
|   | ||||
| @@ -2,11 +2,11 @@ | ||||
|     --enable-comms=mpi \ | ||||
|     --enable-simd=GPU \ | ||||
|     --enable-shm=nvlink \ | ||||
|     --enable-gen-simd-width=64 \ | ||||
|     --enable-accelerator=cuda \ | ||||
|     --enable-gen-simd-width=64 \ | ||||
|     --disable-gparity \ | ||||
|     --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ | ||||
|     --enable-accelerator-cshift \ | ||||
|     --disable-unified \ | ||||
|     CXX=nvcc \ | ||||
|     LDFLAGS="-cudart shared " \ | ||||
|     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" | ||||
|     LDFLAGS="-cudart shared -lcublas " \ | ||||
|     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611" | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1   | ||||
| #module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1 | ||||
| export PREFIX=/home/tc002/tc002/shared/env/prefix/ | ||||
| export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH | ||||
| module load cuda/12.3  | ||||
| module load ucx/1.15.0-cuda12.3   | ||||
| module load openmpi/4.1.5-cuda12.3 | ||||
| source /home/dp207/dp207/shared/env/production/env-base.sh  | ||||
| source /home/dp207/dp207/shared/env/production/env-gpu.sh  | ||||
| unset SBATCH_EXPORT | ||||
|  | ||||
|   | ||||
| @@ -142,7 +142,9 @@ int main (int argc, char ** argv) | ||||
|   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl; | ||||
|   int iters; | ||||
|  | ||||
|   time_t now; | ||||
|   time_t start = time(NULL); | ||||
|   UGrid->Broadcast(0,(void *)&start,sizeof(start)); | ||||
|  | ||||
|   FlightRecorder::ContinueOnFail = 0; | ||||
|   FlightRecorder::PrintEntireLog = 0; | ||||
| @@ -162,9 +164,9 @@ int main (int argc, char ** argv) | ||||
|     } | ||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||
|     result_o = Zero(); | ||||
|     t1=usecond(); | ||||
|     t1=usecond();  | ||||
|     mCG(src_o,result_o); | ||||
|     t2=usecond(); | ||||
|     t2=usecond();  | ||||
|     iters = mCG.TotalInnerIterations; //Number of inner CG iterations | ||||
|     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; | ||||
|     flops+= CGsiteflops*FrbGrid->gSites()*iters; | ||||
| @@ -176,7 +178,8 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||
|     iter ++; | ||||
|   } while (time(NULL) < (start + nsecs/10) ); | ||||
|     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||
|   } while (now < (start + nsecs/10) ); | ||||
|      | ||||
|   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; | ||||
|   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); | ||||
| @@ -189,7 +192,7 @@ int main (int argc, char ** argv) | ||||
|     } | ||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||
|     result_o_2 = Zero(); | ||||
|     t1=usecond(); | ||||
|     t1=usecond();  | ||||
|     CG(HermOpEO,src_o,result_o_2); | ||||
|     t2=usecond(); | ||||
|     iters = CG.IterationsToComplete; | ||||
| @@ -201,8 +204,9 @@ int main (int argc, char ** argv) | ||||
|     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||
|     assert(FlightRecorder::ErrorCount()==0); | ||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||
|     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||
|     i++; | ||||
|   } while (time(NULL) < (start + nsecs) ); | ||||
|   } while (now < (start + nsecs) ); | ||||
|  | ||||
|   LatticeFermionD diff_o(FrbGrid); | ||||
|   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); | ||||
|   | ||||
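The hunk above changes the timed benchmark loops so that rank 0's wall clock is broadcast to every rank before the `while` test; without this, ranks whose clocks drift could leave the loop on different iterations and desynchronise the collectives inside the solver. A minimal sketch of the same pattern, using only the Grid calls that already appear in the hunk (the 60-second `nsecs` value and the placeholder solve are illustrative):

#include <Grid/Grid.h>
#include <ctime>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                        GridDefaultSimd(Nd, vComplex::Nsimd()),
                                                        GridDefaultMpi());
  const int nsecs = 60;                                // illustrative run length in seconds

  time_t start = time(NULL);
  UGrid->Broadcast(0, (void *)&start, sizeof(start));  // rank 0 decides when the clock started

  time_t now;
  do {
    // ... one benchmark solve goes here ...
    now = time(NULL);
    UGrid->Broadcast(0, (void *)&now, sizeof(now));    // every rank sees rank 0's notion of "now"
  } while (now < (start + nsecs));

  Grid_finalize();
  return 0;
}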
							
								
								
									
118	tests/debug/Test_8888.cc	Normal file
							| @@ -0,0 +1,118 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||
|  | ||||
|     Copyright (C) 2023 | ||||
|  | ||||
| Author: Peter Boyle <pboyle@bnl.gov> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||
| #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=8; | ||||
|   const int nbasis = 40; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.01; | ||||
|   RealD M5=1.8; | ||||
|   RealD b=1.0; | ||||
|   RealD c=0.0; | ||||
|  | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||
| 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 								   GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   ///////////////////////// RNGs ///////////////////////////////// | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|   std::vector<int> cseeds({5,6,7,8}); | ||||
|  | ||||
|   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||
|   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   ///////////////////////// Configuration ///////////////////////////////// | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("ckpoint_EODWF_lat.125"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //////////////////////// Fermion action ////////////////////////////////// | ||||
|   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||
|  | ||||
|   MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf); | ||||
|  | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Fine Power method            "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|   LatticeFermionD pm_src(FGrid); | ||||
|   pm_src = ComplexD(1.0); | ||||
|   PowerMethod<LatticeFermionD>       fPM; | ||||
|   fPM(HermOp,pm_src); | ||||
|  | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Fine Lanczos  (poly, low)    "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|    | ||||
|   int Nk=80; | ||||
|   int Nm=Nk*3; | ||||
|   int Nstop=8; | ||||
|   int Nconv_test_interval=1; | ||||
|    | ||||
|   //  Chebyshev<LatticeFermionD>      IRLChebyLo(0.2,64.0,201);  // 1 iter | ||||
|   Chebyshev<LatticeFermionD>      IRLChebyLo(0.0,55.0,101);  // 1 iter | ||||
|   FunctionHermOp<LatticeFermionD>    PolyOp(IRLChebyLo,HermOp); | ||||
|   PlainHermOp<LatticeFermionD>          Op(HermOp); | ||||
|  | ||||
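|   // The Chebyshev-filtered PolyOp drives the Lanczos iteration; the unfiltered Op is kept for the eigenvalue/convergence check | ||||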
|   ImplicitlyRestartedLanczos IRL(PolyOp, | ||||
| 				 Op, | ||||
| 				 Nk, // sought vecs | ||||
| 				 Nk, // sought vecs | ||||
| 				 Nm, // spare vecs | ||||
| 				 1.0e-8, | ||||
| 				 10 // Max iterations | ||||
| 				 ); | ||||
|  | ||||
|   int Nconv; | ||||
|   std::vector<RealD>            eval(Nm); | ||||
|   std::vector<LatticeFermionD>     evec(Nm,FGrid); | ||||
|   LatticeFermionD     irl_src(FGrid); | ||||
|  | ||||
|   IRL.calc(eval,evec,irl_src,Nconv); | ||||
|  | ||||
|   Grid_finalize(); | ||||
|   return 0; | ||||
| } | ||||
| @@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf, | ||||
| 		   GridParallelRNG *RNG5) | ||||
| { | ||||
|   LatticeFermion src   (FGrid); random(*RNG5,src); | ||||
|   LatticeFermion result(FGrid); result=Zero(); | ||||
|   LatticeFermion result1(FGrid); result1=Zero(); | ||||
|   LatticeFermion result2(FGrid); result2=Zero(); | ||||
|   LatticeFermion result3(FGrid); result3=Zero(); | ||||
|  | ||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); | ||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); | ||||
|   SchurSolver(Ddwf,src,result); | ||||
|   SchurSolver(Ddwf,src,result1); | ||||
|  | ||||
|   SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG); | ||||
|   SchurSolverSymm1(Ddwf,src,result2); | ||||
|  | ||||
|   SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG); | ||||
|   SchurSolverSymm2(Ddwf,src,result3); | ||||
|  | ||||
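|   // All three Schur factorisations solve the same full system, so the two diff norms below should vanish to the CG tolerance | ||||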
|   std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl; | ||||
|  | ||||
|   std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl;  | ||||
|   result2=result2-result1; | ||||
|   std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;  | ||||
|  | ||||
|   std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl;  | ||||
|   result3=result3-result1; | ||||
|   std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;  | ||||
| } | ||||
|   | ||||
| @@ -244,7 +244,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||
|  | ||||
|    | ||||
| #if 0   | ||||
|   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); | ||||
|   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; | ||||
|    | ||||
| @@ -307,7 +307,8 @@ int main (int argc, char ** argv) | ||||
|     rh_res= Zero(); | ||||
|     mrhsCG(MrhsCoarseOp,rh_src,rh_res); | ||||
|   } | ||||
|    | ||||
|  | ||||
| #endif | ||||
|   std::cout<<GridLogMessage<<std::endl; | ||||
|   std::cout<<GridLogMessage<<std::endl; | ||||
|   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov> | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/lattice/PaddedCell.h> | ||||
| #include <Grid/stencil/GeneralLocalStencil.h> | ||||
| //#include <Grid/algorithms/GeneralCoarsenedMatrix.h> | ||||
| #include <Grid/algorithms/iterative/AdefGeneric.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||
| #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| template<class Coarsened> | ||||
| void SaveOperator(Coarsened &Operator,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(Operator.Grid()->IsBoss()); | ||||
|   assert(Operator._A.size()==Operator.geom.npoint); | ||||
|   WR.open(file); | ||||
|   for(int p=0;p<Operator._A.size();p++){ | ||||
|     auto tmp = Operator.Cell.Extract(Operator._A[p]); | ||||
|     WR.writeScidacFieldRecord(tmp,record); | ||||
|   } | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| template<class Coarsened> | ||||
| void LoadOperator(Coarsened &Operator,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   Grid::ScidacReader RD ; | ||||
|   RD.open(file); | ||||
|   assert(Operator._A.size()==Operator.geom.npoint); | ||||
|   for(int p=0;p<Operator.geom.npoint;p++){ | ||||
|     conformable(Operator._A[p].Grid(),Operator.CoarseGrid()); | ||||
|     RD.readScidacFieldRecord(Operator._A[p],record); | ||||
|   }     | ||||
|   RD.close(); | ||||
|   Operator.ExchangeCoarseLinks(); | ||||
| #endif | ||||
| } | ||||
| template<class aggregation> | ||||
| void SaveBasis(aggregation &Agg,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(Agg.FineGrid->IsBoss()); | ||||
|   WR.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     WR.writeScidacFieldRecord(Agg.subspace[b],record); | ||||
|   } | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| template<class aggregation> | ||||
| void LoadBasis(aggregation &Agg, std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacReader RD ; | ||||
|   RD.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     RD.readScidacFieldRecord(Agg.subspace[b],record); | ||||
|   }     | ||||
|   RD.close(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class Field> class TestSolver : public LinearFunction<Field> { | ||||
| public: | ||||
|   TestSolver() {}; | ||||
|   void operator() (const Field &in, Field &out){    out = Zero();  }      | ||||
| }; | ||||
|  | ||||
|  | ||||
| RealD InverseApproximation(RealD x){ | ||||
|   return 1.0/x; | ||||
| } | ||||
|  | ||||
| // Want Op in CoarsenOp to call MatPcDagMatPc | ||||
| template<class Field> | ||||
| class HermOpAdaptor : public LinearOperatorBase<Field> | ||||
| @@ -119,33 +48,37 @@ public: | ||||
|   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||
|   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||
| }; | ||||
| template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> | ||||
|  | ||||
| template<class Field> class CGSmoother : public LinearFunction<Field> | ||||
| { | ||||
| public: | ||||
|   using LinearFunction<Field>::operator(); | ||||
|   typedef LinearOperatorBase<Field> FineOperator; | ||||
|   FineOperator   & _SmootherOperator; | ||||
|   Chebyshev<Field> Cheby; | ||||
|   ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) : | ||||
|   int iters; | ||||
|   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||
|     _SmootherOperator(SmootherOperator), | ||||
|     Cheby(_lo,_hi,_ord,InverseApproximation) | ||||
|     iters(_iters) | ||||
|   { | ||||
|     std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl; | ||||
|     std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl; | ||||
|   }; | ||||
|   void operator() (const Field &in, Field &out)  | ||||
|   { | ||||
|     Field tmp(in.Grid()); | ||||
|     tmp = in; | ||||
|     Cheby(_SmootherOperator,tmp,out); | ||||
|     ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother | ||||
|  | ||||
|     out=Zero(); | ||||
|  | ||||
|     CG(_SmootherOperator,in,out); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=24; | ||||
|   const int nbasis = 40; | ||||
|   const int nbasis = 60; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.00078; | ||||
|   RealD M5=1.8; | ||||
| @@ -160,10 +93,12 @@ int main (int argc, char ** argv) | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   // Construct a coarsened grid with 4^4 cell | ||||
|   Coordinate Block({4,4,4,4}); | ||||
|   Coordinate clatt = GridDefaultLatt(); | ||||
|   for(int d=0;d<clatt.size();d++){ | ||||
|     clatt[d] = clatt[d]/4; | ||||
|     clatt[d] = clatt[d]/Block[d]; | ||||
|   } | ||||
|  | ||||
|   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||
| 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 							    GridDefaultMpi());; | ||||
| @@ -182,7 +117,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("ckpoint_lat.4000"); | ||||
|   std::string file("ckpoint_EODWF_lat.125"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //////////////////////// Fermion action ////////////////////////////////// | ||||
| @@ -192,15 +127,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||
|   HermFineMatrix FineHermOp(HermOpEO); | ||||
|    | ||||
|   LatticeFermion result(FrbGrid); result=Zero(); | ||||
|  | ||||
|   LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||
|  | ||||
|   // Run power method on FineHermOp | ||||
|   PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src); | ||||
|  | ||||
|   | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   ///////////// Coarse basis and Little Dirac Operator /////// | ||||
|   //////////////////////////////////////////////////////////// | ||||
| @@ -208,219 +135,170 @@ int main (int argc, char ** argv) | ||||
|   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||
|  | ||||
|   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||
|   NearestStencilGeometry5D geom_nn(Coarse5d); | ||||
|    | ||||
|   // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp | ||||
|  | ||||
|   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||
|   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // Need to check about red-black grid coarsening | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); | ||||
|  | ||||
|   bool load=false; | ||||
|   if ( load ) { | ||||
|     LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); | ||||
|     LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); | ||||
|   } else { | ||||
|     Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, | ||||
| 				       95.0,0.1, | ||||
| 				       //				     400,200,200 -- 48 iters | ||||
| 				       //				     600,200,200 -- 38 iters, 162s | ||||
| 				       //				     600,200,100 -- 38 iters, 169s | ||||
| 				       //				     600,200,50  -- 88 iters. 370s  | ||||
| 				       800, | ||||
| 				       200, | ||||
| 				       100, | ||||
| 				       0.0); | ||||
|     LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); | ||||
|     SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); | ||||
|     SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); | ||||
|   } | ||||
|    | ||||
|   // Try projecting to one hop only | ||||
|   LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d); | ||||
|   LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n | ||||
|   int refine=1; | ||||
|     //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, | ||||
|     //    					0.0003,1.0e-5,2000); // Lo, tol, maxit | ||||
|     //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Create Subspace"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||
|  | ||||
|   typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix; | ||||
|   HermMatrix CoarseOp     (LittleDiracOp); | ||||
|   HermMatrix CoarseOpProj (LittleDiracOpProj); | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Refine Subspace"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters | ||||
|    | ||||
|   ////////////////////////////////////////// | ||||
|   // Build a coarse lanczos | ||||
|   ////////////////////////////////////////// | ||||
|   Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter | ||||
|   FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp); | ||||
|   PlainHermOp<CoarseVector>    IRLOp    (CoarseOp); | ||||
|   int Nk=48; | ||||
|   int Nm=64; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Coarsen after refine"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   Aggregates.Orthogonalise(); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true); | ||||
|      | ||||
|   const int nrhs=12; | ||||
|      | ||||
|   Coordinate mpi=GridDefaultMpi(); | ||||
|   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||
|   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||
|   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||
|      | ||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||
|   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||
|   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||
|  | ||||
|   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||
|   Chebyshev<CoarseVector>      IRLCheby(0.01,42.0,301);  // 1 iter | ||||
|   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||
|  | ||||
|   CoarseVector pm_src(CoarseMrhs); | ||||
|   pm_src = ComplexD(1.0); | ||||
|   PowerMethod<CoarseVector>       cPM; | ||||
|   cPM(MrhsCoarseOp,pm_src); | ||||
|  | ||||
|   int Nk=192; | ||||
|   int Nm=384; | ||||
|   int Nstop=Nk; | ||||
|   ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20); | ||||
|   int Nconv_test_interval=1; | ||||
|    | ||||
|   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||
| 							  Coarse5d, | ||||
| 							  CoarseMrhs, | ||||
| 							  nrhs, | ||||
| 							  IRLCheby, | ||||
| 							  Nstop, | ||||
| 							  Nconv_test_interval, | ||||
| 							  nrhs, | ||||
| 							  Nk, | ||||
| 							  Nm, | ||||
| 							  1e-5,10); | ||||
|  | ||||
|   int Nconv; | ||||
|   std::vector<RealD>            eval(Nm); | ||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||
|   CoarseVector c_src(Coarse5d); | ||||
|   //c_src=1.0; | ||||
|   random(CRNG,c_src); | ||||
|  | ||||
|   CoarseVector c_res(Coarse5d);  | ||||
|   CoarseVector c_ref(Coarse5d);  | ||||
|  | ||||
|   PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src); | ||||
|  | ||||
|   IRL.calc(eval,evec,c_src,Nconv); | ||||
|   DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval); | ||||
|   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||
|  | ||||
|   ////////////////////////////////////////// | ||||
|   // Build a coarse space solver | ||||
|   // Block projector for coarse/fine | ||||
|   ////////////////////////////////////////// | ||||
|   int maxit=20000; | ||||
|   ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false); | ||||
|   ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false); | ||||
|   ZeroGuesser<CoarseVector> CoarseZeroGuesser; | ||||
|  | ||||
|   //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser); | ||||
|   HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser); | ||||
|   c_res=Zero(); | ||||
|   HPDSolve(c_src,c_res); c_ref = c_res; | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl; | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   // Deflated (with real op EV's) solve for the projected coarse op | ||||
|   // Work towards ADEF1 in the coarse space | ||||
|   ////////////////////////////////////////////////////////////////////////// | ||||
|   HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser); | ||||
|   c_res=Zero(); | ||||
|   HPDSolveProj(c_src,c_res); | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl; | ||||
|   c_res = c_res - c_ref; | ||||
|   std::cout << "Projected solver error "<<norm2(c_res)<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||
|   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||
|   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||
|  | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   // Coarse ADEF1 with deflation space | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   ChebyshevSmoother<CoarseVector,HermMatrix > | ||||
|     CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence | ||||
|     //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  // | ||||
|   //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s | ||||
|   //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s | ||||
|   //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter | ||||
|   //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter | ||||
|   //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311 | ||||
|  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // CG, Cheby mode spacing 200,200 | ||||
|   // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s | ||||
|   // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s | ||||
|   // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs  | ||||
|   //////////////////////////////////////////////////////// | ||||
|   // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain | ||||
|   // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s | ||||
|   // HDCG 38 iters 162s | ||||
|   // | ||||
|   // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs  | ||||
|   // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain | ||||
|   // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s | ||||
|   // HDCG 38 iters 169s | ||||
|  | ||||
|   TwoLevelADEF1defl<CoarseVector> | ||||
|     cADEF1(1.0e-8, 500, | ||||
| 	   CoarseOp, | ||||
| 	   CoarseSmoother, | ||||
| 	   evec,eval); | ||||
|  | ||||
|   c_res=Zero(); | ||||
|   cADEF1(c_src,c_res); | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; | ||||
|   c_res = c_res - c_ref; | ||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; | ||||
|    | ||||
|   //  cADEF1.Tolerance = 4.0e-2; | ||||
|   //  cADEF1.Tolerance = 1.0e-1; | ||||
|   cADEF1.Tolerance = 5.0e-2; | ||||
|   c_res=Zero(); | ||||
|   cADEF1(c_src,c_res); | ||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; | ||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; | ||||
|   c_res = c_res - c_ref; | ||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; | ||||
|    | ||||
|   ////////////////////////////////////////// | ||||
|   // Build a smoother | ||||
|   ////////////////////////////////////////// | ||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499 | ||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383 | ||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328 | ||||
|   //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1 | ||||
|   //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations | ||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters | ||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters | ||||
|  | ||||
|   // | ||||
|   // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40 | ||||
|   // Need to measure cost of coarse space. | ||||
|   // | ||||
|   // -- i) Reduce coarse residual   -- 0.04 | ||||
|   // -- ii) Lanczos on coarse space -- done | ||||
|   // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and | ||||
|   //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec | ||||
|   // | ||||
|   std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters | ||||
|  | ||||
|   //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) | ||||
|   std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)   | ||||
|  | ||||
|   for(int l=0;l<los.size();l++){ | ||||
|  | ||||
|     RealD lo = los[l]; | ||||
|  | ||||
|     for(int o=0;o<ords.size();o++){ | ||||
|  | ||||
|       ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false); | ||||
|       HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); | ||||
|        | ||||
|       //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case | ||||
|       ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311 | ||||
|  | ||||
|       ////////////////////////////////////////// | ||||
|       // Build a HDCG solver | ||||
|       ////////////////////////////////////////// | ||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> | ||||
| 	HDCG(1.0e-8, 100, | ||||
| 	     FineHermOp, | ||||
| 	     Smoother, | ||||
| 	     HPDSolveSloppy, | ||||
| 	     HPDSolve, | ||||
| 	     Aggregates); | ||||
|  | ||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> | ||||
| 	HDCGdefl(1.0e-8, 100, | ||||
| 		 FineHermOp, | ||||
| 		 Smoother, | ||||
| 		 cADEF1, | ||||
| 		 HPDSolve, | ||||
| 		 Aggregates); | ||||
|        | ||||
|       result=Zero(); | ||||
|       HDCGdefl(src,result); | ||||
|  | ||||
|       result=Zero(); | ||||
|       HDCG(src,result); | ||||
|  | ||||
|        | ||||
|     } | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   evec.resize(Nm,Coarse5d); | ||||
|   eval.resize(Nm); | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(CRNG,c_src[r]); | ||||
|   } | ||||
|  | ||||
|   // Standard CG | ||||
|   result=Zero(); | ||||
|   CGfine(HermOpEO, src, result); | ||||
|   IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl); | ||||
|  | ||||
|   /////////////////////// | ||||
|   // Deflation guesser object | ||||
|   /////////////////////// | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " Reimport coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||
|   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||
|        | ||||
|   ////////////////////////// | ||||
|   // Extra HDCG parameters | ||||
|   ////////////////////////// | ||||
|   int maxit=3000; | ||||
|   ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false); | ||||
|   RealD lo=2.0; | ||||
|   int ord = 7; | ||||
|  | ||||
|   DoNothingGuesser<CoarseVector> DoNothing; | ||||
|   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||
|  | ||||
|   ///////////////////////////////////////////////// | ||||
|   // Mirs smoother | ||||
|   ///////////////////////////////////////////////// | ||||
|   RealD MirsShift = lo; | ||||
|   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||
|   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||
|  | ||||
|   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||
|     HDCGmrhs(1.0e-8, 500, | ||||
| 	     FineHermOp, | ||||
| 	     CGsmooth, | ||||
| 	     HPDSolveMrhs,    // Used in M1 | ||||
| 	     HPDSolveMrhs,          // Used in Vstart | ||||
| 	     MrhsProjector, | ||||
| 	     MrhsGuesser, | ||||
| 	     CoarseMrhs); | ||||
|      | ||||
|   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||
|   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||
|    | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(RNG5,src_mrhs[r]); | ||||
|     res_mrhs[r]=Zero(); | ||||
|   } | ||||
|    | ||||
|   HDCGmrhs(src_mrhs,res_mrhs); | ||||
|  | ||||
|   // Standard CG | ||||
| #if 1 | ||||
|   { | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Calling red black CG"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|        | ||||
|     LatticeFermion result(FrbGrid); result=Zero(); | ||||
|     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||
|     result=Zero(); | ||||
|  | ||||
|     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||
|     CGfine(HermOpEO, src, result); | ||||
|   } | ||||
| #endif   | ||||
|   Grid_finalize(); | ||||
|   return 0; | ||||
| } | ||||
|   | ||||
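The hunk above replaces the Chebyshev fine smoother with a fixed-iteration "Mirs" CG smoother: a set number of CG iterations on the shifted operator (H + MirsShift) with tolerance 0, so the smoother never "converges" and is not meant to. The pieces are scattered across the hunk, so the following is a condensed, self-contained sketch that assembles them. The Mobius action parameters and the ckpoint_EODWF_lat.125 configuration are borrowed from Test_8888.cc above purely to make the sketch buildable; the shift (2.0) and iteration count (7) are the lo/ord values used in the test.

#include <Grid/Grid.h>

using namespace Grid;

// Fixed-iteration CG smoother from the test above: tolerance 0 means it always
// runs the full "iters" iterations, and non-convergence is expected and harmless.
template<class Field> class CGSmoother : public LinearFunction<Field> {
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator &_SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator)
    : _SmootherOperator(SmootherOperator), iters(_iters)
  {
    std::cout << GridLogMessage << " Mirs smoother order " << iters << std::endl;
  }
  void operator()(const Field &in, Field &out) {
    ConjugateGradient<Field> CG(0.0, iters, false); // false: do not abort on non-convergence
    out = Zero();
    CG(_SmootherOperator, in, out);
  }
};

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  const int Ls = 8;
  RealD mass = 0.01, M5 = 1.8, b = 1.0, c = 0.0;

  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(std::vector<int>({5, 6, 7, 8}));

  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_EODWF_lat.125");
  NerscIO::readConfiguration(Umu, header, file);

  MobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, b, c);
  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);

  // Smooth with (H + MirsShift): the shift regularises the low end so a few CG
  // iterations act on the high modes, which is the smoother's job in the two-level cycle.
  RealD MirsShift = 2.0;                                       // "lo" in the test
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO, MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(7, ShiftedFineHermOp);  // "ord" = 7 in the test

  LatticeFermionD src(FrbGrid);      random(RNG5, src);
  LatticeFermionD smoothed(FrbGrid);
  CGsmooth(src, smoothed);           // one smoother application: 7 CG iterations on (H + shift)

  Grid_finalize();
  return 0;
}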
| @@ -145,7 +145,7 @@ int main (int argc, char ** argv) | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=24; | ||||
|   const int nbasis = 60; | ||||
|   const int nbasis = 62; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.00078; | ||||
|   RealD M5=1.8; | ||||
| @@ -160,7 +160,7 @@ int main (int argc, char ** argv) | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   // Construct a coarsened grid with 4^4 cell | ||||
|   Coordinate Block({4,4,4,4}); | ||||
|   Coordinate Block({4,4,6,4}); | ||||
|   Coordinate clatt = GridDefaultLatt(); | ||||
|   for(int d=0;d<clatt.size();d++){ | ||||
|     clatt[d] = clatt[d]/Block[d]; | ||||
|   | ||||
							
								
								
									
396	tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc	Normal file
							| @@ -0,0 +1,396 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||
|  | ||||
|     Copyright (C) 2023 | ||||
|  | ||||
| Author: Peter Boyle <pboyle@bnl.gov> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||
| #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||
| #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| template<class aggregation> | ||||
| void SaveBasis(aggregation &Agg,std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(Agg.FineGrid->IsBoss()); | ||||
|   WR.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||
|     //    WR.writeScidacFieldRecord(Agg.subspace[b],record); | ||||
|   } | ||||
|   WR.close(); | ||||
| #endif | ||||
| } | ||||
| template<class aggregation> | ||||
| void LoadBasis(aggregation &Agg, std::string file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacReader RD ; | ||||
|   RD.open(file); | ||||
|   for(int b=0;b<Agg.subspace.size();b++){ | ||||
|     RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||
|     //    RD.readScidacFieldRecord(Agg.subspace[b],record,0); | ||||
|   }     | ||||
|   RD.close(); | ||||
| #endif | ||||
| } | ||||
| template<class CoarseVector> | ||||
| void SaveEigenvectors(std::vector<RealD>            &eval, | ||||
| 		      std::vector<CoarseVector>     &evec, | ||||
| 		      std::string evec_file, | ||||
| 		      std::string eval_file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   emptyUserRecord record; | ||||
|   ScidacWriter WR(evec[0].Grid()->IsBoss()); | ||||
|   WR.open(evec_file); | ||||
|   for(int b=0;b<evec.size();b++){ | ||||
|     WR.writeScidacFieldRecord(evec[b],record,0,0); | ||||
|   } | ||||
|   WR.close(); | ||||
|   XmlWriter WRx(eval_file); | ||||
|   write(WRx,"evals",eval); | ||||
| #endif | ||||
| } | ||||
| template<class CoarseVector> | ||||
| void LoadEigenvectors(std::vector<RealD>            &eval, | ||||
| 		      std::vector<CoarseVector>     &evec, | ||||
| 		      std::string evec_file, | ||||
| 		      std::string eval_file) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|     XmlReader RDx(eval_file); | ||||
|     read(RDx,"evals",eval); | ||||
|     emptyUserRecord record; | ||||
|  | ||||
|     Grid::ScidacReader RD ; | ||||
|     RD.open(evec_file); | ||||
|     assert(evec.size()==eval.size()); | ||||
|     for(int k=0;k<eval.size();k++) { | ||||
|       RD.readScidacFieldRecord(evec[k],record); | ||||
|     } | ||||
|     RD.close(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| // Want Op in CoarsenOp to call MatPcDagMatPc | ||||
| template<class Field> | ||||
| class HermOpAdaptor : public LinearOperatorBase<Field> | ||||
| { | ||||
|   LinearOperatorBase<Field> & wrapped; | ||||
| public: | ||||
|   HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {}; | ||||
|   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  } | ||||
|   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); } | ||||
|   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  } | ||||
|   void OpDiag (const Field &in, Field &out)                  {    assert(0);  } | ||||
|   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  } | ||||
|   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||
|   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||
| }; | ||||
|  | ||||
| template<class Field> class CGSmoother : public LinearFunction<Field> | ||||
| { | ||||
| public: | ||||
|   using LinearFunction<Field>::operator(); | ||||
|   typedef LinearOperatorBase<Field> FineOperator; | ||||
|   FineOperator   & _SmootherOperator; | ||||
|   int iters; | ||||
|   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||
|     _SmootherOperator(SmootherOperator), | ||||
|     iters(_iters) | ||||
|   { | ||||
|     std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl; | ||||
|   }; | ||||
|   void operator() (const Field &in, Field &out)  | ||||
|   { | ||||
|     ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother | ||||
|  | ||||
|     out=Zero(); | ||||
|  | ||||
|     CG(_SmootherOperator,in,out); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   const int Ls=24; | ||||
|   const int nbasis = 60; | ||||
|   const int cb = 0 ; | ||||
|   RealD mass=0.00078; | ||||
|   RealD M5=1.8; | ||||
|   RealD b=1.5; | ||||
|   RealD c=0.5; | ||||
|  | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||
| 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 								   GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   // Construct a coarsened grid with 4^4 cell | ||||
|   //  Coordinate Block({4,4,6,4}); | ||||
|   Coordinate Block({4,4,4,4}); | ||||
|   Coordinate clatt = GridDefaultLatt(); | ||||
|   for(int d=0;d<clatt.size();d++){ | ||||
|     clatt[d] = clatt[d]/Block[d]; | ||||
|   } | ||||
|  | ||||
|   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||
| 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||
| 							    GridDefaultMpi());; | ||||
|   GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); | ||||
|  | ||||
|   ///////////////////////// RNGs ///////////////////////////////// | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|   std::vector<int> cseeds({5,6,7,8}); | ||||
|  | ||||
|   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||
|   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||
|   GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); | ||||
|  | ||||
|   ///////////////////////// Configuration ///////////////////////////////// | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //////////////////////// Fermion action ////////////////////////////////// | ||||
|   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||
|  | ||||
|   SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf); | ||||
|  | ||||
|   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||
|   HermFineMatrix FineHermOp(HermOpEO); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   ///////////// Coarse basis and Little Dirac Operator /////// | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator; | ||||
|   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||
|  | ||||
|   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||
|  | ||||
|   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||
|   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||
|  | ||||
|   //////////////////////////////////////////////////////////// | ||||
|   // Need to check about red-black grid coarsening | ||||
|   //////////////////////////////////////////////////////////// | ||||
|  | ||||
|   //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60"); | ||||
|   std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60"); | ||||
|   std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2"); | ||||
|   std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60"); | ||||
|   std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); | ||||
|   std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); | ||||
|   bool load_agg=true; | ||||
|   bool load_refine=true; | ||||
|   bool load_mat=false; | ||||
|   bool load_evec=false; | ||||
|  | ||||
|   int refine=1; | ||||
|   if ( load_agg ) { | ||||
|     if ( !(refine) || (!load_refine) ) {  | ||||
|       LoadBasis(Aggregates,subspace_file); | ||||
|     } | ||||
|   } else { | ||||
|     Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||
|     SaveBasis(Aggregates,subspace_file); | ||||
|   } | ||||
|  | ||||
|   if ( load_refine ) { | ||||
|     std::cout << " Load Refine "<< refine_file <<std::endl; | ||||
|     LoadBasis(Aggregates,refine_file); | ||||
|   } else { | ||||
|     Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters | ||||
|     //    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters | ||||
|     SaveBasis(Aggregates,refine_file); | ||||
|   } | ||||
|   Aggregates.Orthogonalise(); | ||||
|    | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|      | ||||
|   const int nrhs=12; | ||||
|      | ||||
|   Coordinate mpi=GridDefaultMpi(); | ||||
|   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||
|   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||
|   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||
|      | ||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||
|   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||
|   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||
|  | ||||
|   /////////////////////// | ||||
|   // Deflation guesser object | ||||
|   /////////////////////// | ||||
|   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||
|  | ||||
|   ////////////////////////////////////////// | ||||
|   // Block projector for coarse/fine | ||||
|   ////////////////////////////////////////// | ||||
|   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Coarsen after refine"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 4.4.6.4 | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,501);  // for 4.4.4.4 blocking 350 evs | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0014,42.0,501);  // for 4.4.4.4 blocking 700 evs | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.002,42.0,501);  // for 4.4.4.4 blocking 1226 evs | ||||
|   //  Chebyshev<CoarseVector>      IRLCheby(0.0025,42.0,501);  // for 4.4.4.4 blocking 1059 evs | ||||
| 							  //							  3e-4,2); | ||||
|   Chebyshev<CoarseVector>      IRLCheby(0.0018,42.0,301);  // for 4.4.4.4 blocking  // 790 evs | ||||
|    | ||||
|   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||
|  | ||||
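|   // Power method on a unit source estimates the top of the coarse spectrum, as a check | ||||
|   // that the Chebyshev upper bound of 42.0 chosen above is safe. | ||||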
|   CoarseVector pm_src(CoarseMrhs); | ||||
|   pm_src = ComplexD(1.0); | ||||
|   PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src); | ||||
|  | ||||
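|   // Block Lanczos sizing: Nk vectors kept per restart, Nm the maximum basis size; the run | ||||
|   // stops once Nstop eigenpairs converge, tested every cycle with block width nrhs. | ||||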
|   //  int Nk=nrhs*30; // 4.4.6.4 | ||||
|   //  int Nk=nrhs*80; | ||||
|   int Nk=nrhs*60; // 720 | ||||
|   int Nm=Nk*4;    // 2880 ; generally finishes at 1440 | ||||
|   int Nstop=512; | ||||
|   int Nconv_test_interval=1; | ||||
|    | ||||
|   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||
| 							  Coarse5d, | ||||
| 							  CoarseMrhs, | ||||
| 							  nrhs, | ||||
| 							  IRLCheby, | ||||
| 							  Nstop, | ||||
| 							  Nconv_test_interval, | ||||
| 							  nrhs, | ||||
| 							  Nk, | ||||
| 							  Nm, | ||||
| 							  3e-4,2); | ||||
|  | ||||
|   std::vector<RealD>            eval(Nm); | ||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||
|   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   evec.resize(Nm,Coarse5d); | ||||
|   eval.resize(Nm); | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(CRNG,c_src[r]); | ||||
|   } | ||||
|   int Nconv; | ||||
|   IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl); | ||||
|   Nconv = eval.size(); | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << " import coarse evecs  "<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
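|   // Hand the converged coarse eigenpairs to the deflation guesser, which will supply | ||||
|   // deflated starting guesses for the coarse-grid solves inside HDCG. | ||||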
|   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||
|  | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
|   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||
|   std::cout << "**************************************"<<std::endl; | ||||
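|   // The block projector holds the nbasis fine-grid near-null vectors and performs the | ||||
|   // coarse<->fine project/promote steps for all nrhs vectors at once. | ||||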
|   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||
|   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||
|  | ||||
|   ////////////////////////// | ||||
|   // Extra HDCG parameters | ||||
|   ////////////////////////// | ||||
|   int maxit=3000;                                            // cap on inner coarse-grid CG iterations | ||||
|   ConjugateGradient<CoarseVector>  CG(7.5e-2,maxit,false);   // loose-tolerance coarse-level solver | ||||
|   RealD lo=2.0;                                              // shift fed to the Mirs smoother below | ||||
|   int ord = 7;                                               // iteration count of the smoother CG | ||||
|  | ||||
|   DoNothingGuesser<CoarseVector> DoNothing; | ||||
|   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||
|   HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing); | ||||
|  | ||||
|   ///////////////////////////////////////////////// | ||||
|   // Mirs smoother | ||||
|   ///////////////////////////////////////////////// | ||||
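|   // Mirs smoother: a short, fixed-iteration CG (ord iterations) on the shifted operator | ||||
|   // HermOpEO + MirsShift; the shift regularises the truncated solve. | ||||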
|   RealD MirsShift = lo; | ||||
|   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||
|   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||
|    | ||||
|    | ||||
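|   // Two-level ADEF-2 multi-RHS solver: outer tolerance 1.0e-8, at most 500 outer iterations; | ||||
|   // the same coarse HPD solve is reused for both the M1 preconditioner and Vstart. | ||||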
|   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||
|     HDCGmrhs(1.0e-8, 500, | ||||
| 	     FineHermOp, | ||||
| 	     CGsmooth, | ||||
| 	     HPDSolveMrhs,    // Used in M1 | ||||
| 	     HPDSolveMrhs,          // Used in Vstart | ||||
| 	     MrhsProjector, | ||||
| 	     MrhsGuesser, | ||||
| 	     CoarseMrhs); | ||||
|      | ||||
|   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||
|   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||
|    | ||||
|   for(int r=0;r<nrhs;r++){ | ||||
|     random(RNG5,src_mrhs[r]); | ||||
|     res_mrhs[r]=Zero(); | ||||
|   } | ||||
|    | ||||
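|   // Solve all nrhs right-hand sides in a single multi-RHS HDCG call. | ||||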
|   HDCGmrhs(src_mrhs,res_mrhs); | ||||
|  | ||||
|   // Standard CG | ||||
| #if 0 | ||||
|   { | ||||
|     std::cout << "**************************************"<<std::endl; | ||||
|     std::cout << "Calling red black CG"<<std::endl; | ||||
|     std::cout << "**************************************"<<std::endl; | ||||
|  | ||||
|     LatticeFermion result(FrbGrid); result=Zero(); | ||||
|     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||
|     result=Zero(); | ||||
|  | ||||
|     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||
|     CGfine(HermOpEO, src, result); | ||||
|   } | ||||
| #endif   | ||||
|   Grid_finalize(); | ||||
|   return 0; | ||||
| } | ||||