Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 20:14:32 +00:00)

Compare commits: 66 commits, 9a1ad6a5eb ... feature/ft
| SHA1 |
|---|
| bffd30abec |
| da919949f9 |
| b12b4fdaff |
| 557fa483ff |
| fc15d55df6 |
| 53573d7d94 |
| bb3c177000 |
| a3322b470f |
| f8f408e7a9 |
| baac1127d0 |
| 6f1328160c |
| 04cf902791 |
| 7a5b1c1a19 |
| 18d2d7da4a |
| b461184797 |
| 4563b39305 |
| c9d5674d5b |
| 486412635a |
| 8b23a1546a |
| a901e4e369 |
| 804d9367d4 |
| 41d8adca95 |
| 059e8e5bb0 |
| b3ee8ded96 |
| cf3584ad15 |
| a66973163f |
| 4502a8c8a1 |
| 9c902e4c2d |
| f3eb36adcf |
| 7c246606c1 |
| 172c75029e |
| 6ae52da571 |
| 4ee9c68053 |
| a15b4378a3 |
| 89fdd7f8dd |
| c328be24b7 |
| a73dc6dbf4 |
| eee2a2657f |
| 12b8be7cb9 |
| 63c223ea5d |
| 2877fb4a2c |
| d299c86633 |
| 6ce52092e8 |
| b5926c1d21 |
| 9563238e9b |
| fb9b1d76ca |
| 1739146599 |
| ed20b39ab3 |
| 284fc05f15 |
| 07a07b6fa3 |
| dc80b08969 |
| a49a161f8d |
| a6479ca50f |
| 0e607a55e7 |
| c4b9f71357 |
| 394e506aea |
| e19b26341b |
| cfe1b13225 |
| 890c5ea1cd |
| a87378d3b6 |
| 832fc08809 |
| 461cd045c6 |
| fee65d7a75 |
| 31f9971dbf |
| d87296f3e8 |
| be94cf1c6f |
BLAS_benchmark/BatchBlasBench.cc (new file, 1052 lines added): file diff suppressed because it is too large.
BLAS_benchmark/compile-command (new file, 2 lines added):

```diff
@@ -0,0 +1,2 @@
+
+mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
```
```diff
@@ -30,9 +30,14 @@ directory
 
 #include <type_traits>
 #include <cassert>
+#include <exception>
+
 #define NAMESPACE_BEGIN(A) namespace A {
 #define NAMESPACE_END(A)   }
 #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 #define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
 #define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  ); 
 
+#define EXCEPTION_CHECK_BEGIN(A) try {
+#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
+
```
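The new EXCEPTION_CHECK_BEGIN/END pair wraps a region in a try/catch that logs a backtrace and the caught message, then rethrows. A minimal sketch of the intended usage, under my reading of the patch (the macro definitions are copied from the hunk above; BACKTRACEFP lives elsewhere in Grid, so it is stubbed here, and ExampleRegion is a hypothetical call site):

```cpp
#include <cstdio>
#include <exception>
#include <iostream>
#include <vector>

// Stub: the real BACKTRACEFP prints a stack trace to the given FILE*.
#define BACKTRACEFP(fp) std::fprintf(fp, "(backtrace elided in this sketch)\n")

// Copied from the patch above.
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

void ExampleRegion(void)   // hypothetical call site
{
  EXCEPTION_CHECK_BEGIN(ExampleRegion);
  std::vector<double> huge(~0ull);   // throws std::length_error or std::bad_alloc
  EXCEPTION_CHECK_END(ExampleRegion);
}

int main(void)
{
  try { ExampleRegion(); }
  catch (const std::exception &e) { std::cerr << "rethrown: " << e.what() << std::endl; }
  return 0;
}
```

One observation worth flagging: the macro catches `std::exception e` by value, which slices a derived exception before `e.what()` is called; catching `const std::exception &` would preserve the original message.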
```diff
@@ -89,9 +89,10 @@ public:
       gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::cpu_selector selector;
+      cl::sycl::gpu_selector selector;
       cl::sycl::device selectedDevice { selector };
-      gridblasHandle =new sycl::queue (selectedDevice);
+      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
       gridblasInit=1;
     }
@@ -207,6 +208,9 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 
+    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    assert(OpB!=GridBLAS_OP_T);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -266,26 +270,130 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (ComplexD *) &alpha_p[0],
+						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexD *) &beta_p[0],
+						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
+#if 0
+      // This code was used to check the mat mul on Sunspot/OneMKL
+      std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
+      std::vector<ComplexD> A(m*k);  // pointer list to matrices
+      std::vector<ComplexD> B(k*n);
+      std::vector<ComplexD> C(m*n);
+      //      int sda = lda*k;
+      //      int sdb = ldb*k;
+      //      int sdc = ldc*n;
+      std::cerr << " Checking the GEMM results "<<std::endl;
+      for (int p = 0; p < 1; ++p) {
+	ComplexD * Amk_p;  // pointer list to matrices
+	ComplexD * Bkn_p;  // pointer list to matrices
+	ComplexD * Cmn_p;  // pointer list to matrices
+	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
+	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
+	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
+	std::cerr << " p " << p << " copied pointers "<<std::endl;
+	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
+	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
+	std::cerr << " p " << p << " copied matrices "<<std::endl;
+	std::cerr << " C[0] "<<C[0]<<std::endl;
+	std::cerr << " A[0] "<<A[0]<<std::endl;
+	std::cerr << " B[0] "<<B[0]<<std::endl;
+	std::cerr << " m "<<m<<std::endl;
+	std::cerr << " n "<<n<<std::endl;
+	std::cerr << " k "<<k<<std::endl;
+	for (int mm = 0; mm < m; ++mm) {
+	  for (int nn = 0; nn < n; ++nn) {
+	    ComplexD c_mn(0.0);
+	    for (int kk = 0; kk < k; ++kk) {
+	      int idx_a, idx_b;
+	      //    int lda = m; // m x k column major
+	      //    int ldb = k; // k x n column major
+	      //    int ldc = m; // m x b column major
+	      if(OpA!=GridBLAS_OP_N) {
+		idx_a =kk + mm*lda;
+	      } else {
+		idx_a =mm + kk*lda;
+	      }
+	      if(OpB!=GridBLAS_OP_N) {
+		idx_b =nn + kk*ldb;
+	      } else {
+		idx_b =kk + nn*ldb;
+	      }
+	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
+
+	      ComplexD Ac = A[idx_a];
+	      ComplexD Bc = B[idx_b];
+	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
+	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
+	      
+	      c_mn += Ac*Bc;
+	    }
+	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
+	  }
 	}
       }
-    }
 #endif
-    //    synchronise();
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+        });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else { 
+	assert(0);
+      }
+#endif
      RealD t1=usecond();
      RealD flops = 8.0*m*n*k*batchCount;
      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@@ -306,6 +414,9 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
+    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    assert(OpB!=GridBLAS_OP_T);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -366,26 +477,69 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (ComplexF *) &alpha_p[0],
+						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						  (ComplexF *) &beta_p[0],
+						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    ComplexF alphaf(real(alpha),imag(alpha));
-    ComplexF betaf(real(beta),imag(beta));
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexF c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+	  } );
+      } else { 
+	assert(0);
       }
-    }
 #endif
      RealD t1=usecond();
      RealD flops = 8.0*m*n*k*batchCount;
@@ -408,6 +562,9 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -467,24 +624,69 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+      int64_t m64=m;
+      int64_t n64=n;
+      int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
+      int64_t batchCount64=batchCount;
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (float *) &alpha_p[0],
+						  (const float **)&Amk[0], (const int64_t *)&lda64,
+						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
+						  (float *) &beta_p[0],
+						  (float **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  } );
+      } else { 
+	assert(0);
       }
-    }
 #endif
      RealD t1=usecond();
      RealD flops = 2.0*m*n*k*batchCount;
@@ -495,7 +697,6 @@ public:
   ///////////////////////////////////////////////////////////////////////////
   // Double precision real GEMM
   ///////////////////////////////////////////////////////////////////////////
-
   void gemmBatched(GridBLASOperation_t OpA,
 		   GridBLASOperation_t OpB,
 		   int m,int n, int k,
@@ -508,6 +709,9 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
     int ldc = m; // m x b column major
@@ -568,160 +772,124 @@ public:
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    /*
       int64_t m64=m;
       int64_t n64=n;
       int64_t k64=k;
+      int64_t lda64=lda;
+      int64_t ldb64=ldb;
+      int64_t ldc64=ldc;
       int64_t batchCount64=batchCount;
-      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-      onemkl::transpose::N,
-      onemkl::transpose::N,
-      &m64,&n64,&k64,
-      (double *) &alpha_p[0],
-      (double **)&Amk[0], lda,
-      (double **)&Bkn[0], ldb,
-      (double *) &beta_p[0],
-      (double **)&Cmn[0], ldc,
-      1,&batchCount64);
-     */
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
+
+      oneapi::mkl::transpose iOpA;
+      oneapi::mkl::transpose iOpB;
+      
+      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						  &iOpA,
+						  &iOpB,
+						  &m64,&n64,&k64,
+						  (double *) &alpha_p[0],
+						  (const double **)&Amk[0], (const int64_t *)&lda64,
+						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
+						  (double *) &beta_p[0],
+						  (double **)&Cmn[0], (const int64_t *)&ldc64,
+						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
+      synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    int sda = lda*k;
-    int sdb = ldb*k;
-    int sdc = ldc*n;
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k; ++kk)
-	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
-	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
-	}
+    // Need a default/reference implementation; use Eigen
+      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+	  });
+      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+	thread_for (p, batchCount, {
+	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+	  });
+      } else { 
+	assert(0);
       }
-    }
 #endif
      RealD t1=usecond();
      RealD flops = 2.0*m*n*k*batchCount;
      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
   }
-  
-
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Strided case used by benchmark, but generally unused in Grid
-  // Keep a code example in double complex, but don't generate the single and real variants for now
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  
-  void gemmStridedBatched(int m,int n, int k,
-			  ComplexD alpha,
-			  ComplexD* Amk,  // pointer list to matrices
-			  ComplexD* Bkn,
-			  ComplexD beta,
-			  ComplexD* Cmn,
-			  int batchCount)
-  {
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    int sda = m*k;
-    int sdb = k*n;
-    int sdc = m*n;
-    deviceVector<ComplexD> alpha_p(1);
-    deviceVector<ComplexD> beta_p(1);
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
-    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
-#ifdef GRID_HIP
-    auto err = hipblasZgemmStridedBatched(gridblasHandle,
-					  HIPBLAS_OP_N,
-					  HIPBLAS_OP_N,
-					  m,n,k,
-					  (hipblasDoubleComplex *) &alpha_p[0],
-					  (hipblasDoubleComplex *) Amk, lda, sda,
-					  (hipblasDoubleComplex *) Bkn, ldb, sdb,
-					  (hipblasDoubleComplex *) &beta_p[0],
-					  (hipblasDoubleComplex *) Cmn, ldc, sdc,
-					  batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasZgemmStridedBatched(gridblasHandle,
-			      CUBLAS_OP_N,
-			      CUBLAS_OP_N,
-			      m,n,k,
-			      (cuDoubleComplex *) &alpha_p[0],
-			      (cuDoubleComplex *) Amk, lda, sda,
-			      (cuDoubleComplex *) Bkn, ldb, sdb,
-			      (cuDoubleComplex *) &beta_p[0],
-			      (cuDoubleComplex *) Cmn, ldc, sdc,
-			      batchCount);
-#endif
-#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
-    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						oneapi::mkl::transpose::N,
-						oneapi::mkl::transpose::N,
-						m,n,k,
-						alpha,
-						(const ComplexD *)Amk,lda,sda,
-						(const ComplexD *)Bkn,ldb,sdb,
-						beta,
-						(ComplexD *)Cmn,ldc,sdc,
-						batchCount);
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
-     // Need a default/reference implementation
-     for (int p = 0; p < batchCount; ++p) {
-       for (int mm = 0; mm < m; ++mm) {
-	 for (int nn = 0; nn < n; ++nn) {
-	   ComplexD c_mn(0.0);
-	   for (int kk = 0; kk < k; ++kk)
-	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
-	 }
-       }
-     }
-#endif
-  }
 
+  template<class CComplex>
   double benchmark(int M, int N, int K, int BATCH)
   {
     int32_t N_A = M*K*BATCH;
     int32_t N_B = K*N*BATCH;
     int32_t N_C = M*N*BATCH;
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
+    deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
+    deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
+    deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
+    CComplex alpha(1.0);
+    CComplex beta (1.0);
     RealD flops = 8.0*M*N*K*BATCH;
-    int ncall=10;
+    int ncall=1000;
+    deviceVector<CComplex *> As(BATCH);
+    deviceVector<CComplex *> Bs(BATCH);
+    deviceVector<CComplex *> Cs(BATCH);
+    for(int b = 0 ; b < BATCH;b++) {
+      CComplex *ptr;
+      ptr = &A[b*M*K];      acceleratorPut(As[b],ptr);
+      ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr);
+      ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr);
+    }
+
+    // Warm up call
+    gemmBatched(M,N,K,
+		alpha,
+		As, // m x k 
+		Bs, // k x n
+		beta, 
+		Cs);
+    synchronise();
+
     RealD t0 = usecond();
     for(int i=0;i<ncall;i++){
-      gemmStridedBatched(M,N,K,
-			 alpha,
-			 &A[0], // m x k 
-			 &B[0], // k x n
-			 beta, 
-			 &C[0], // m x n
-			 BATCH);
+      gemmBatched(M,N,K,
+		  alpha,
+		  As, // m x k 
+		  Bs, // k x n
+		  beta, 
+		  Cs);
+      synchronise();
     }
-    synchronise();
     RealD t1 = usecond();
-    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
+    RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
     flops = 8.0*M*N*K*BATCH*ncall;
     flops = flops/(t1-t0)/1.e3;
     return flops; // Returns gigaflops
   }
-
-
-
-
 };
 
 NAMESPACE_END(Grid);
```
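The CPU fallback path above replaces the hand-rolled triple loops with Eigen maps over the existing column-major buffers (thread_for is Grid's parallel loop macro). A standalone sketch, not Grid code, of the mapping convention this relies on, showing why an OpA of GridBLAS_OP_C maps A as k x m and applies .adjoint():

```cpp
#include <Eigen/Dense>
#include <complex>
#include <iostream>
#include <vector>

int main(void)
{
  const int m = 2, n = 2, k = 3;
  // Column-major storage, as in the GridBLAS calls: A is stored k x m because OpA = 'C'.
  std::vector<std::complex<double>> A(k*m), B(k*n), C(m*n, {0.0, 0.0});
  for (size_t i = 0; i < A.size(); i++) A[i] = {double(i), 1.0};
  for (size_t i = 0; i < B.size(); i++) B[i] = {1.0, double(i)};
  std::complex<double> alpha(1.0, 0.0), beta(0.0, 0.0);

  Eigen::Map<Eigen::MatrixXcd> eAmk(A.data(), k, m); // stored k x m, used as A^H
  Eigen::Map<Eigen::MatrixXcd> eBkn(B.data(), k, n);
  Eigen::Map<Eigen::MatrixXcd> eCmn(C.data(), m, n);

  // C = beta*C + alpha * A^H * B, i.e. the (OpA==C, OpB==N) branch above.
  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn;
  std::cout << eCmn << std::endl;
  return 0;
}
```

Writing through the Map updates the underlying buffer in place, which is what lets the batched fallback avoid any copies per matrix.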
```diff
@@ -279,11 +279,11 @@ public:
       Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
       diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
       _sort.push(eval2,Nm);
-      //      Glog << "#Ritz value before shift: "<< std::endl;
+      Glog << "#Ritz value before shift: "<< std::endl;
       for(int i=0; i<Nm; ++i){
-	//        std::cout.precision(13);
-	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	std::cout.precision(13);
+	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
       }
       
       //----------------------------------------------------------------------
@@ -297,7 +297,8 @@ public:
         
         unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
 
-        for(int ip=Nk; ip<Nm; ++ip){ 
+        for(int ip=Nk; ip<Nm; ++ip){
+	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
           shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
         }
         
@@ -325,7 +326,7 @@ public:
         Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
         diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
         _sort.push(eval2,Nk);
-	//        Glog << "#Ritz value after shift: "<< std::endl;
+	Glog << "#Ritz value after shift: "<< std::endl;
         for(int i=0; i<Nk; ++i){
 	  //          std::cout.precision(13);
 	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@@ -467,10 +468,10 @@ public:
   
     // set initial vector
     for (int i=0; i<Nu; ++i) {
-      //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
+      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
       evec[i] = src[i];
       orthogonalize(evec[i],evec,i);
-      //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
+      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
     }
 //    exit(-43);
     
@@ -506,11 +507,11 @@ public:
       Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
       diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
       _sort.push(eval2,Nr);
-      //      Glog << "#Ritz value: "<< std::endl;
+      Glog << "#Ritz value: "<< std::endl;
       for(int i=0; i<Nr; ++i){
-	//        std::cout.precision(13);
-	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+	std::cout.precision(13);
+	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
       }
       
       // Convergence test
@@ -570,6 +571,7 @@ public:
       Glog << fname + " NOT converged ; Summary :\n";
     } else {
       Glog << fname + " CONVERGED ; Summary :\n";
+      Nstop = Nconv_guess; // Just take them all
       // Sort convered eigenpairs.
       std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating
 
@@ -642,7 +644,7 @@ private:
       //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
       k_start +=mrhs;
     }
-    //    Glog << "LinAlg "<< std::endl;
+    Glog << "LinAlg "<< std::endl;
     
     if (b>0) {
      for (int u=0; u<Nu; ++u) {
@@ -676,7 +678,7 @@ private:
       }
       w_copy[u] = w[u];
     }
-    //    Glog << "LinAlg done"<< std::endl;
+    Glog << "LinAlg done"<< std::endl;
     
     // In block version, the steps 6 and 7 in Lanczos construction is
     // replaced by the QR decomposition of new basis block.
@@ -689,15 +691,15 @@ private:
     }
 
     // re-orthogonalization for numerical stability
-    //    Glog << "Gram Schmidt"<< std::endl;
+    Glog << "Gram Schmidt"<< std::endl;
     orthogonalize(w,Nu,evec,R);
     // QR part
     for (int u=1; u<Nu; ++u) {
       orthogonalize(w[u],w,u);
     }
-    //    Glog << "Gram Schmidt done "<< std::endl;
+    Glog << "Gram Schmidt done "<< std::endl;
     
-    //    Glog << "LinAlg "<< std::endl;
+    Glog << "LinAlg "<< std::endl;
     for (int u=0; u<Nu; ++u) {
       //for (int v=0; v<Nu; ++v) {
       for (int v=u; v<Nu; ++v) {
@@ -714,7 +716,7 @@ private:
 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
       }
     }
-    //    Glog << "LinAlg done "<< std::endl;
+    Glog << "LinAlg done "<< std::endl;
 
     if (b < Nm/Nu-1) {
       for (int u=0; u<Nu; ++u) {
@@ -779,7 +781,7 @@ private:
     
     for ( int u=0; u<Nu; ++u ) {
       for (int k=0; k<Nk; ++k ) {
-//        Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
+	//	Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
         BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
       }
     }
@@ -933,7 +935,7 @@ if (1){
          int Nu, int Nb, int Nk, int Nm,
          Eigen::MatrixXcd& M)
   {
-    //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
+    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
     assert( Nk%Nu == 0 && Nm%Nu == 0 );
     assert( Nk <= Nm );
     M = Eigen::MatrixXcd::Zero(Nk,Nk);
@@ -951,7 +953,7 @@ if (1){
         M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
       }
     }
-    //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl; 
+    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; 
   }
 
 
@@ -961,7 +963,7 @@ if (1){
         int Nu, int Nb, int Nk, int Nm,
         Eigen::MatrixXcd& M)
   {
-    //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
+    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
     assert( Nk%Nu == 0 && Nm%Nu == 0 );
     assert( Nk <= Nm );
     
@@ -977,7 +979,7 @@ if (1){
         lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
       }
     }
-    //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl; 
+    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; 
   }
 
 
@@ -986,7 +988,7 @@ if (1){
 		            RealD Dsh,
 		            Eigen::MatrixXcd& Qprod)
   {
-    //Glog << "shiftedQRDecompEigen() begin" << '\n'; 
+    Glog << "shiftedQRDecompEigen() begin" << '\n'; 
     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1002,6 +1004,7 @@ if (1){
                         // lower triangular part used to represent series
                         // of Q sequence.
 
+    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; 
     // equivalent operation of Qprod *= Q
     //M = Eigen::MatrixXcd::Zero(Nm,Nm);
     
@@ -1022,6 +1025,7 @@ if (1){
     
     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
 
+    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; 
     for (int i=0; i<Nm; ++i) {
       for (int j=0; j<Nm-(Nu+1); ++j) {
         for (int k=0; k<Nu+1+j; ++k) {
@@ -1029,6 +1033,7 @@ if (1){
         }
       }
     }
+    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; 
     for (int i=0; i<Nm; ++i) {
       for (int j=Nm-(Nu+1); j<Nm; ++j) {
        for (int k=0; k<Nm; ++k) {
@@ -1036,6 +1041,7 @@ if (1){
         }
       }
     }
+    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; 
     
     //static int ntimes = 2;
     //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1061,11 +1067,13 @@ if (1){
         Mtmp(j,i) = conj(Mtmp(i,j));
       }
     }
+    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; 
 
     for (int i=0; i<Nm; ++i) {
       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
     }
     
+    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; 
     M = Mtmp;
 
     //M = Q.adjoint()*(M*Q);
@@ -1077,7 +1085,7 @@ if (1){
     //  }
     //}
     
-    //Glog << "shiftedQRDecompEigen() end" << endl; 
+    Glog << "shiftedQRDecompEigen() end" <<std::endl; 
   }
 
   void exampleQRDecompEigen(void)
```
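For orientation, shiftedQRDecompEigen performs one step of the shifted QR iteration used in the implicit restart: factor M - Dsh*I = QR, then apply the similarity transform M' = Q^H M Q while accumulating the product of Q factors in Qprod ("equivalent operation of Qprod *= Q" in the code). A standalone Eigen sketch of that single step, under my reading of the routine (this is the textbook operation, not the routine's band-exploiting loops):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main(void)
{
  const int Nm = 4;
  Eigen::MatrixXcd M = Eigen::MatrixXcd::Random(Nm, Nm);
  M = (M + M.adjoint()).eval();            // Hermitian test matrix
  double Dsh = M(Nm-1, Nm-1).real();       // shift, e.g. an unwanted Ritz value

  Eigen::MatrixXcd I = Eigen::MatrixXcd::Identity(Nm, Nm);
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(M - Dsh*I);
  Eigen::MatrixXcd Q = qr.householderQ();

  // One shifted-QR step: a similarity transform, so the spectrum is preserved
  // while components near the shift are damped over repeated steps.
  Eigen::MatrixXcd Mnew = Q.adjoint() * M * Q;
  std::cout << "Hermiticity residual: " << (Mnew - Mnew.adjoint()).norm() << std::endl;
  return 0;
}
```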
```diff
@@ -499,6 +499,87 @@ namespace Grid {
       }
   };
 
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, left preconditioned by Mee^inv
+  // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta
+  //
+  // Solve:
+  // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta
+  //
+  // Old notation e<->o
+  //
+  // Left precon by Moo^-1
+  //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o
+  //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+      const bool _solnAsInitGuess = false)  
+    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      Mtmp=src_o-Mtmp;                 
+      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);     
+      
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd );
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
   ///////////////////////////////////////////////////////////////////////////////////////////////////////
   // Site diagonal is identity, right preconditioned by Mee^inv
   // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
```
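Restating the comment block at the head of the new class in equation form (my reading, same operator names; D_oo is the odd-odd Schur complement):

```latex
% Left-preconditioned red-black Schur solve, as sketched in the comment above.
\begin{align*}
  D_{oo} &= M_{oo} - M_{oe}\,M_{ee}^{-1}\,M_{eo}, &
  M_{oo}^{-1} D_{oo} &= 1 - M_{oo}^{-1} M_{oe}\,M_{ee}^{-1} M_{eo},\\
  \eta_o' &= \bigl(M_{oo}^{-1} D_{oo}\bigr)^{\dagger} M_{oo}^{-1}
             \bigl(\eta_o - M_{oe}\,M_{ee}^{-1}\,\eta_e\bigr), &
  \bigl(M_{oo}^{-1} D_{oo}\bigr)^{\dagger}\bigl(M_{oo}^{-1} D_{oo}\bigr)\,\psi_o &= \eta_o',
\end{align*}
after which \texttt{RedBlackSolution} reconstructs the even checkerboard as
\begin{equation*}
  \psi_e = M_{ee}^{-1}\bigl(\eta_e - M_{eo}\,\psi_o\bigr).
\end{equation*}
```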
| @@ -54,6 +54,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
| @@ -100,6 +103,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
| @@ -145,6 +151,9 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|     profilerAllocate(bytes); |     profilerAllocate(bytes); | ||||||
|     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); |     _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); | ||||||
|  |     if ( (_Tp*)ptr == (_Tp *) NULL ) { | ||||||
|  |       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); | ||||||
|  |     } | ||||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); |     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid); | |||||||
| uint64_t total_shared; | uint64_t total_shared; | ||||||
| uint64_t total_device; | uint64_t total_device; | ||||||
uint64_t total_host; | uint64_t total_host; | ||||||
|  |  | ||||||
|  | #if defined(__has_feature) | ||||||
|  | #if __has_feature(leak_sanitizer) | ||||||
|  | #define ASAN_LEAK_CHECK | ||||||
|  | #endif | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #ifdef ASAN_LEAK_CHECK | ||||||
|  | #include <sanitizer/asan_interface.h> | ||||||
|  | #include <sanitizer/common_interface_defs.h> | ||||||
|  | #include <sanitizer/lsan_interface.h> | ||||||
|  | #define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); } | ||||||
|  | #else | ||||||
|  | #define LEAK_CHECK(A) { } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | void MemoryManager::DisplayMallinfo(void) | ||||||
|  | { | ||||||
|  | #ifdef __linux__ | ||||||
|  |   struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform | ||||||
|  |    | ||||||
|  |   mi = mallinfo(); | ||||||
|  |  | ||||||
|  |   std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl; | ||||||
|  |   std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl; | ||||||
|  | #endif | ||||||
|  |   LEAK_CHECK(); | ||||||
|  |   | ||||||
|  | } | ||||||
|  |  | ||||||
| void MemoryManager::PrintBytes(void) | void MemoryManager::PrintBytes(void) | ||||||
| { | { | ||||||
|   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; |   std::cout << " MemoryManager : ------------------------------------ "<<std::endl; | ||||||
| @@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void) | |||||||
| #ifdef GRID_CUDA | #ifdef GRID_CUDA | ||||||
|   cuda_mem(); |   cuda_mem(); | ||||||
| #endif | #endif | ||||||
|    |   DisplayMallinfo(); | ||||||
| } | } | ||||||
|  |  | ||||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||||
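DisplayMallinfo deliberately sticks with the legacy mallinfo(), whose counters are plain ints and can wrap above ~2 GiB, because (as its comment notes) mallinfo2 is not available on every glibc. A hedged version guard, assuming glibc is the only target that reaches this path; mallinfo2 appeared in glibc 2.33, and this guard is illustrative, not part of the patch:

    #if defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 33))
      struct mallinfo2 mi = mallinfo2();  // size_t counters, no wrap
    #else
      struct mallinfo  mi = mallinfo();   // int counters, may wrap above 2 GiB
    #endif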
|   | |||||||
| @@ -211,6 +211,7 @@ private: | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  public: |  public: | ||||||
|  |   static void DisplayMallinfo(void); | ||||||
|   static void NotifyDeletion(void * CpuPtr); |   static void NotifyDeletion(void * CpuPtr); | ||||||
|   static void Print(void); |   static void Print(void); | ||||||
|   static void PrintAll(void); |   static void PrintAll(void); | ||||||
|   | |||||||
| @@ -91,6 +91,7 @@ public: | |||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   virtual int CheckerBoarded(int dim)=0; |   virtual int CheckerBoarded(int dim)=0; | ||||||
|   virtual int CheckerBoard(const Coordinate &site)=0; |   virtual int CheckerBoard(const Coordinate &site)=0; | ||||||
|  |   virtual int CheckerDim(void){ return 0; }; | ||||||
|   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; |   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||||
|   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; |   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||||
|   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; |   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||||
|   | |||||||
| @@ -60,6 +60,7 @@ public: | |||||||
|   int              _checker_dim; |   int              _checker_dim; | ||||||
|   std::vector<int> _checker_board; |   std::vector<int> _checker_board; | ||||||
|  |  | ||||||
|  |   virtual int CheckerDim(void){ return _checker_dim; }; | ||||||
|   virtual int CheckerBoarded(int dim){ |   virtual int CheckerBoarded(int dim){ | ||||||
|     if( dim==_checker_dim) return 1; |     if( dim==_checker_dim) return 1; | ||||||
|     else return 0; |     else return 0; | ||||||
|   | |||||||
| @@ -236,11 +236,18 @@ public: | |||||||
|   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ |   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ | ||||||
|     vobj vtmp; |     vobj vtmp; | ||||||
|     vtmp = r; |     vtmp = r; | ||||||
|  | #if 1 | ||||||
|  |     auto me  = View(CpuWrite); | ||||||
|  |     thread_for(ss,me.size(),{ | ||||||
|  |        me[ss]= r; | ||||||
|  |       }); | ||||||
|  | #else     | ||||||
|     auto me  = View(AcceleratorWrite); |     auto me  = View(AcceleratorWrite); | ||||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ |     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||||
| 	auto stmp=coalescedRead(vtmp); | 	auto stmp=coalescedRead(vtmp); | ||||||
| 	coalescedWrite(me[ss],stmp); | 	coalescedWrite(me[ss],stmp); | ||||||
|     }); |     }); | ||||||
|  | #endif     | ||||||
|     me.ViewClose(); |     me.ViewClose(); | ||||||
|     return *this; |     return *this; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   const uint64_t sites = grid->oSites(); |   const uint64_t sites = grid->oSites(); | ||||||
|    |    | ||||||
|   // Might make all code paths go this way. |   // Might make all code paths go this way. | ||||||
| #if 0 |  | ||||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; |  | ||||||
|   Vector<inner_t> inner_tmp(sites); |  | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |  | ||||||
|   { |  | ||||||
|     autoView( left_v , left, AcceleratorRead); |  | ||||||
|     autoView( right_v,right, AcceleratorRead); |  | ||||||
|     // This code could read coalesce |  | ||||||
|     // GPU - SIMT lane compliance... |  | ||||||
|     accelerator_for( ss, sites, nsimd,{ |  | ||||||
| 	auto x_l = left_v(ss); |  | ||||||
| 	auto y_l = right_v(ss); |  | ||||||
| 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); |  | ||||||
|     }); |  | ||||||
|   } |  | ||||||
| #else |  | ||||||
|   typedef decltype(innerProduct(vobj(),vobj())) inner_t; |   typedef decltype(innerProduct(vobj(),vobj())) inner_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|      |      | ||||||
|   { |   { | ||||||
| @@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
| 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); | ||||||
|     }); |     }); | ||||||
|   } |   } | ||||||
| #endif |  | ||||||
|   // This is in single precision and fails some tests |   // This is in single precision and fails some tests | ||||||
|   auto anrm = sumD(inner_tmp_v,sites);   |   auto anrm = sumD(inner_tmp_v,sites);   | ||||||
|   nrm = anrm; |   nrm = anrm; | ||||||
| @@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | |||||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); |   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); | ||||||
| #else | #else | ||||||
|   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; |   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp; | ||||||
|  |   inner_tmp.resize(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|  |  | ||||||
|   accelerator_for( ss, sites, nsimd,{ |   accelerator_for( ss, sites, nsimd,{ | ||||||
|   | |||||||
| @@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | |||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|   typedef typename vobj::scalar_objectD sobjD; |   typedef typename vobj::scalar_objectD sobjD; | ||||||
|   sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); |   static Vector<sobj> mysum; | ||||||
|  |   mysum.resize(1); | ||||||
|  |   sobj *mysum_p = & mysum[0]; | ||||||
|   sobj identity; zeroit(identity); |   sobj identity; zeroit(identity); | ||||||
|  |   mysum[0] = identity; | ||||||
|   sobj ret ;  |   sobj ret ;  | ||||||
|  |  | ||||||
|   Integer nsimd= vobj::Nsimd(); |   Integer nsimd= vobj::Nsimd(); | ||||||
|    |  | ||||||
|  |   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|      auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); |     auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{osites}, |      cgh.parallel_for(cl::sycl::range<1>{osites}, | ||||||
| 		      Reduction, | 		      Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> item, auto &sum) { | 		      [=] (cl::sycl::id<1> item, auto &sum) { | ||||||
| @@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os | |||||||
|    }); |    }); | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   ret = mysum[0]; |   ret = mysum[0]; | ||||||
|   free(mysum,*theGridAccelerator); |   //  free(mysum,*theGridAccelerator); | ||||||
|   sobjD dret; convertType(dret,ret); |   sobjD dret; convertType(dret,ret); | ||||||
|   return dret; |   return dret; | ||||||
| } | } | ||||||
| @@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | |||||||
| template<class Word> Word svm_xor(Word *vec,uint64_t L) | template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||||
| { | { | ||||||
|   Word xorResult; xorResult = 0; |   Word xorResult; xorResult = 0; | ||||||
|   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); |   static Vector<Word> d_sum; | ||||||
|  |   d_sum.resize(1); | ||||||
|  |   Word *d_sum_p=&d_sum[0]; | ||||||
|   Word identity;  identity=0; |   Word identity;  identity=0; | ||||||
|  |   d_sum[0] = identity; | ||||||
|  |   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); |     auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, |      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||||
| 		      Reduction, | 		      Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||||
| 	 sum ^=vec[index]; | 	 sum^=vec[index]; | ||||||
|      }); |      }); | ||||||
|    }); |    }); | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   Word ret = d_sum[0]; |   Word ret = d_sum[0]; | ||||||
|   free(d_sum,*theGridAccelerator); |   //  free(d_sum,*theGridAccelerator); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| #pragma once | #pragma once | ||||||
| #include <type_traits> |  | ||||||
| #if defined(GRID_CUDA) | #if defined(GRID_CUDA) | ||||||
|  |  | ||||||
| #include <cub/cub.cuh> | #include <cub/cub.cuh> | ||||||
| @@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | |||||||
|    |    | ||||||
|  |  | ||||||
| } | } | ||||||
|  | #endif  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|  | #if defined(GRID_SYCL) | ||||||
|  | template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
|  | { | ||||||
|  |   size_t subvol_size = e1*e2; | ||||||
|  |  | ||||||
|  |   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); | ||||||
|  |   vobj vobj_zero; | ||||||
|  |   zeroit(vobj_zero); | ||||||
|  |   for (int r = 0; r<rd; r++) {  | ||||||
|  |     mysum[r] = vobj_zero;  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||||
|  |  | ||||||
|  |   auto rb_p = &reduction_buffer[0]; | ||||||
|  |  | ||||||
|  |   // autoView(Data_v, Data, AcceleratorRead); | ||||||
|  |  | ||||||
|  |   //prepare reduction buffer  | ||||||
|  |   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||||
|  |    | ||||||
|  |       int n = s / e2; | ||||||
|  |       int b = s % e2; | ||||||
|  |       int so=r*ostride; // base offset for start of plane  | ||||||
|  |       int ss= so+n*stride+b; | ||||||
|  |  | ||||||
|  |       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||||
|  |  | ||||||
|  |   }); | ||||||
|  |  | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|  |           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); | ||||||
|  |           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||||
|  |           Reduction, | ||||||
|  |           [=](cl::sycl::id<1> item, auto &sum) { | ||||||
|  |               auto s = item[0]; | ||||||
|  |               sum += rb_p[r*subvol_size+s]; | ||||||
|  |           }); | ||||||
|  |       }); | ||||||
|  |        | ||||||
|  |       | ||||||
|  |   } | ||||||
|  |   theGridAccelerator->wait(); | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |     lvSum[r] = mysum[r]; | ||||||
|  |   } | ||||||
|  |   free(mysum,*theGridAccelerator); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||||
|   typedef typename vobj::vector_type vector; |   typedef typename vobj::vector_type vector; | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|   const int osites = rd*e1*e2; |   const int osites = rd*e1*e2; | ||||||
| @@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
| 	    buf[ss] = dat[ss*words+w]; | 	    buf[ss] = dat[ss*words+w]; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |     #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|        |       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #elif defined(GRID_SYCL) | ||||||
|  |       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #endif | ||||||
|  |  | ||||||
|     for (int r = 0; r < rd; r++) { |     for (int r = 0; r < rd; r++) { | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||||
|     } |     } | ||||||
| @@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||||
| { | { | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. |   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |     if constexpr (sizeof(vobj) <= 256) {  | ||||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|  |       #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|  |         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #elif defined (GRID_SYCL) | ||||||
|  |         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #endif | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     else { |     else { | ||||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_SYCL) |  | ||||||
| template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::scalar_object sobj; |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|  |  | ||||||
|   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); |  | ||||||
|   vobj vobj_zero; |  | ||||||
|   zeroit(vobj_zero); |  | ||||||
|      |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |  | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|  |  | ||||||
|   autoView(Data_v, Data, AcceleratorRead); |  | ||||||
|  |  | ||||||
|   //prepare reduction buffer  |  | ||||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  |  | ||||||
|    |  | ||||||
|       int n = s / e2; |  | ||||||
|       int b = s % e2; |  | ||||||
|       int so=r*ostride; // base offset for start of plane  |  | ||||||
|       int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable |  | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); |  | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |  | ||||||
|           Reduction, |  | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |  | ||||||
|               auto s = item[0]; |  | ||||||
|               sum += rb_p[r*subvol_size+s]; |  | ||||||
|           }); |  | ||||||
|       }); |  | ||||||
|       theGridAccelerator->wait(); |  | ||||||
|       lvSum[r] = mysum[0]; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   free(mysum,*theGridAccelerator); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
| { | { | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
| @@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | |||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||||
| { | { | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) |   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||||
|    |    | ||||||
|   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|    |  | ||||||
|   #elif defined(GRID_SYCL) |  | ||||||
|    |  | ||||||
|   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|    |    | ||||||
|   #else |   #else | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
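After this hunk a single entry point, sliceSumReduction_gpu, serves both the cub (CUDA/HIP) and SYCL backends, with the threaded CPU path as the fallback. A hedged sketch of the public operation this machinery backs; the field type and direction are illustrative:

    LatticeComplex field(grid);     // some lattice scalar field
    std::vector<TComplex> slices;   // one partial sum per plane along Tdir
    sliceSum(field, slices, Tdir);  // dispatches through sliceSumReduction above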
|   | |||||||
| @@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | |||||||
|     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);  |     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|   |   | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // remove and insert a half checkerboard | // remove and insert a half checkerboard | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) | ||||||
| { | { | ||||||
|   half.Checkerboard() = cb; |   acceleratorPickCheckerboard(cb,half,full); | ||||||
|  |  | ||||||
|   autoView( half_v, half, CpuWrite); |  | ||||||
|   autoView( full_v, full, CpuRead); |  | ||||||
|   thread_for(ss, full.Grid()->oSites(),{ |  | ||||||
|     int cbos; |  | ||||||
|     Coordinate coor; |  | ||||||
|     full.Grid()->oCoorFromOindex(coor,ss); |  | ||||||
|     cbos=half.Grid()->CheckerBoard(coor); |  | ||||||
|  |  | ||||||
|     if (cbos==cb) { |  | ||||||
|       int ssh=half.Grid()->oIndex(coor); |  | ||||||
|       half_v[ssh] = full_v[ss]; |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } | } | ||||||
| template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) | ||||||
| { | { | ||||||
|   int cb = half.Checkerboard(); |   acceleratorSetCheckerboard(full,half); | ||||||
|   autoView( half_v , half, CpuRead); |  | ||||||
|   autoView( full_v , full, CpuWrite); |  | ||||||
|   thread_for(ss,full.Grid()->oSites(),{ |  | ||||||
|  |  | ||||||
|     Coordinate coor; |  | ||||||
|     int cbos; |  | ||||||
|  |  | ||||||
|     full.Grid()->oCoorFromOindex(coor,ss); |  | ||||||
|     cbos=half.Grid()->CheckerBoard(coor); |  | ||||||
|        |  | ||||||
|     if (cbos==cb) { |  | ||||||
|       int ssh=half.Grid()->oIndex(coor); |  | ||||||
|       full_v[ss]=half_v[ssh]; |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0) | template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0) | ||||||
| { | { | ||||||
|   half.Checkerboard() = cb; |   half.Checkerboard() = cb; | ||||||
|   autoView(half_v, half, AcceleratorWrite); |   autoView(half_v, half, AcceleratorWrite); | ||||||
| @@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | |||||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; |   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; |   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||||
|   Coordinate ostride_half          = half.Grid()->_ostride; |   Coordinate ostride_half          = half.Grid()->_ostride; | ||||||
|  |   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||||
|   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ |   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||||
|      |      | ||||||
|     Coordinate coor; |     Coordinate coor; | ||||||
| @@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj | |||||||
|     } |     } | ||||||
|   }); |   }); | ||||||
| } | } | ||||||
| template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0) | template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0) | ||||||
| { | { | ||||||
|   int cb = half.Checkerboard(); |   int cb = half.Checkerboard(); | ||||||
|   autoView(half_v , half, AcceleratorRead); |   autoView(half_v , half, AcceleratorRead); | ||||||
| @@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full, | |||||||
|   unsigned long ndim_half          = half.Grid()->_ndimension; |   unsigned long ndim_half          = half.Grid()->_ndimension; | ||||||
|   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; |   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; | ||||||
|   Coordinate ostride_half          = half.Grid()->_ostride; |   Coordinate ostride_half          = half.Grid()->_ostride; | ||||||
|  |   int checker_dim_half             = half.Grid()->CheckerDim(); | ||||||
|   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ |   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ | ||||||
|  |  | ||||||
|     Coordinate coor; |     Coordinate coor; | ||||||
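With this change the host-side pick/set loops are gone: both wrappers forward to the accelerator kernels, and the checkerboarded dimension is read off the half grid via CheckerDim() rather than passed in (the trailing int survives only as a dummy for source compatibility). A pick/set round trip therefore stays device-resident; a sketch with illustrative grids:

    LatticeFermion full(fGrid), half(rbGrid);
    pickCheckerboard(Odd, half, full);  // forwards to acceleratorPickCheckerboard
    // ... operate on the odd-site field ...
    setCheckerboard(full, half);        // forwards to acceleratorSetCheckerboard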
|   | |||||||
| @@ -90,6 +90,7 @@ public: | |||||||
|         exit(1); |         exit(1); | ||||||
|       } |       } | ||||||
|       Parameters.StartingType = arg; |       Parameters.StartingType = arg; | ||||||
|  |       std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { |     if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) { | ||||||
| @@ -97,6 +98,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.StartTrajectory = ivec[0]; |       Parameters.StartTrajectory = ivec[0]; | ||||||
|  |       std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { |     if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) { | ||||||
| @@ -104,6 +106,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.Trajectories = ivec[0]; |       Parameters.Trajectories = ivec[0]; | ||||||
|  |       std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { |     if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) { | ||||||
| @@ -111,6 +114,7 @@ public: | |||||||
|       std::vector<int> ivec(0); |       std::vector<int> ivec(0); | ||||||
|       GridCmdOptionIntVector(arg, ivec); |       GridCmdOptionIntVector(arg, ivec); | ||||||
|       Parameters.NoMetropolisUntil = ivec[0]; |       Parameters.NoMetropolisUntil = ivec[0]; | ||||||
|  |       std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl; | ||||||
|     } |     } | ||||||
|     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { |     if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) { | ||||||
|       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); |       arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile"); | ||||||
|   | |||||||
| @@ -137,9 +137,11 @@ public: | |||||||
|  |  | ||||||
|       double start_force = usecond(); |       double start_force = usecond(); | ||||||
|  |  | ||||||
|  |       MemoryManager::Print(); | ||||||
|       as[level].actions.at(a)->deriv_timer_start(); |       as[level].actions.at(a)->deriv_timer_start(); | ||||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta |       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta | ||||||
|       as[level].actions.at(a)->deriv_timer_stop(); |       as[level].actions.at(a)->deriv_timer_stop(); | ||||||
|  |       MemoryManager::Print(); | ||||||
|  |  | ||||||
|       auto name = as[level].actions.at(a)->action_name(); |       auto name = as[level].actions.at(a)->action_name(); | ||||||
|  |  | ||||||
| @@ -246,7 +248,11 @@ public: | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   virtual ~Integrator() {} |   virtual ~Integrator() | ||||||
|  |   { | ||||||
|  |     // Cleaning up the Level pointers here is awkward: their ownership | ||||||
|  |     // is unclear by design, as per the comment above in the constructor | ||||||
|  |   } | ||||||
|  |  | ||||||
|   virtual std::string integrator_name() = 0; |   virtual std::string integrator_name() = 0; | ||||||
|    |    | ||||||
| @@ -460,6 +466,7 @@ public: | |||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
|  |  | ||||||
|  | 	MemoryManager::Print(); | ||||||
|         // get gauge field from the SmearingPolicy and |         // get gauge field from the SmearingPolicy and | ||||||
|         // based on the boolean is_smeared in actionID |         // based on the boolean is_smeared in actionID | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||||
| @@ -468,6 +475,7 @@ public: | |||||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); |    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; |         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||||
|         H += Hterm; |         H += Hterm; | ||||||
|  | 	MemoryManager::Print(); | ||||||
|  |  | ||||||
|       } |       } | ||||||
|       as[level].apply(S_hireps, Representations, level, H); |       as[level].apply(S_hireps, Representations, level, H); | ||||||
|   | |||||||
| @@ -32,7 +32,9 @@ private: | |||||||
|   //  Smear_Stout<Gimpl> *StoutSmearing; |   //  Smear_Stout<Gimpl> *StoutSmearing; | ||||||
|   //  std::vector<GaugeField> SmearedSet; |   //  std::vector<GaugeField> SmearedSet; | ||||||
|    |    | ||||||
|  |   GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object | ||||||
|   std::vector<LatticeLorentzComplex> masks; |   std::vector<LatticeLorentzComplex> masks; | ||||||
|  |   std::vector<int> cbs; | ||||||
|  |  | ||||||
|   typedef typename SU3Adjoint::AMatrix AdjMatrix; |   typedef typename SU3Adjoint::AMatrix AdjMatrix; | ||||||
|   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; |   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; | ||||||
| @@ -147,6 +149,25 @@ private: | |||||||
|     } |     } | ||||||
|     pokeLorentz(Fdet, Fdet_pol, nu); |     pokeLorentz(Fdet, Fdet_pol, nu); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   void Compute_MpInvJx_dNxxdSy(int cb, | ||||||
|  | 			       const GaugeLinkField &PlaqL, | ||||||
|  | 			       const GaugeLinkField &PlaqR, | ||||||
|  | 			       AdjMatrixField MpInvJx, | ||||||
|  | 			       AdjVectorField &Fdet2 ) | ||||||
|  |   { | ||||||
|  |     GaugeLinkField PlaqLeo(UrbGrid); | ||||||
|  |     GaugeLinkField PlaqReo(UrbGrid); | ||||||
|  |     AdjMatrixField MpInvJxeo(UrbGrid); | ||||||
|  |     AdjVectorField Fdet2eo(UrbGrid); | ||||||
|  |     pickCheckerboard(cb,PlaqLeo,PlaqL); | ||||||
|  |     pickCheckerboard(cb,PlaqReo,PlaqR); | ||||||
|  |     pickCheckerboard(cb,MpInvJxeo,MpInvJx); | ||||||
|  |     Fdet2eo.Checkerboard()=cb; | ||||||
|  |     Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo); | ||||||
|  |     setCheckerboard(Fdet2,Fdet2eo); | ||||||
|  |   } | ||||||
|  |    | ||||||
|   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) |   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 ) | ||||||
|   { |   { | ||||||
|     GaugeLinkField UtaU(PlaqL.Grid()); |     GaugeLinkField UtaU(PlaqL.Grid()); | ||||||
| @@ -278,8 +299,9 @@ public: | |||||||
|     //////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////// | ||||||
|     // Mask the gauge field |     // Mask the gauge field | ||||||
|     //////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     int cb = cbs[smr]; | ||||||
|     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask |     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask | ||||||
|  |      | ||||||
|     Umsk = U; |     Umsk = U; | ||||||
|     ApplyMask(Umsk,smr); |     ApplyMask(Umsk,smr); | ||||||
|     Utmp = peekLorentz(Umsk,mu); |     Utmp = peekLorentz(Umsk,mu); | ||||||
| @@ -442,7 +464,7 @@ public: | |||||||
|     AdjMatrixField MpInvJx_nu(grid); |     AdjMatrixField MpInvJx_nu(grid); | ||||||
|     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor |     MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor | ||||||
|  |  | ||||||
|     Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); |     Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||||
|     Fdet2_mu=FdetV; |     Fdet2_mu=FdetV; | ||||||
|     Fdet1_mu=Zero(); |     Fdet1_mu=Zero(); | ||||||
|      |      | ||||||
| @@ -499,7 +521,7 @@ public: | |||||||
|  |  | ||||||
| 	time=-usecond(); | 	time=-usecond(); | ||||||
| 	PlaqR=(-1.0)*PlaqR; | 	PlaqR=(-1.0)*PlaqR; | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV); | ||||||
| 	Fdet2_nu = FdetV; | 	Fdet2_nu = FdetV; | ||||||
| 	time+=usecond(); | 	time+=usecond(); | ||||||
| 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | 	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl; | ||||||
| @@ -520,7 +542,7 @@ public: | |||||||
| 	 | 	 | ||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
| 	 | 	 | ||||||
| 	///////////////// -ve nu ///////////////// | 	///////////////// -ve nu ///////////////// | ||||||
| @@ -539,7 +561,7 @@ public: | |||||||
| 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | 	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y; | ||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
| 	 | 	 | ||||||
| 	// x== | 	// x== | ||||||
| @@ -560,7 +582,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,mu,-1); | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1); | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_nu = Fdet2_nu+FdetV; | 	Fdet2_nu = Fdet2_nu+FdetV; | ||||||
|  |  | ||||||
| 	///////////////////////////////////////////////////////////////////// | 	///////////////////////////////////////////////////////////////////// | ||||||
| @@ -589,7 +611,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,-1); | ||||||
|  |  | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | 	Fdet2_mu = Fdet2_mu+FdetV; | ||||||
|  |  | ||||||
| 	//  __ | 	//  __ | ||||||
| @@ -609,7 +631,7 @@ public: | |||||||
|  |  | ||||||
| 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | 	MpInvJx_nu = Cshift(MpInvJx,nu,1); | ||||||
|  |  | ||||||
| 	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV); | 	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV); | ||||||
| 	Fdet2_mu = Fdet2_mu+FdetV; | 	Fdet2_mu = Fdet2_mu+FdetV; | ||||||
| 	 | 	 | ||||||
|       } |       } | ||||||
| @@ -931,6 +953,10 @@ private: | |||||||
| public: | public: | ||||||
|  |  | ||||||
|   /* Standard constructor */ |   /* Standard constructor */ | ||||||
|  |   virtual ~SmearedConfigurationMasked() | ||||||
|  |   { | ||||||
|  |     delete UrbGrid; | ||||||
|  |   } | ||||||
|   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) |   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout) | ||||||
|     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) |     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout) | ||||||
|   { |   { | ||||||
| @@ -939,7 +965,6 @@ public: | |||||||
|     // was resized in base class |     // was resized in base class | ||||||
|     assert(this->SmearedSet.size()==Nsmear); |     assert(this->SmearedSet.size()==Nsmear); | ||||||
|      |      | ||||||
|     GridRedBlackCartesian * UrbGrid; |  | ||||||
|     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); |     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); | ||||||
|     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); |     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0); | ||||||
|     LatticeComplex tmp(_UGrid); |     LatticeComplex tmp(_UGrid); | ||||||
| @@ -947,10 +972,11 @@ public: | |||||||
|     for (unsigned int i = 0; i < this->smearingLevels; ++i) { |     for (unsigned int i = 0; i < this->smearingLevels; ++i) { | ||||||
|  |  | ||||||
|       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); |       masks.push_back(*(new LatticeLorentzComplex(_UGrid))); | ||||||
|  |  | ||||||
|       int mu= (i/2) %Nd; |       int mu= (i/2) %Nd; | ||||||
|       int cb= (i%2); |       int cb= (i%2); | ||||||
|       LatticeComplex tmpcb(UrbGrid); |       LatticeComplex tmpcb(UrbGrid); | ||||||
|  |  | ||||||
|  |       cbs.push_back(cb); | ||||||
| 	 | 	 | ||||||
|       masks[i]=Zero(); |       masks[i]=Zero(); | ||||||
|       //////////////////// |       //////////////////// | ||||||
| @@ -962,7 +988,6 @@ public: | |||||||
|       PokeIndex<LorentzIndex>(masks[i],tmp, mu); |       PokeIndex<LorentzIndex>(masks[i],tmp, mu); | ||||||
| 	 | 	 | ||||||
|     } |     } | ||||||
|     delete UrbGrid; |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   virtual void smeared_force(GaugeField &SigmaTilde)  |   virtual void smeared_force(GaugeField &SigmaTilde)  | ||||||
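The red-black grid is promoted from a constructor-local scratch object to a class member because the new checkerboarded Compute_MpInvJx_dNxxdSy overload allocates half-grid fields at force-evaluation time, long after construction. A condensed view of the lifetime change, with signatures abbreviated:

    // before: UrbGrid was created and deleted inside the constructor
    // after (this patch): member pointer, freed with the object
    SmearedConfigurationMasked(...) { UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid); }
    virtual ~SmearedConfigurationMasked() { delete UrbGrid; }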
|   | |||||||
| @@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, | |||||||
|   int hNNm1= NNm1/2; |   int hNNm1= NNm1/2; | ||||||
|   RealD sqrt_2 = sqrt(2.0); |   RealD sqrt_2 = sqrt(2.0); | ||||||
|   Complex ci(0.0,1.0); |   Complex ci(0.0,1.0); | ||||||
|   for(int su2Index=0;su2Index<hNNm1;su2Index++){ |  | ||||||
|     int i1, i2; |   const int nsimd=  Matrix::Nsimd(); | ||||||
|     su2SubGroupIndex(i1, i2, su2Index); |   accelerator_for(ss,grid->oSites(),nsimd,{ | ||||||
|     int ax = su2Index*2; |       for(int su2Index=0;su2Index<hNNm1;su2Index++){ | ||||||
|     int ay = su2Index*2+1; | 	int i1, i2; | ||||||
|     accelerator_for(ss,grid->oSites(),1,{ | 	su2SubGroupIndex(i1, i2, su2Index); | ||||||
|  | 	int ax = su2Index*2; | ||||||
|  | 	int ay = su2Index*2+1; | ||||||
| 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. | ||||||
| 	// trace( Ta x Ci in) | 	// trace( Ta x Ci in) | ||||||
| 	// Bet I need to move to real part with mult by -i | 	// Bet I need to move to real part with mult by -i | ||||||
| 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); | 	coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2)))); | ||||||
| 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); | 	coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1)))); | ||||||
|       }); |       } | ||||||
|   } |       for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | ||||||
|   for(int diagIndex=0;diagIndex<N-1;diagIndex++){ | 	int k = diagIndex + 1; // diagIndex starts from 0 | ||||||
|     int k = diagIndex + 1; // diagIndex starts from 0 | 	int a = NNm1+diagIndex; | ||||||
|     int a = NNm1+diagIndex; | 	RealD scale = 1.0/sqrt(2.0*k*(k+1)); | ||||||
|     RealD scale = 1.0/sqrt(2.0*k*(k+1)); | 	auto tmp = in_v(ss)()()(0,0); | ||||||
|     accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ |  | ||||||
| 	auto tmp = in_v[ss]()()(0,0); |  | ||||||
| 	for(int i=1;i<k;i++){ | 	for(int i=1;i<k;i++){ | ||||||
| 	  tmp=tmp+in_v[ss]()()(i,i); | 	  tmp=tmp+in_v(ss)()()(i,i); | ||||||
| 	} | 	} | ||||||
| 	tmp = tmp - in_v[ss]()()(k,k)*k; | 	tmp = tmp - in_v(ss)()()(k,k)*k; | ||||||
| 	out_v[ss]()()(a,b) =imag(tmp) * scale; | 	coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale); | ||||||
|       }); |       } | ||||||
|     } |     }); | ||||||
| } | } | ||||||
|  |  | ||||||
|    |    | ||||||
|   | |||||||
| @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) { | |||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| // Map a su2 subgroup number to the pair of rows that are non zero | // Map a su2 subgroup number to the pair of rows that are non zero | ||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||||
|   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); |   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); | ||||||
|  |  | ||||||
|   int spare = su2_index; |   int spare = su2_index; | ||||||
|   | |||||||
| @@ -99,6 +99,8 @@ using std::log; | |||||||
| using std::exp; | using std::exp; | ||||||
| using std::sin; | using std::sin; | ||||||
| using std::cos; | using std::cos; | ||||||
|  | using std::asin; | ||||||
|  | using std::acos; | ||||||
|  |  | ||||||
|  |  | ||||||
| accelerator_inline RealF    conjugate(const RealF  & r){ return r; } | accelerator_inline RealF    conjugate(const RealF  & r){ return r; } | ||||||
|   | |||||||
| @@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) { | |||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  | template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {}; | ||||||
|  | template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {}; | ||||||
|  | template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {}; | ||||||
|  | #endif | ||||||
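SYCL only lets kernels capture types that are trivially copyable or explicitly declared device-copyable; these specialisations declare the latter for Grid's tensor wrappers so they can be captured by value and handed to sycl::reduction. A hedged compile-time check of the effect:

    #ifdef GRID_SYCL
    static_assert(sycl::is_device_copyable<Grid::iScalar<Grid::vComplexD>>::value,
                  "tensor wrappers must be device copyable for SYCL kernel capture");
    #endif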
|   | |||||||
| @@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize | |||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) |  | ||||||
| { |  | ||||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); |  | ||||||
|   acceleratorCopySynchronise(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| // CPU Target - No accelerator just thread instead | // CPU Target - No accelerator just thread instead | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| @@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | |||||||
|  |  | ||||||
| #undef GRID_SIMT | #undef GRID_SIMT | ||||||
|  |  | ||||||
|  |  | ||||||
| inline void acceleratorMem(void) | inline void acceleratorMem(void) | ||||||
| { | { | ||||||
|   /* |   /* | ||||||
| @@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void) | |||||||
|   return; |   return; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) | ||||||
|  | { | ||||||
|  |   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); | ||||||
|  |   acceleratorCopySynchronise(); | ||||||
|  | } | ||||||
|  |  | ||||||
| template<class T> void acceleratorPut(T& dev,T&host) | template<class T> void acceleratorPut(T& dev,T&host) | ||||||
| { | { | ||||||
|   acceleratorCopyToDevice(&host,&dev,sizeof(T)); |   acceleratorCopyToDevice(&host,&dev,sizeof(T)); | ||||||
|   | |||||||
238	HMC/ComputeWilsonFlow.cc	Normal file
							| @@ -0,0 +1,238 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  | Source file: HMC/ComputeWilsonFlow.cc | ||||||
|  |  | ||||||
|  | Copyright (C) 2017 | ||||||
|  |  | ||||||
|  | Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||||
|  | Author: Shuhei Yamamoto <syamamoto@bnl.gov> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution | ||||||
|  | directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <string> | ||||||
|  |  | ||||||
|  | namespace Grid{ | ||||||
|  |   struct WFParameters: Serializable { | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters, | ||||||
|  |             int, steps, | ||||||
|  |             double, step_size, | ||||||
|  |             int, meas_interval, | ||||||
|  | 	    double, maxTau, // for the adaptive algorithm | ||||||
|  | 	    int, meas_interval_density, | ||||||
|  | 	    std::string, path);  | ||||||
|  |         | ||||||
|  |  | ||||||
|  |     template <class ReaderClass > | ||||||
|  |     WFParameters(Reader<ReaderClass>& Reader){ | ||||||
|  |       read(Reader, "WilsonFlow", *this); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct ConfParameters: Serializable { | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters, | ||||||
|  | 	   std::string, conf_path, | ||||||
|  |            std::string, conf_prefix, | ||||||
|  | 	   std::string, conf_smr_prefix, | ||||||
|  |            std::string, rng_prefix, | ||||||
|  | 	   int, StartConfiguration, | ||||||
|  | 	   int, EndConfiguration, | ||||||
|  |            int, Skip); | ||||||
|  |    | ||||||
|  |     template <class ReaderClass > | ||||||
|  |     ConfParameters(Reader<ReaderClass>& Reader){ | ||||||
|  |       read(Reader, "Configurations", *this); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  | } | ||||||
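Both parameter structs are read from one file: the driver below opens Serialiser Reader("input.xml", false, "root") and each constructor pulls its own node. A hedged sample of the expected layout, with every value illustrative only:

    <?xml version="1.0"?>
    <root>
      <WilsonFlow>
        <steps>200</steps>
        <step_size>0.01</step_size>
        <meas_interval>10</meas_interval>
        <maxTau>2.0</maxTau>
        <meas_interval_density>10</meas_interval_density>
        <path>./flow/</path>
      </WilsonFlow>
      <Configurations>
        <conf_path>./cfgs/</conf_path>
        <conf_prefix>ckpoint_lat</conf_prefix>
        <conf_smr_prefix>ckpoint_smr_lat</conf_smr_prefix>
        <rng_prefix>ckpoint_rng</rng_prefix>
        <StartConfiguration>100</StartConfiguration>
        <EndConfiguration>200</EndConfiguration>
        <Skip>10</Skip>
      </Configurations>
    </root>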
|  |  | ||||||
|  | template <class T> void writeFile(T& in, std::string const fname){   | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111 | ||||||
|  |   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl; | ||||||
|  |   Grid::emptyUserRecord record; | ||||||
|  |   Grid::ScidacWriter WR(in.Grid()->IsBoss()); | ||||||
|  |   WR.open(fname); | ||||||
|  |   WR.writeScidacFieldRecord(in,record,0); | ||||||
|  |   WR.close(); | ||||||
|  | #endif | ||||||
|  |   // TODO: decide how to report the error when HAVE_LIME is absent; currently this is a silent no-op | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | int main(int argc, char **argv) { | ||||||
|  |   using namespace Grid; | ||||||
|  |    | ||||||
|  |   Grid_init(&argc, &argv); | ||||||
|  |   GridLogLayout(); | ||||||
|  |  | ||||||
|  |   auto latt_size   = GridDefaultLatt(); | ||||||
|  |   auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); | ||||||
|  |   auto mpi_layout  = GridDefaultMpi(); | ||||||
|  |   GridCartesian               Grid(latt_size, simd_layout, mpi_layout); | ||||||
|  |    | ||||||
|  |   std::vector<int> seeds({1, 2, 3, 4, 5}); | ||||||
|  |   GridSerialRNG sRNG; | ||||||
|  |   GridParallelRNG pRNG(&Grid); | ||||||
|  |   pRNG.SeedFixedIntegers(seeds); | ||||||
|  |  | ||||||
|  |   LatticeGaugeField Umu(&Grid), Uflow(&Grid); | ||||||
|  |    | ||||||
|  |   typedef Grid::XmlReader       Serialiser; | ||||||
|  |   Serialiser Reader("input.xml", false, "root"); | ||||||
|  |   WFParameters WFPar(Reader); | ||||||
|  |   ConfParameters CPar(Reader); | ||||||
|  |   CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix); | ||||||
|  |   NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar); | ||||||
|  |  | ||||||
|  |   for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ | ||||||
|  |  | ||||||
|  |   CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG); | ||||||
|  |  | ||||||
|  |   std::cout << std::setprecision(15); | ||||||
|  |   std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl; | ||||||
|  |    | ||||||
|  |   std::string file_pre  = WFPar.path; | ||||||
|  |   std::string file_post = CPar.conf_prefix + "." + std::to_string(conf); | ||||||
|  |  | ||||||
|  |   WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval); | ||||||
|  |   WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){ | ||||||
|  |      | ||||||
|  |     typedef typename PeriodicGimplR::GaugeLinkField GaugeMat; | ||||||
|  |     typedef typename PeriodicGimplR::ComplexField ComplexField; | ||||||
|  |      | ||||||
|  |     assert(Nd == 4); | ||||||
|  |  | ||||||
|  |     // NOTE: | ||||||
|  |     // Ideally, turn the following into methods of the appropriate class | ||||||
|  |     /////////////   Compute Energy Density via Clover Leaf    ///////////////////////////////////////////////// | ||||||
|  |     ///// Taken from qcd/smearing/WilsonFlow.h | ||||||
|  |     //         For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3 | ||||||
|  |     //E = 1/2 tr( F_munu F_munu ) | ||||||
|  |     //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths: | ||||||
|  |     //F_01 F_02 F_03   F_12 F_13  F_23 | ||||||
|  |     GaugeMat F(U.Grid()); | ||||||
|  |     //LatticeComplexD R(U.Grid()); | ||||||
|  |     ComplexField R(U.Grid()); | ||||||
|  |     R = Zero(); | ||||||
|  |    | ||||||
|  |     for(int mu=0;mu<3;mu++){ | ||||||
|  |       for(int nu=mu+1;nu<4;nu++){ | ||||||
|  | 	WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu); | ||||||
|  | 	R = R + trace(F*F); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     R = (-1.0) * R; | ||||||
|  |      | ||||||
|  |     //// Taken from qcd/utils/WilsonLoops.h | ||||||
|  |      | ||||||
|  |     // Bx = -iF(y,z), By = -iF(z,x), Bz = -iF(x,y) | ||||||
|  |     GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid()); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir); | ||||||
|  |  | ||||||
|  |     // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z) | ||||||
|  |     GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid()); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir); | ||||||
|  |     WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir); | ||||||
|  |  | ||||||
|  |     double coeff = 8.0/(32.0*M_PI*M_PI); | ||||||
|  |     ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez); | ||||||
|  |     //ComplexField qfield Plq(U.Grid()); | ||||||
|  |     //WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U); | ||||||
|  |     //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0; | ||||||
|  |     //Plq = coeff * Plq; | ||||||
|  |  | ||||||
|  |     int tau = std::round(t); | ||||||
|  |     std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||||
|  |     writeFile(R,efile); | ||||||
|  |     std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; | ||||||
|  |     writeFile(qfield,tfile); | ||||||
|  |  | ||||||
|  |     RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); | ||||||
|  |     RealD T = real( sum(qfield) ); | ||||||
|  |     Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; | ||||||
|  |     RealD E0 = real(peekSite(R,scoor)); | ||||||
|  |     RealD T0 = real(peekSite(qfield,scoor)); | ||||||
|  |     std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  " | ||||||
|  | 	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl; | ||||||
|  |      | ||||||
|  |   }); | ||||||
|  |    | ||||||
|  |   int t=WFPar.maxTau;  // final flow time (note: truncated to int) | ||||||
|  |   WF.smear(Uflow, Umu); | ||||||
|  |  | ||||||
|  |   RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow); | ||||||
|  |   RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow); | ||||||
|  |   RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // plaquette-based energy density at flow time t | ||||||
|  |   RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow); | ||||||
|  |   std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout<< GridLogMessage << " Admissibility check:\n"; | ||||||
|  |   const double sp_adm = 0.067;                // admissible threshold | ||||||
|  |   const double pl_adm = 1.0-sp_adm/Nc; | ||||||
|  |   std::cout << GridLogMessage << "   (pl_adm =" << pl_adm << ")\n"; | ||||||
|  |  | ||||||
|  |   // Need min and reduce min for this function | ||||||
|  |   //double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm)); | ||||||
|  |   double sp_ave = Nc*(1.0-WFlow_plaq); | ||||||
|  |  | ||||||
|  |   //std::cout<< GridLogMessage << "   sp_max = "        << sp_max <<"\n"; | ||||||
|  |   std::cout<< GridLogMessage << "   sp_ave = "        << sp_ave <<"\n"; | ||||||
|  |   std::cout<< GridLogMessage << "   (sp_admissible = "<< sp_adm <<")\n"; | ||||||
|  |   //std::cout<< GridLogMessage << "   sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n"; | ||||||
|  |   std::cout<< GridLogMessage << "   sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n"; | ||||||
|  |   } | ||||||
|  |   Grid_finalize(); | ||||||
|  | }  // main | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | Input file example | ||||||
|  |  | ||||||
|  | NB: main() above reads "input.xml" through an XmlReader, so these | ||||||
|  | parameters must be supplied in the equivalent XML layout; the JSON form | ||||||
|  | is shown here only for compactness (values are illustrative). | ||||||
|  |  | ||||||
|  | JSON | ||||||
|  |  | ||||||
|  | { | ||||||
|  |     "WilsonFlow":{ | ||||||
|  | 	"steps": 200, | ||||||
|  | 	"step_size": 0.01, | ||||||
|  | 	"meas_interval": 50, | ||||||
|  | 	"meas_interval_density": 50, | ||||||
|  | 	"path": "./", | ||||||
|  | 	"maxTau": 2.0 | ||||||
|  |     }, | ||||||
|  |     "Configurations":{ | ||||||
|  | 	"conf_path": "./", | ||||||
|  | 	"conf_prefix": "ckpoint_lat", | ||||||
|  | 	"conf_smr_prefix": "ckpoint_lat_smr", | ||||||
|  | 	"rng_prefix": "ckpoint_rng", | ||||||
|  | 	"StartConfiguration": 3000, | ||||||
|  | 	"EndConfiguration": 3000, | ||||||
|  | 	"Skip": 5 | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | */ | ||||||
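
For reference, the densities accumulated by the measurement callback above correspond (up to the sign and normalisation conventions of WilsonLoops::FieldStrength, which returns an anti-hermitian field strength, so this is a sketch of the intended formulas rather than an authoritative statement) to

    E(x) = -\sum_{\mu<\nu} \mathrm{tr}\left( F_{\mu\nu}(x)\, F_{\mu\nu}(x) \right)
         = \tfrac{1}{2} \sum_{\mu,\nu} \mathrm{tr}\left( -F_{\mu\nu} F_{\mu\nu} \right),

    q(x) = \frac{1}{32\pi^2}\, \epsilon_{\mu\nu\rho\sigma}\, \mathrm{tr}\left( F_{\mu\nu}(x)\, F_{\rho\sigma}(x) \right)
         = \frac{8}{32\pi^2}\, \mathrm{tr}\left( B_x E_x + B_y E_y + B_z E_z \right),

which is where coeff = 8.0/(32.0*M_PI*M_PI) = 1/(4 pi^2) comes from. The admissibility check at the end of the loop implements the bound sp_ave = Nc (1 - <plaquette>) < sp_adm = 0.067, i.e. a configuration passes when its average plaquette exceeds pl_adm = 1 - sp_adm/Nc.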
| @@ -58,7 +58,7 @@ int main(int argc, char **argv) { | |||||||
|   HMCparameters HMCparams; |   HMCparameters HMCparams; | ||||||
|   HMCparams.StartTrajectory  = 0; |   HMCparams.StartTrajectory  = 0; | ||||||
|   HMCparams.Trajectories     = 200; |   HMCparams.Trajectories     = 200; | ||||||
|   HMCparams.NoMetropolisUntil=  20; |   HMCparams.NoMetropolisUntil=  0; | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||||
|   HMCparams.StartingType     =std::string("ColdStart"); |   HMCparams.StartingType     =std::string("ColdStart"); | ||||||
|   HMCparams.MD = MD; |   HMCparams.MD = MD; | ||||||
| @@ -70,7 +70,7 @@ int main(int argc, char **argv) { | |||||||
|   CheckpointerParameters CPparams; |   CheckpointerParameters CPparams; | ||||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; |   CPparams.config_prefix = "ckpoint_EODWF_lat"; | ||||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; |   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; | ||||||
|   CPparams.saveInterval  = 10; |   CPparams.saveInterval  = 1; | ||||||
|   CPparams.format        = "IEEE64BIG"; |   CPparams.format        = "IEEE64BIG"; | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||||
|  |  | ||||||
| @@ -186,6 +186,8 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   // HMC parameters are serialisable |   // HMC parameters are serialisable | ||||||
|  |   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file | ||||||
|  |   TheHMC.initializeGaugeFieldAndRNGs(U); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; |   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||||
|   TheHMC.Run();  // no smearing |   TheHMC.Run();  // no smearing | ||||||
|   | |||||||
| @@ -30,11 +30,13 @@ directory | |||||||
| #include <string> | #include <string> | ||||||
|  |  | ||||||
| template <class T> void readFile(T& out, std::string const fname){ | template <class T> void readFile(T& out, std::string const fname){ | ||||||
|  | #ifdef HAVE_LIME | ||||||
|   Grid::emptyUserRecord record; |   Grid::emptyUserRecord record; | ||||||
|   Grid::ScidacReader RD; |   Grid::ScidacReader RD; | ||||||
|   RD.open(fname); |   RD.open(fname); | ||||||
|   RD.readScidacFieldRecord(out,record); |   RD.readScidacFieldRecord(out,record); | ||||||
|   RD.close(); |   RD.close(); | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -31,11 +31,13 @@ directory | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| template <class T> void writeFile(T& out, std::string const fname){ | template <class T> void writeFile(T& out, std::string const fname){ | ||||||
|  | #ifdef HAVE_LIME | ||||||
|   emptyUserRecord record; |   emptyUserRecord record; | ||||||
|   ScidacWriter WR(out.Grid()->IsBoss()); |   ScidacWriter WR(out.Grid()->IsBoss()); | ||||||
|   WR.open(fname); |   WR.open(fname); | ||||||
|   WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); |   WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||||
|   WR.close(); |   WR.close(); | ||||||
|  | #endif | ||||||
| } | } | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| int main(int argc, char **argv) { | int main(int argc, char **argv) { | ||||||
|   | |||||||
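
Both HAVE_LIME guards above, like the writeFile in the Wilson-flow test (which carries a TODO about error reporting), silently skip all I/O when Grid is configured without --with-lime. A minimal sketch of a louder alternative, assuming a hard stop is preferred to a silent no-op (illustrative only, not part of this patch; assert is used rather than throw because some build scripts in this branch compile with -fno-exceptions):

    #include <cassert>
    #include <iostream>
    #include <string>

    template <class T> void writeFile(T& out, std::string const fname){
    #ifdef HAVE_LIME
      Grid::emptyUserRecord record;
      Grid::ScidacWriter WR(out.Grid()->IsBoss());
      WR.open(fname);
      WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
      WR.close();
    #else
      // Fail loudly: the caller asked for SciDAC output this build cannot produce.
      std::cout << Grid::GridLogError << "writeFile(" << fname
                << "): Grid was configured without LIME; no output written." << std::endl;
      assert(0);
    #endif
    }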
| @@ -261,23 +261,25 @@ public: | |||||||
|     fprintf(FP,"\n\n"); |     fprintf(FP,"\n\n"); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   template<class CComplex> | ||||||
|   static void BLAS(void) |   static void BLAS(void) | ||||||
|   { |   { | ||||||
|     //int nbasis, int nrhs, int coarseVol |     //int nbasis, int nrhs, int coarseVol | ||||||
|     int  basis[] = { 16,32,64 }; |     int  basis[] = { 16,32,64 }; | ||||||
|     int  rhs[]   = { 8,16,32 }; |     int  rhs[]   = { 8,12,16 }; | ||||||
|     int  vol  = 4*4*4*4; |     int  vol  = 8*8*8*8; | ||||||
|  |     int  blk  = 4*4*4*4; | ||||||
|  |  | ||||||
|     GridBLAS blas; |     GridBLAS blas; | ||||||
|      |  | ||||||
|  |     int fpbits = sizeof(CComplex)*4; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl; |     std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; |     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|    |    | ||||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n"); |     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits); | ||||||
|  |  | ||||||
|     for(int b=0;b<3;b++){ |     for(int b=0;b<3;b++){ | ||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
| @@ -285,7 +287,7 @@ public: | |||||||
|       int N=rhs[r]; |       int N=rhs[r]; | ||||||
|       int K=basis[b]; |       int K=basis[b]; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|        |        | ||||||
| @@ -299,9 +301,9 @@ public: | |||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
|       int M=basis[b]; |       int M=basis[b]; | ||||||
|       int N=rhs[r]; |       int N=rhs[r]; | ||||||
|       int K=vol; |       int K=blk; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
| @@ -313,10 +315,10 @@ public: | |||||||
|     for(int b=0;b<3;b++){ |     for(int b=0;b<3;b++){ | ||||||
|     for(int r=0;r<3;r++){ |     for(int r=0;r<3;r++){ | ||||||
|       int M=rhs[r]; |       int M=rhs[r]; | ||||||
|       int N=vol; |       int N=blk; | ||||||
|       int K=basis[b]; |       int K=basis[b]; | ||||||
|       int BATCH=vol; |       int BATCH=vol; | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |       double p=blas.benchmark<CComplex>(M,N,K,BATCH); | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||||
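
A note on the rates these loops report: each call times BATCH independent complex GEMMs of shape M x N x K. Assuming the conventional count of 8 M N K real floating-point operations per complex GEMM (the normalisation inside GridBLAS::benchmark is not shown in this hunk), the printed figure is approximately

    \text{Gflop/s per rank} \approx \frac{8\, M N K \times \mathrm{BATCH}}{10^{9}\; t_{\mathrm{rank}}}

The fpbits = sizeof(CComplex)*4 line reports 64 for ComplexD (16 bytes, i.e. two 64-bit reals) and 32 for ComplexF, labelling the double- and single-precision passes that main() now drives via Benchmark::BLAS<ComplexD>() and Benchmark::BLAS<ComplexF>().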
| @@ -867,6 +869,7 @@ int main (int argc, char ** argv) | |||||||
|   int do_memory=1; |   int do_memory=1; | ||||||
|   int do_comms =1; |   int do_comms =1; | ||||||
|   int do_blas  =1; |   int do_blas  =1; | ||||||
|  |   int do_dslash=1; | ||||||
|  |  | ||||||
|   int sel=4; |   int sel=4; | ||||||
|   std::vector<int> L_list({8,12,16,24,32}); |   std::vector<int> L_list({8,12,16,24,32}); | ||||||
| @@ -877,6 +880,7 @@ int main (int argc, char ** argv) | |||||||
|   std::vector<double> staggered; |   std::vector<double> staggered; | ||||||
|  |  | ||||||
|   int Ls=1; |   int Ls=1; | ||||||
|  |   if (do_dslash){ | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; |   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -901,6 +905,7 @@ int main (int argc, char ** argv) | |||||||
|     staggered.push_back(result); |     staggered.push_back(result); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; |   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -909,8 +914,33 @@ int main (int argc, char ** argv) | |||||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; | ||||||
|   } |   } | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   int NN=NN_global; |   int NN=NN_global; | ||||||
|  |   if(do_dslash){ | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; | ||||||
|  |     fprintf(FP,"Per node summary table\n"); | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     for(int l=0;l<L_list.size();l++){ | ||||||
|  |       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; | ||||||
|  |       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); | ||||||
|  |     } | ||||||
|  |     fprintf(FP,"\n"); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; | ||||||
|  |     std::cout<<std::setprecision(3); | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |    | ||||||
|   if ( do_memory ) { |   if ( do_memory ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; |     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||||
| @@ -918,15 +948,6 @@ int main (int argc, char ** argv) | |||||||
|     Benchmark::Memory(); |     Benchmark::Memory(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( do_blas ) { |  | ||||||
| #if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)    |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::BLAS(); |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if ( do_su4 ) { |   if ( do_su4 ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; |     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; | ||||||
| @@ -941,28 +962,14 @@ int main (int argc, char ** argv) | |||||||
|     Benchmark::Comms(); |     Benchmark::Comms(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   if ( do_blas ) { | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; |     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; |     Benchmark::BLAS<ComplexD>(); | ||||||
|     fprintf(FP,"Per node summary table\n"); |     Benchmark::BLAS<ComplexF>(); | ||||||
|     fprintf(FP,"\n"); |   } | ||||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); |    | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|     for(int l=0;l<L_list.size();l++){ |  | ||||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; |  | ||||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); |  | ||||||
|     } |  | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; |  | ||||||
|     std::cout<<std::setprecision(3); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   fclose(FP); |   fclose(FP); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,16 +1,18 @@ | |||||||
|  |  | ||||||
|  | export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "  | ||||||
|  | export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||||
| ../../configure \ | ../../configure \ | ||||||
| 	--enable-simd=GPU \ | 	--enable-simd=GPU \ | ||||||
| 	--enable-gen-simd-width=64 \ | 	--enable-gen-simd-width=64 \ | ||||||
| 	--enable-comms=mpi-auto \ | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--enable-debug \ | ||||||
| 	--disable-gparity \ | 	--disable-gparity \ | ||||||
| 	--disable-fermion-reps \ | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
| 	--enable-shm=nvlink \ | 	--enable-shm=nvlink \ | ||||||
| 	--enable-accelerator=sycl \ | 	--enable-accelerator=sycl \ | ||||||
| 	--enable-accelerator-aware-mpi=yes\ | 	--enable-accelerator-aware-mpi=yes\ | ||||||
| 	--enable-unified=no \ | 	--enable-unified=no \ | ||||||
| 	MPICXX=mpicxx \ | 	MPICXX=mpicxx \ | ||||||
| 	CXX=icpx \ | 	CXX=icpx  | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ |  | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
systems/Aurora/config-command-leak (new file, 23 lines)
							| @@ -0,0 +1,23 @@ | |||||||
|  | source ~/spack/share/spack/setup-env.sh  | ||||||
|  | spack load c-lime | ||||||
|  | export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||||
|  | export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||||
|  | export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
|  | ../../configure \ | ||||||
|  | 	--enable-debug \ | ||||||
|  | 	--enable-simd=GPU \ | ||||||
|  | 	--enable-gen-simd-width=64 \ | ||||||
|  | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--disable-gparity \ | ||||||
|  | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
|  | 	--enable-shm=nvlink \ | ||||||
|  | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=yes\ | ||||||
|  | 	--enable-unified=no \ | ||||||
|  | 	MPICXX=mpicxx \ | ||||||
|  | 	CXX=icpx \ | ||||||
|  | 	LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl  -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \ | ||||||
|  | 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host  -fsycl -fsanitize=leak " | ||||||
|  |  | ||||||
							
								
								
									
systems/Aurora/config-command-sanitize (new file, 22 lines)
							| @@ -0,0 +1,22 @@ | |||||||
|  | # -fsycl-targets=spir64_gen -Xs\" -device pvc \" | ||||||
|  | # -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc | ||||||
|  | # -fsycl-targets=intel_gpu_pvc | ||||||
|  |  | ||||||
|  | unset DEVICE | ||||||
|  | export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address"  | ||||||
|  | export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc " | ||||||
|  | ../../configure \ | ||||||
|  | 	--enable-simd=GPU \ | ||||||
|  | 	--enable-gen-simd-width=64 \ | ||||||
|  | 	--enable-comms=mpi-auto \ | ||||||
|  | 	--enable-debug \ | ||||||
|  | 	--disable-gparity \ | ||||||
|  | 	--disable-fermion-reps \ | ||||||
|  | 	--with-lime=$CLIME \ | ||||||
|  | 	--enable-shm=nvlink \ | ||||||
|  | 	--enable-accelerator=sycl \ | ||||||
|  | 	--enable-accelerator-aware-mpi=yes\ | ||||||
|  | 	--enable-unified=no \ | ||||||
|  | 	MPICXX=mpicxx \ | ||||||
|  | 	CXX=icpx  | ||||||
|  |  | ||||||
| @@ -1,14 +1,22 @@ | |||||||
|  | source ~/spack/share/spack/setup-env.sh  | ||||||
|  | spack load c-lime | ||||||
|  | export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` | ||||||
|  | #spack load libefence | ||||||
|  | #export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` | ||||||
|  | #export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH | ||||||
|  | #spack load gperftools | ||||||
|  | export TCMALLOC=/home/paboyle/gperftools/install | ||||||
|  | export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  | export INTELGT_AUTO_ATTACH_DISABLE=1 | ||||||
|  |  | ||||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 | ||||||
|  | #module load oneapi/release/2023.12.15.001 | ||||||
| module load oneapi/release/2023.12.15.001 |  | ||||||
|  |  | ||||||
| #module use /soft/modulefiles | #module use /soft/modulefiles | ||||||
| #module load intel_compute_runtime/release/agama-devel-682.22 | #module load intel_compute_runtime/release/agama-devel-682.22 | ||||||
|  |  | ||||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 | #export FI_CXI_DEFAULT_CQ_SIZE=131072 | ||||||
| export FI_CXI_CQ_FILL_PERCENT=20 | #export FI_CXI_CQ_FILL_PERCENT=20 | ||||||
|  | #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" |  | ||||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" | ||||||
|  |  | ||||||
| # | # | ||||||
| @@ -16,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | |||||||
| # -ftarget-register-alloc-mode=pvc:small | # -ftarget-register-alloc-mode=pvc:small | ||||||
| # -ftarget-register-alloc-mode=pvc:large | # -ftarget-register-alloc-mode=pvc:large | ||||||
| # -ftarget-register-alloc-mode=pvc:auto | # -ftarget-register-alloc-mode=pvc:auto | ||||||
| # | #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 | ||||||
|  |  | ||||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 | ||||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 | ||||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 | export http_proxy=http://proxy.alcf.anl.gov:3128 | ||||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 | export https_proxy=http://proxy.alcf.anl.gov:3128 | ||||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 |  | ||||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 | git config --global http.proxy http://proxy.alcf.anl.gov:3128 | ||||||
|  |  | ||||||
|  | #source ~/spack/share/spack/setup-env.sh | ||||||
|  | #spack load gperftools | ||||||
|  | #export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` | ||||||
|  | #export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" | ||||||
|   | |||||||
							
								
								
									
systems/Frontier/benchmarks/Benchmark_usqcd.csv (new file, 76 lines)
							| @@ -0,0 +1,76 @@ | |||||||
|  | Memory Bandwidth | ||||||
|  |  | ||||||
|  | Bytes, GB/s per node | ||||||
|  | 6291456, 379.297050 | ||||||
|  | 100663296, 3754.674992 | ||||||
|  | 509607936, 6521.472413 | ||||||
|  | 1610612736, 8513.456479 | ||||||
|  | 3932160000, 9018.901766 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | GEMM | ||||||
|  |  | ||||||
|  |  M, N, K, BATCH, GF/s per rank | ||||||
|  | 16, 8, 16, 256, 0.564958 | ||||||
|  | 16, 16, 16, 256, 243.148058 | ||||||
|  | 16, 32, 16, 256, 440.346877 | ||||||
|  | 32, 8, 32, 256, 439.194136 | ||||||
|  | 32, 16, 32, 256, 847.334141 | ||||||
|  | 32, 32, 32, 256, 1430.892623 | ||||||
|  | 64, 8, 64, 256, 1242.756741 | ||||||
|  | 64, 16, 64, 256, 2196.689493 | ||||||
|  | 64, 32, 64, 256, 3697.458072 | ||||||
|  | 16, 8, 256, 256, 899.582627 | ||||||
|  | 16, 16, 256, 256, 1673.537756 | ||||||
|  | 16, 32, 256, 256, 2959.597089 | ||||||
|  | 32, 8, 256, 256, 1558.858630 | ||||||
|  | 32, 16, 256, 256, 2864.839445 | ||||||
|  | 32, 32, 256, 256, 4810.671254 | ||||||
|  | 64, 8, 256, 256, 2386.092942 | ||||||
|  | 64, 16, 256, 256, 4451.665937 | ||||||
|  | 64, 32, 256, 256, 5942.124095 | ||||||
|  | 8, 256, 16, 256, 799.867271 | ||||||
|  | 16, 256, 16, 256, 1584.624888 | ||||||
|  | 32, 256, 16, 256, 1949.422338 | ||||||
|  | 8, 256, 32, 256, 1389.417474 | ||||||
|  | 16, 256, 32, 256, 2668.344493 | ||||||
|  | 32, 256, 32, 256, 3234.162120 | ||||||
|  | 8, 256, 64, 256, 2150.925128 | ||||||
|  | 16, 256, 64, 256, 4012.488132 | ||||||
|  | 32, 256, 64, 256, 5154.785521 | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Communications | ||||||
|  |  | ||||||
|  | Packet bytes, direction, GB/s per node | ||||||
|  | 4718592, 1, 245.026198 | ||||||
|  | 4718592, 2, 251.180996 | ||||||
|  | 4718592, 3, 361.110977 | ||||||
|  | 4718592, 5, 247.898447 | ||||||
|  | 4718592, 6, 249.867523 | ||||||
|  | 4718592, 7, 359.033061 | ||||||
|  | 15925248, 1, 255.030946 | ||||||
|  | 15925248, 2, 264.453890 | ||||||
|  | 15925248, 3, 392.949183 | ||||||
|  | 15925248, 5, 256.040644 | ||||||
|  | 15925248, 6, 264.681896 | ||||||
|  | 15925248, 7, 392.102622 | ||||||
|  | 37748736, 1, 258.823333 | ||||||
|  | 37748736, 2, 268.181577 | ||||||
|  | 37748736, 3, 401.478191 | ||||||
|  | 37748736, 5, 258.995363 | ||||||
|  | 37748736, 6, 268.206586 | ||||||
|  | 37748736, 7, 400.397611 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Per node summary table | ||||||
|  |  | ||||||
|  | L , Wilson, DWF4, Staggered, GF/s per node | ||||||
|  |  | ||||||
|  | 8 , 155, 1386, 50 | ||||||
|  | 12 , 694, 4208, 230 | ||||||
|  | 16 , 1841, 6675, 609 | ||||||
|  | 24 , 3934, 8573, 1641 | ||||||
|  | 32 , 5083, 9771, 3086 | ||||||
|  |  | ||||||
| 
systems/Frontier/benchmarks/Benchmark_usqcd.log (new file, 702 lines)
							| @@ -0,0 +1,702 @@ | |||||||
|  | RANK 1 using GPU 1 | ||||||
|  | RANK 5 using GPU 6 | ||||||
|  | RANK 0 using GPU 0 | ||||||
|  | RANK 2 using GPU 2 | ||||||
|  | RANK 3 using GPU 3 | ||||||
|  | RANK 6 using GPU 5 | ||||||
|  | RANK 7 using GPU 4 | ||||||
|  | RANK 4 using GPU 7 | ||||||
|  | world_rank 0 has 1 devices | ||||||
|  | AcceleratorHipInit: ======================== | ||||||
|  | AcceleratorHipInit: Device Number    : 0 | ||||||
|  | AcceleratorHipInit: ======================== | ||||||
|  | AcceleratorHipInit: Device identifier: AMD Instinct MI250X | ||||||
|  | AcceleratorHipInit:   totalGlobalMem: 68702699520  | ||||||
|  | AcceleratorHipInit:   isMultiGpuBoard: 0  | ||||||
|  | AcceleratorHipInit:   warpSize: 64  | ||||||
|  | AcceleratorHipInit: using default device  | ||||||
|  | AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding  | ||||||
|  | AcceleratorHipInit: Configure options --enable-setdevice=no  | ||||||
|  | local rank 0 device 0 bus id: 0000:c1:00.0 | ||||||
|  | AcceleratorHipInit: ================================================ | ||||||
|  | SharedMemoryMpi:  World communicator of size 8 | ||||||
|  | SharedMemoryMpi:  Node  communicator of size 8 | ||||||
|  | 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers  | ||||||
|  | Setting up IPC | ||||||
|  |  | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  | __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__ | ||||||
|  | __|_                                    _|__ | ||||||
|  | __|_   GGGG    RRRR    III    DDDD      _|__ | ||||||
|  | __|_  G        R   R    I     D   D     _|__ | ||||||
|  | __|_  G        R   R    I     D    D    _|__ | ||||||
|  | __|_  G  GG    RRRR     I     D    D    _|__ | ||||||
|  | __|_  G   G    R  R     I     D   D     _|__ | ||||||
|  | __|_   GGGG    R   R   III    DDDD      _|__ | ||||||
|  | __|_                                    _|__ | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  | __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | ||||||
|  |   |  |  |  |  |  |  |  |  |  |  |  |  |  |   | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  | Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean | ||||||
|  |  | ||||||
|  | Grid : Message : ================================================  | ||||||
|  | Grid : Message : MPI is initialised and logging filters activated  | ||||||
|  | Grid : Message : ================================================  | ||||||
|  | Grid : Message : This rank is running on host frontier01320 | ||||||
|  | Grid : Message : Requested 4294967296 byte stencil comms buffers  | ||||||
|  | Grid : Message : MemoryManager Cache 54962159616 bytes  | ||||||
|  | Grid : Message : MemoryManager::Init() setting up | ||||||
|  | Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0 | ||||||
|  | Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0 | ||||||
|  | Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0 | ||||||
|  | Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory | ||||||
|  | Grid : Message : MemoryManager::Init() Using hipMalloc | ||||||
|  | Grid : Message : 0.293720 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293790 s : = Grid is setup to use 1 threads | ||||||
|  | Grid : Message : 0.293800 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293810 s : Grid Default Decomposition patterns | ||||||
|  | Grid : Message : 0.293810 s : 	OpenMP threads : 1 | ||||||
|  | Grid : Message : 0.293820 s : 	MPI tasks      : 1 2 2 2  | ||||||
|  | Grid : Message : 0.293870 s : 	vReal          : 512bits ; 1 2 2 2  | ||||||
|  | Grid : Message : 0.293890 s : 	vRealF         : 512bits ; 2 2 2 2  | ||||||
|  | Grid : Message : 0.293910 s : 	vRealD         : 512bits ; 1 2 2 2  | ||||||
|  | Grid : Message : 0.293920 s : 	vComplex       : 512bits ; 1 1 2 2  | ||||||
|  | Grid : Message : 0.293930 s : 	vComplexF      : 512bits ; 1 2 2 2  | ||||||
|  | Grid : Message : 0.293960 s : 	vComplexD      : 512bits ; 1 1 2 2  | ||||||
|  | Grid : Message : 0.293970 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293980 s : ================================================================================== | ||||||
|  | Grid : Message : 0.293990 s :  Clover dslash 4D vectorised (temporarily Wilson) | ||||||
|  | Grid : Message : 0.294000 s : ================================================================================== | ||||||
|  | Grid : Message : 0.301330 s : ================================================================================== | ||||||
|  | Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume  | ||||||
|  | Grid : Message : 0.301370 s : * Nc             : 3 | ||||||
|  | Grid : Message : 0.301380 s : * Global volume  : 8 16 16 16  | ||||||
|  | Grid : Message : 0.301410 s : * Ls             : 1 | ||||||
|  | Grid : Message : 0.301420 s : * ranks          : 8 | ||||||
|  | Grid : Message : 0.301430 s : * nodes          : 1 | ||||||
|  | Grid : Message : 0.301440 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 0.301450 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 0.301460 s : * Using 1 threads | ||||||
|  | Grid : Message : 0.301470 s : ================================================================================== | ||||||
|  | Grid : Message : 0.345030 s : Initialised RNGs | ||||||
|  | Grid : Message : 0.158302 s : ================================================================================== | ||||||
|  | Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.158312 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.158313 s : ================================================================================== | ||||||
|  | Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344 | ||||||
|  | Grid : Message : 0.240711 s : Deo mflop/s =   154914.0 (130.8) 139367.7-159565.9 | ||||||
|  | Grid : Message : 0.240715 s : Deo mflop/s per rank   19364.3 | ||||||
|  | Grid : Message : 0.240716 s : Deo mflop/s per node   154914.0 | ||||||
|  | Grid : Message : 0.240718 s : ================================================================================== | ||||||
|  | Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.240719 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.240719 s : ================================================================================== | ||||||
|  | Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 0.315033 s : Deo mflop/s =   151459.5 (142.0) 131856.9-157286.4 | ||||||
|  | Grid : Message : 0.315036 s : Deo mflop/s per rank   18932.4 | ||||||
|  | Grid : Message : 0.315037 s : Deo mflop/s per node   151459.5 | ||||||
|  | Grid : Message : 0.315038 s : ================================================================================== | ||||||
|  | Grid : Message : 0.315040 s : 8^4 x 1 Deo Best  mflop/s        =   154914.0 ; 154914.0 per node  | ||||||
|  | Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s        =   151459.5 ; 151459.5 per node  | ||||||
|  | Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ;  | ||||||
|  | Grid : Message : 0.315044 s : ================================================================================== | ||||||
|  | Grid : Message : 0.316507 s : ================================================================================== | ||||||
|  | Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume  | ||||||
|  | Grid : Message : 0.316511 s : * Nc             : 3 | ||||||
|  | Grid : Message : 0.316512 s : * Global volume  : 12 24 24 24  | ||||||
|  | Grid : Message : 0.316515 s : * Ls             : 1 | ||||||
|  | Grid : Message : 0.316516 s : * ranks          : 8 | ||||||
|  | Grid : Message : 0.316517 s : * nodes          : 1 | ||||||
|  | Grid : Message : 0.316518 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 0.316518 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 0.316519 s : * Using 1 threads | ||||||
|  | Grid : Message : 0.316520 s : ================================================================================== | ||||||
|  | Grid : Message : 0.327883 s : Initialised RNGs | ||||||
|  | Grid : Message : 0.786395 s : ================================================================================== | ||||||
|  | Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.786406 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.786406 s : ================================================================================== | ||||||
|  | Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 0.871659 s : Deo mflop/s =   684982.2 (632.4) 609162.5-714594.5 | ||||||
|  | Grid : Message : 0.871663 s : Deo mflop/s per rank   85622.8 | ||||||
|  | Grid : Message : 0.871664 s : Deo mflop/s per node   684982.2 | ||||||
|  | Grid : Message : 0.871665 s : ================================================================================== | ||||||
|  | Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 0.871665 s : * SINGLE precision  | ||||||
|  | Grid : Message : 0.871665 s : ================================================================================== | ||||||
|  | Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 0.953702 s : Deo mflop/s =   693556.6 (576.5) 663552.0-719204.7 | ||||||
|  | Grid : Message : 0.953705 s : Deo mflop/s per rank   86694.6 | ||||||
|  | Grid : Message : 0.953706 s : Deo mflop/s per node   693556.6 | ||||||
|  | Grid : Message : 0.953707 s : ================================================================================== | ||||||
|  | Grid : Message : 0.953708 s : 12^4 x 1 Deo Best  mflop/s        =   693556.6 ; 693556.6 per node  | ||||||
|  | Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s        =   684982.2 ; 684982.2 per node  | ||||||
|  | Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ;  | ||||||
|  | Grid : Message : 0.953713 s : ================================================================================== | ||||||
|  | Grid : Message : 0.957609 s : ================================================================================== | ||||||
|  | Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume  | ||||||
|  | Grid : Message : 0.957614 s : * Nc             : 3 | ||||||
|  | Grid : Message : 0.957615 s : * Global volume  : 16 32 32 32  | ||||||
|  | Grid : Message : 0.957620 s : * Ls             : 1 | ||||||
|  | Grid : Message : 0.957621 s : * ranks          : 8 | ||||||
|  | Grid : Message : 0.957622 s : * nodes          : 1 | ||||||
|  | Grid : Message : 0.957623 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 0.957623 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 0.957624 s : * Using 1 threads | ||||||
|  | Grid : Message : 0.957625 s : ================================================================================== | ||||||
|  | Grid : Message : 0.985828 s : Initialised RNGs | ||||||
|  | Grid : Message : 2.379761 s : ================================================================================== | ||||||
|  | Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 2.379774 s : * SINGLE precision  | ||||||
|  | Grid : Message : 2.379775 s : ================================================================================== | ||||||
|  | Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 2.486725 s : Deo mflop/s =   1803226.1 (1139.4) 1646362.3-1864135.1 | ||||||
|  | Grid : Message : 2.486729 s : Deo mflop/s per rank   225403.3 | ||||||
|  | Grid : Message : 2.486731 s : Deo mflop/s per node   1803226.1 | ||||||
|  | Grid : Message : 2.486732 s : ================================================================================== | ||||||
|  | Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 2.486732 s : * SINGLE precision  | ||||||
|  | Grid : Message : 2.486732 s : ================================================================================== | ||||||
|  | Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 2.584412 s : Deo mflop/s =   1840587.3 (1119.6) 1779401.7-1914791.0 | ||||||
|  | Grid : Message : 2.584415 s : Deo mflop/s per rank   230073.4 | ||||||
|  | Grid : Message : 2.584416 s : Deo mflop/s per node   1840587.3 | ||||||
|  | Grid : Message : 2.584417 s : ================================================================================== | ||||||
|  | Grid : Message : 2.584418 s : 16^4 x 1 Deo Best  mflop/s        =   1840587.3 ; 1840587.3 per node  | ||||||
|  | Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s        =   1803226.1 ; 1803226.1 per node  | ||||||
|  | Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ;  | ||||||
|  | Grid : Message : 2.584423 s : ================================================================================== | ||||||
|  | Grid : Message : 2.592858 s : ================================================================================== | ||||||
|  | Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume  | ||||||
|  | Grid : Message : 2.592863 s : * Nc             : 3 | ||||||
|  | Grid : Message : 2.592864 s : * Global volume  : 24 48 48 48  | ||||||
|  | Grid : Message : 2.592869 s : * Ls             : 1 | ||||||
|  | Grid : Message : 2.592870 s : * ranks          : 8 | ||||||
|  | Grid : Message : 2.592871 s : * nodes          : 1 | ||||||
|  | Grid : Message : 2.592872 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 2.592872 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 2.592873 s : * Using 1 threads | ||||||
|  | Grid : Message : 2.592874 s : ================================================================================== | ||||||
|  | Grid : Message : 2.715623 s : Initialised RNGs | ||||||
|  | Grid : Message : 9.608838 s : ================================================================================== | ||||||
|  | Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 9.608854 s : * SINGLE precision  | ||||||
|  | Grid : Message : 9.608855 s : ================================================================================== | ||||||
|  | Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 9.870309 s : Deo mflop/s =   3861903.3 (1708.9) 3511078.3-3937368.2 | ||||||
|  | Grid : Message : 9.870313 s : Deo mflop/s per rank   482737.9 | ||||||
|  | Grid : Message : 9.870314 s : Deo mflop/s per node   3861903.3 | ||||||
|  | Grid : Message : 9.870315 s : ================================================================================== | ||||||
|  | Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 9.870317 s : * SINGLE precision  | ||||||
|  | Grid : Message : 9.870317 s : ================================================================================== | ||||||
|  | Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 10.101624 s : Deo mflop/s =   3933599.5 (1412.7) 3835758.7-4008152.3 | ||||||
|  | Grid : Message : 10.101627 s : Deo mflop/s per rank   491699.9 | ||||||
|  | Grid : Message : 10.101628 s : Deo mflop/s per node   3933599.5 | ||||||
|  | Grid : Message : 10.101629 s : ================================================================================== | ||||||
|  | Grid : Message : 10.101629 s : 24^4 x 1 Deo Best  mflop/s        =   3933599.5 ; 3933599.5 per node  | ||||||
|  | Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s        =   3861903.3 ; 3861903.3 per node  | ||||||
|  | Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ;  | ||||||
|  | Grid : Message : 10.101634 s : ================================================================================== | ||||||
|  | Grid : Message : 10.139642 s : ================================================================================== | ||||||
|  | Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume  | ||||||
|  | Grid : Message : 10.139653 s : * Nc             : 3 | ||||||
|  | Grid : Message : 10.139654 s : * Global volume  : 32 64 64 64  | ||||||
|  | Grid : Message : 10.139661 s : * Ls             : 1 | ||||||
|  | Grid : Message : 10.139661 s : * ranks          : 8 | ||||||
|  | Grid : Message : 10.139662 s : * nodes          : 1 | ||||||
|  | Grid : Message : 10.139662 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 10.139662 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 10.139663 s : * Using 1 threads | ||||||
|  | Grid : Message : 10.139663 s : ================================================================================== | ||||||
|  | Grid : Message : 10.502161 s : Initialised RNGs | ||||||
|  | Grid : Message : 32.211092 s : ================================================================================== | ||||||
|  | Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 32.211109 s : * SINGLE precision  | ||||||
|  | Grid : Message : 32.211110 s : ================================================================================== | ||||||
|  | Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 32.841732 s : Deo mflop/s =   4988499.9 (2722.5) 4244837.8-5120022.3 | ||||||
|  | Grid : Message : 32.841736 s : Deo mflop/s per rank   623562.5 | ||||||
|  | Grid : Message : 32.841737 s : Deo mflop/s per node   4988499.9 | ||||||
|  | Grid : Message : 32.841738 s : ================================================================================== | ||||||
|  | Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 32.841740 s : * SINGLE precision  | ||||||
|  | Grid : Message : 32.841740 s : ================================================================================== | ||||||
|  | Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 33.407442 s : Deo mflop/s =   5082758.0 (1883.1) 4971027.0-5205119.6 | ||||||
|  | Grid : Message : 33.407446 s : Deo mflop/s per rank   635344.7 | ||||||
|  | Grid : Message : 33.407447 s : Deo mflop/s per node   5082758.0 | ||||||
|  | Grid : Message : 33.407448 s : ================================================================================== | ||||||
|  | Grid : Message : 33.407448 s : 32^4 x 1 Deo Best  mflop/s        =   5082758.0 ; 5082758.0 per node  | ||||||
|  | Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s        =   4988499.9 ; 4988499.9 per node  | ||||||
|  | Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ;  | ||||||
|  | Grid : Message : 33.407453 s : ================================================================================== | ||||||
|  | Grid : Message : 33.506785 s : ================================================================================== | ||||||
|  | Grid : Message : 33.506798 s :  Domain wall dslash 4D vectorised | ||||||
|  | Grid : Message : 33.506799 s : ================================================================================== | ||||||
|  | Grid : Message : 33.530686 s : ================================================================================== | ||||||
|  | Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume  | ||||||
|  | Grid : Message : 33.530690 s : * Nc             : 3 | ||||||
|  | Grid : Message : 33.530691 s : * Global volume  : 8 16 16 16  | ||||||
|  | Grid : Message : 33.530698 s : * Ls             : 12 | ||||||
|  | Grid : Message : 33.530699 s : * ranks          : 8 | ||||||
|  | Grid : Message : 33.530700 s : * nodes          : 1 | ||||||
|  | Grid : Message : 33.530701 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 33.530702 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 33.530703 s : * Using 1 threads | ||||||
|  | Grid : Message : 33.530704 s : ================================================================================== | ||||||
|  | Grid : Message : 33.545465 s : Initialised RNGs | ||||||
|  | Grid : Message : 33.752384 s : ================================================================================== | ||||||
|  | Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 33.752399 s : * SINGLE precision  | ||||||
|  | Grid : Message : 33.752400 s : ================================================================================== | ||||||
|  | Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 33.851977 s : Deo mflop/s =   1383287.7 (849.8) 1321205.8-1420651.4 | ||||||
|  | Grid : Message : 33.851981 s : Deo mflop/s per rank   172911.0 | ||||||
|  | Grid : Message : 33.851983 s : Deo mflop/s per node   1383287.7 | ||||||
|  | Grid : Message : 33.851984 s : ================================================================================== | ||||||
|  | Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 33.851984 s : * SINGLE precision  | ||||||
|  | Grid : Message : 33.851984 s : ================================================================================== | ||||||
|  | Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 33.949240 s : Deo mflop/s =   1386335.8 (734.6) 1341325.6-1428330.6 | ||||||
|  | Grid : Message : 33.949243 s : Deo mflop/s per rank   173292.0 | ||||||
|  | Grid : Message : 33.949244 s : Deo mflop/s per node   1386335.8 | ||||||
|  | Grid : Message : 33.949245 s : ================================================================================== | ||||||
|  | Grid : Message : 33.949245 s : 8^4 x 12 Deo Best  mflop/s        =   1386335.8 ; 1386335.8 per node  | ||||||
|  | Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s        =   1383287.7 ; 1383287.7 per node  | ||||||
|  | Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ;  | ||||||
|  | Grid : Message : 33.949250 s : ================================================================================== | ||||||
|  | Grid : Message : 33.952789 s : ================================================================================== | ||||||
|  | Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume  | ||||||
|  | Grid : Message : 33.952794 s : * Nc             : 3 | ||||||
|  | Grid : Message : 33.952795 s : * Global volume  : 12 24 24 24  | ||||||
|  | Grid : Message : 33.952800 s : * Ls             : 12 | ||||||
|  | Grid : Message : 33.952801 s : * ranks          : 8 | ||||||
|  | Grid : Message : 33.952802 s : * nodes          : 1 | ||||||
|  | Grid : Message : 33.952803 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 33.952803 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 33.952804 s : * Using 1 threads | ||||||
|  | Grid : Message : 33.952805 s : ================================================================================== | ||||||
|  | Grid : Message : 34.362200 s : Initialised RNGs | ||||||
|  | Grid : Message : 34.969821 s : ================================================================================== | ||||||
|  | Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 34.969834 s : * SINGLE precision  | ||||||
|  | Grid : Message : 34.969835 s : ================================================================================== | ||||||
|  | Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 35.135558 s : Deo mflop/s =   4208495.6 (2165.0) 4053699.5-4315228.5 | ||||||
|  | Grid : Message : 35.135562 s : Deo mflop/s per rank   526062.0 | ||||||
|  | Grid : Message : 35.135563 s : Deo mflop/s per node   4208495.6 | ||||||
|  | Grid : Message : 35.135564 s : ================================================================================== | ||||||
|  | Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 35.135565 s : * SINGLE precision  | ||||||
|  | Grid : Message : 35.135565 s : ================================================================================== | ||||||
|  | Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 35.299715 s : Deo mflop/s =   4156968.7 (1450.2) 4053699.5-4219939.5 | ||||||
|  | Grid : Message : 35.299718 s : Deo mflop/s per rank   519621.1 | ||||||
|  | Grid : Message : 35.299719 s : Deo mflop/s per node   4156968.7 | ||||||
|  | Grid : Message : 35.299721 s : ================================================================================== | ||||||
|  | Grid : Message : 35.299721 s : 12^4 x 12 Deo Best  mflop/s        =   4208495.6 ; 4208495.6 per node  | ||||||
|  | Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s        =   4156968.7 ; 4156968.7 per node  | ||||||
|  | Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ;  | ||||||
|  | Grid : Message : 35.299726 s : ================================================================================== | ||||||
|  | Grid : Message : 35.309687 s : ================================================================================== | ||||||
|  | Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume  | ||||||
|  | Grid : Message : 35.309694 s : * Nc             : 3 | ||||||
|  | Grid : Message : 35.309695 s : * Global volume  : 16 32 32 32  | ||||||
|  | Grid : Message : 35.309701 s : * Ls             : 12 | ||||||
|  | Grid : Message : 35.309702 s : * ranks          : 8 | ||||||
|  | Grid : Message : 35.309703 s : * nodes          : 1 | ||||||
|  | Grid : Message : 35.309704 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 35.309704 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 35.309705 s : * Using 1 threads | ||||||
|  | Grid : Message : 35.309706 s : ================================================================================== | ||||||
|  | Grid : Message : 35.448780 s : Initialised RNGs | ||||||
|  | Grid : Message : 38.468764 s : ================================================================================== | ||||||
|  | Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 38.468779 s : * SINGLE precision  | ||||||
|  | Grid : Message : 38.468780 s : ================================================================================== | ||||||
|  | Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 38.801040 s : Deo mflop/s =   6674673.6 (2168.6) 6484445.4-6797200.1 | ||||||
|  | Grid : Message : 38.801044 s : Deo mflop/s per rank   834334.2 | ||||||
|  | Grid : Message : 38.801045 s : Deo mflop/s per node   6674673.6 | ||||||
|  | Grid : Message : 38.801046 s : ================================================================================== | ||||||
|  | Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 38.801049 s : * SINGLE precision  | ||||||
|  | Grid : Message : 38.801049 s : ================================================================================== | ||||||
|  | Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 39.129783 s : Deo mflop/s =   6560128.4 (2117.4) 6405846.1-6679081.3 | ||||||
|  | Grid : Message : 39.129786 s : Deo mflop/s per rank   820016.1 | ||||||
|  | Grid : Message : 39.129787 s : Deo mflop/s per node   6560128.4 | ||||||
|  | Grid : Message : 39.129788 s : ================================================================================== | ||||||
|  | Grid : Message : 39.129788 s : 16^4 x 12 Deo Best  mflop/s        =   6674673.6 ; 6674673.6 per node  | ||||||
|  | Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s        =   6560128.4 ; 6560128.4 per node  | ||||||
|  | Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ;  | ||||||
|  | Grid : Message : 39.129795 s : ================================================================================== | ||||||
|  | Grid : Message : 39.161251 s : ================================================================================== | ||||||
|  | Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume  | ||||||
|  | Grid : Message : 39.161266 s : * Nc             : 3 | ||||||
|  | Grid : Message : 39.161267 s : * Global volume  : 24 48 48 48  | ||||||
|  | Grid : Message : 39.161274 s : * Ls             : 12 | ||||||
|  | Grid : Message : 39.161275 s : * ranks          : 8 | ||||||
|  | Grid : Message : 39.161276 s : * nodes          : 1 | ||||||
|  | Grid : Message : 39.161277 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 39.161277 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 39.161278 s : * Using 1 threads | ||||||
|  | Grid : Message : 39.161279 s : ================================================================================== | ||||||
|  | Grid : Message : 39.911996 s : Initialised RNGs | ||||||
|  | Grid : Message : 54.971914 s : ================================================================================== | ||||||
|  | Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 54.971930 s : * SINGLE precision  | ||||||
|  | Grid : Message : 54.971931 s : ================================================================================== | ||||||
|  | Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 56.309462 s : Deo mflop/s =   8572660.7 (1374.9) 8483366.4-8644399.6 | ||||||
|  | Grid : Message : 56.309467 s : Deo mflop/s per rank   1071582.6 | ||||||
|  | Grid : Message : 56.309468 s : Deo mflop/s per node   8572660.7 | ||||||
|  | Grid : Message : 56.309469 s : ================================================================================== | ||||||
|  | Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 56.309473 s : * SINGLE precision  | ||||||
|  | Grid : Message : 56.309474 s : ================================================================================== | ||||||
|  | Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 57.640714 s : Deo mflop/s =   8200141.3 (1445.8) 8113545.6-8286307.9 | ||||||
|  | Grid : Message : 57.640717 s : Deo mflop/s per rank   1025017.7 | ||||||
|  | Grid : Message : 57.640718 s : Deo mflop/s per node   8200141.3 | ||||||
|  | Grid : Message : 57.640719 s : ================================================================================== | ||||||
|  | Grid : Message : 57.640720 s : 24^4 x 12 Deo Best  mflop/s        =   8572660.7 ; 8572660.7 per node  | ||||||
|  | Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s        =   8200141.3 ; 8200141.3 per node  | ||||||
|  | Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ;  | ||||||
|  | Grid : Message : 57.640727 s : ================================================================================== | ||||||
|  | Grid : Message : 57.806175 s : ================================================================================== | ||||||
|  | Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume  | ||||||
|  | Grid : Message : 57.806191 s : * Nc             : 3 | ||||||
|  | Grid : Message : 57.806192 s : * Global volume  : 32 64 64 64  | ||||||
|  | Grid : Message : 57.806200 s : * Ls             : 12 | ||||||
|  | Grid : Message : 57.806200 s : * ranks          : 8 | ||||||
|  | Grid : Message : 57.806200 s : * nodes          : 1 | ||||||
|  | Grid : Message : 57.806200 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 57.806200 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 57.806201 s : * Using 1 threads | ||||||
|  | Grid : Message : 57.806201 s : ================================================================================== | ||||||
|  | Grid : Message : 60.313153 s : Initialised RNGs | ||||||
|  | Grid : Message : 107.830286 s : ================================================================================== | ||||||
|  | Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels | ||||||
|  | Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 107.830308 s : * SINGLE precision  | ||||||
|  | Grid : Message : 107.830309 s : ================================================================================== | ||||||
|  | Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 111.479625 s : Deo mflop/s =   9771387.8 (1000.8) 9688589.9-9830800.0 | ||||||
|  | Grid : Message : 111.479629 s : Deo mflop/s per rank   1221423.5 | ||||||
|  | Grid : Message : 111.479630 s : Deo mflop/s per node   9771387.8 | ||||||
|  | Grid : Message : 111.479631 s : ================================================================================== | ||||||
|  | Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels | ||||||
|  | Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute | ||||||
|  | Grid : Message : 111.479631 s : * SINGLE precision  | ||||||
|  | Grid : Message : 111.479631 s : ================================================================================== | ||||||
|  | Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0 | ||||||
|  | Grid : Message : 115.406573 s : Deo mflop/s =   8785297.3 (1739.6) 8628282.5-8911307.5 | ||||||
|  | Grid : Message : 115.406576 s : Deo mflop/s per rank   1098162.2 | ||||||
|  | Grid : Message : 115.406577 s : Deo mflop/s per node   8785297.3 | ||||||
|  | Grid : Message : 115.406578 s : ================================================================================== | ||||||
|  | Grid : Message : 115.406578 s : 32^4 x 12 Deo Best  mflop/s        =   9771387.8 ; 9771387.8 per node  | ||||||
|  | Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s        =   8785297.3 ; 8785297.3 per node  | ||||||
|  | Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ;  | ||||||
|  | Grid : Message : 115.406582 s : ================================================================================== | ||||||
|  | Grid : Message : 115.918888 s : ================================================================================== | ||||||
|  | Grid : Message : 115.918902 s :  Improved Staggered dslash 4D vectorised | ||||||
|  | Grid : Message : 115.918903 s : ================================================================================== | ||||||
|  | Grid : Message : 115.920344 s : ================================================================================== | ||||||
|  | Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume  | ||||||
|  | Grid : Message : 115.920347 s : * Global volume  : 8 16 16 16  | ||||||
|  | Grid : Message : 115.920354 s : * ranks          : 8 | ||||||
|  | Grid : Message : 115.920355 s : * nodes          : 1 | ||||||
|  | Grid : Message : 115.920356 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 115.920357 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 115.920376 s : * Using 1 threads | ||||||
|  | Grid : Message : 115.920377 s : ================================================================================== | ||||||
|  | Grid : Message : 115.923522 s : Initialised RNGs | ||||||
|  | Grid : Message : 116.904870 s : ================================================================================== | ||||||
|  | Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 116.904960 s : * SINGLE precision  | ||||||
|  | Grid : Message : 116.904970 s : ================================================================================== | ||||||
|  | Grid : Message : 116.288979 s : Deo mflop/s =   49708.9 (22.9) 44075.3-50609.3 | ||||||
|  | Grid : Message : 116.289000 s : Deo mflop/s per rank   6213.6 | ||||||
|  | Grid : Message : 116.289002 s : Deo mflop/s per node   49708.9 | ||||||
|  | Grid : Message : 116.289003 s : ================================================================================== | ||||||
|  | Grid : Message : 116.289004 s : * SINGLE precision  | ||||||
|  | Grid : Message : 116.289005 s : ================================================================================== | ||||||
|  | Grid : Message : 116.481632 s : Deo mflop/s =   49737.1 (13.5) 48517.0-50338.0 | ||||||
|  | Grid : Message : 116.481639 s : Deo mflop/s per rank   6217.1 | ||||||
|  | Grid : Message : 116.481640 s : Deo mflop/s per node   49737.1 | ||||||
|  | Grid : Message : 116.481641 s : ================================================================================== | ||||||
|  | Grid : Message : 116.481642 s : 8^4  Deo Best  mflop/s        =   49737.1 ; 49737.1 per node  | ||||||
|  | Grid : Message : 116.481644 s : 8^4  Deo Worst mflop/s        =   49708.9 ; 49708.9 per node  | ||||||
|  | Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ;  | ||||||
|  | Grid : Message : 116.481647 s : ================================================================================== | ||||||
|  | Grid : Message : 116.483458 s : ================================================================================== | ||||||
|  | Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume  | ||||||
|  | Grid : Message : 116.483462 s : * Global volume  : 12 24 24 24  | ||||||
|  | Grid : Message : 116.483465 s : * ranks          : 8 | ||||||
|  | Grid : Message : 116.483466 s : * nodes          : 1 | ||||||
|  | Grid : Message : 116.483466 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 116.483466 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 116.483467 s : * Using 1 threads | ||||||
|  | Grid : Message : 116.483468 s : ================================================================================== | ||||||
|  | Grid : Message : 116.489279 s : Initialised RNGs | ||||||
|  | Grid : Message : 116.945016 s : ================================================================================== | ||||||
|  | Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 116.945026 s : * SINGLE precision  | ||||||
|  | Grid : Message : 116.945027 s : ================================================================================== | ||||||
|  | Grid : Message : 117.159821 s : Deo mflop/s =   229778.4 (89.5) 223656.1-233547.5 | ||||||
|  | Grid : Message : 117.159835 s : Deo mflop/s per rank   28722.3 | ||||||
|  | Grid : Message : 117.159837 s : Deo mflop/s per node   229778.4 | ||||||
|  | Grid : Message : 117.159838 s : ================================================================================== | ||||||
|  | Grid : Message : 117.159838 s : * SINGLE precision  | ||||||
|  | Grid : Message : 117.159838 s : ================================================================================== | ||||||
|  | Grid : Message : 117.371102 s : Deo mflop/s =   229516.6 (61.8) 225781.1-233547.5 | ||||||
|  | Grid : Message : 117.371109 s : Deo mflop/s per rank   28689.6 | ||||||
|  | Grid : Message : 117.371110 s : Deo mflop/s per node   229516.6 | ||||||
|  | Grid : Message : 117.371111 s : ================================================================================== | ||||||
|  | Grid : Message : 117.371111 s : 12^4  Deo Best  mflop/s        =   229778.4 ; 229778.4 per node  | ||||||
|  | Grid : Message : 117.371113 s : 12^4  Deo Worst mflop/s        =   229516.6 ; 229516.6 per node  | ||||||
|  | Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ;  | ||||||
|  | Grid : Message : 117.371116 s : ================================================================================== | ||||||
|  | Grid : Message : 117.373669 s : ================================================================================== | ||||||
|  | Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume  | ||||||
|  | Grid : Message : 117.373674 s : * Global volume  : 16 32 32 32  | ||||||
|  | Grid : Message : 117.373678 s : * ranks          : 8 | ||||||
|  | Grid : Message : 117.373679 s : * nodes          : 1 | ||||||
|  | Grid : Message : 117.373679 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 117.373679 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 117.373680 s : * Using 1 threads | ||||||
|  | Grid : Message : 117.373681 s : ================================================================================== | ||||||
|  | Grid : Message : 117.386495 s : Initialised RNGs | ||||||
|  | Grid : Message : 118.755695 s : ================================================================================== | ||||||
|  | Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 118.755707 s : * SINGLE precision  | ||||||
|  | Grid : Message : 118.755708 s : ================================================================================== | ||||||
|  | Grid : Message : 119.178990 s : Deo mflop/s =   608844.0 (126.1) 596065.5-615608.7 | ||||||
|  | Grid : Message : 119.179160 s : Deo mflop/s per rank   76105.5 | ||||||
|  | Grid : Message : 119.179180 s : Deo mflop/s per node   608844.0 | ||||||
|  | Grid : Message : 119.179190 s : ================================================================================== | ||||||
|  | Grid : Message : 119.179200 s : * SINGLE precision  | ||||||
|  | Grid : Message : 119.179200 s : ================================================================================== | ||||||
|  | Grid : Message : 119.271093 s : Deo mflop/s =   605259.7 (188.7) 591372.1-614349.7 | ||||||
|  | Grid : Message : 119.271101 s : Deo mflop/s per rank   75657.5 | ||||||
|  | Grid : Message : 119.271103 s : Deo mflop/s per node   605259.7 | ||||||
|  | Grid : Message : 119.271104 s : ================================================================================== | ||||||
|  | Grid : Message : 119.271105 s : 16^4  Deo Best  mflop/s        =   608844.0 ; 608844.0 per node  | ||||||
|  | Grid : Message : 119.271107 s : 16^4  Deo Worst mflop/s        =   605259.7 ; 605259.7 per node  | ||||||
|  | Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ;  | ||||||
|  | Grid : Message : 119.271110 s : ================================================================================== | ||||||
|  | Grid : Message : 119.275303 s : ================================================================================== | ||||||
|  | Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume  | ||||||
|  | Grid : Message : 119.275309 s : * Global volume  : 24 48 48 48  | ||||||
|  | Grid : Message : 119.275315 s : * ranks          : 8 | ||||||
|  | Grid : Message : 119.275316 s : * nodes          : 1 | ||||||
|  | Grid : Message : 119.275317 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 119.275317 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 119.275318 s : * Using 1 threads | ||||||
|  | Grid : Message : 119.275319 s : ================================================================================== | ||||||
|  | Grid : Message : 119.328765 s : Initialised RNGs | ||||||
|  | Grid : Message : 126.866160 s : ================================================================================== | ||||||
|  | Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 126.866280 s : * SINGLE precision  | ||||||
|  | Grid : Message : 126.866290 s : ================================================================================== | ||||||
|  | Grid : Message : 126.604376 s : Deo mflop/s =   1641161.6 (335.5) 1619660.5-1663961.9 | ||||||
|  | Grid : Message : 126.604392 s : Deo mflop/s per rank   205145.2 | ||||||
|  | Grid : Message : 126.604394 s : Deo mflop/s per node   1641161.6 | ||||||
|  | Grid : Message : 126.604395 s : ================================================================================== | ||||||
|  | Grid : Message : 126.604396 s : * SINGLE precision  | ||||||
|  | Grid : Message : 126.604396 s : ================================================================================== | ||||||
|  | Grid : Message : 127.829420 s : Deo mflop/s =   1620972.4 (344.9) 1602593.4-1644174.3 | ||||||
|  | Grid : Message : 127.829520 s : Deo mflop/s per rank   202621.6 | ||||||
|  | Grid : Message : 127.829530 s : Deo mflop/s per node   1620972.4 | ||||||
|  | Grid : Message : 127.829540 s : ================================================================================== | ||||||
|  | Grid : Message : 127.829550 s : 24^4  Deo Best  mflop/s        =   1641161.6 ; 1641161.6 per node  | ||||||
|  | Grid : Message : 127.829570 s : 24^4  Deo Worst mflop/s        =   1620972.4 ; 1620972.4 per node  | ||||||
|  | Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ;  | ||||||
|  | Grid : Message : 127.829600 s : ================================================================================== | ||||||
|  | Grid : Message : 127.107891 s : ================================================================================== | ||||||
|  | Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume  | ||||||
|  | Grid : Message : 127.107904 s : * Global volume  : 32 64 64 64  | ||||||
|  | Grid : Message : 127.107912 s : * ranks          : 8 | ||||||
|  | Grid : Message : 127.107913 s : * nodes          : 1 | ||||||
|  | Grid : Message : 127.107914 s : * ranks/node     : 8 | ||||||
|  | Grid : Message : 127.107914 s : * ranks geom     : 1 2 2 2  | ||||||
|  | Grid : Message : 127.107915 s : * Using 1 threads | ||||||
|  | Grid : Message : 127.107916 s : ================================================================================== | ||||||
|  | Grid : Message : 127.257116 s : Initialised RNGs | ||||||
|  | Grid : Message : 148.527930 s : ================================================================================== | ||||||
|  | Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels | ||||||
|  | Grid : Message : 148.527942 s : * SINGLE precision  | ||||||
|  | Grid : Message : 148.527943 s : ================================================================================== | ||||||
|  | Grid : Message : 149.401625 s : Deo mflop/s =   3085543.7 (956.0) 2934476.4-3115147.4 | ||||||
|  | Grid : Message : 149.401643 s : Deo mflop/s per rank   385693.0 | ||||||
|  | Grid : Message : 149.401645 s : Deo mflop/s per node   3085543.7 | ||||||
|  | Grid : Message : 149.401646 s : ================================================================================== | ||||||
|  | Grid : Message : 149.401647 s : * SINGLE precision  | ||||||
|  | Grid : Message : 149.401648 s : ================================================================================== | ||||||
|  | Grid : Message : 150.204533 s : Deo mflop/s =   3053468.5 (343.9) 3030688.8-3077255.0 | ||||||
|  | Grid : Message : 150.204540 s : Deo mflop/s per rank   381683.6 | ||||||
|  | Grid : Message : 150.204541 s : Deo mflop/s per node   3053468.5 | ||||||
|  | Grid : Message : 150.204542 s : ================================================================================== | ||||||
|  | Grid : Message : 150.204543 s : 32^4  Deo Best  mflop/s        =   3085543.7 ; 3085543.7 per node  | ||||||
|  | Grid : Message : 150.204545 s : 32^4  Deo Worst mflop/s        =   3053468.5 ; 3053468.5 per node  | ||||||
|  | Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S  | ||||||
|  | Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ;  | ||||||
|  | Grid : Message : 150.204548 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292848 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292864 s :  Summary table Ls=12 | ||||||
|  | Grid : Message : 150.292866 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292866 s : L 		 Clover 		 DWF4 		 Staggered | ||||||
|  | Grid : Message : 150.292867 s : 8 		 154914.0 		 1386335.8 		 49737.1 | ||||||
|  | Grid : Message : 150.292880 s : 12 		 693556.6 		 4208495.6 		 229778.4 | ||||||
|  | Grid : Message : 150.292882 s : 16 		 1840587.3 		 6674673.6 		 608844.0 | ||||||
|  | Grid : Message : 150.292884 s : 24 		 3933599.5 		 8572660.7 		 1641161.6 | ||||||
|  | Grid : Message : 150.292886 s : 32 		 5082758.0 		 9771387.8 		 3085543.7 | ||||||
|  | Grid : Message : 150.292888 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292888 s : ================================================================================== | ||||||
|  | Grid : Message : 150.292888 s :  Memory benchmark  | ||||||
|  | Grid : Message : 150.292888 s : ================================================================================== | ||||||
|  | Grid : Message : 150.295495 s : ================================================================================== | ||||||
|  | Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth | ||||||
|  | Grid : Message : 150.295498 s : ================================================================================== | ||||||
|  | Grid : Message : 150.295499 s :   L  		bytes			GB/s		Gflop/s		 seconds		GB/s / node | ||||||
|  | Grid : Message : 150.295500 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 160.682233 s : 8		6291456.000   		379.297		31.608		10.367		379.297 | ||||||
|  | Grid : Message : 161.851979 s : 16		100663296.000   		3754.675		312.890		1.047		3754.675 | ||||||
|  | Grid : Message : 162.458098 s : 24		509607936.000   		6521.472		543.456		0.603		6521.472 | ||||||
|  | Grid : Message : 162.924116 s : 32		1610612736.000   		8513.456		709.455		0.462		8513.456 | ||||||
|  | Grid : Message : 163.363877 s : 40		3932160000.000   		9018.902		751.575		0.436		9018.902 | ||||||
|  | Grid : Message : 163.363976 s : ================================================================================== | ||||||
|  | Grid : Message : 163.363978 s :  Batched BLAS benchmark  | ||||||
|  | Grid : Message : 163.363979 s : ================================================================================== | ||||||
|  | hipblasCreate | ||||||
|  | Grid : Message : 163.364046 s : ================================================================================== | ||||||
|  | Grid : Message : 163.364048 s : = batched GEMM (double precision)  | ||||||
|  | Grid : Message : 163.364048 s : ================================================================================== | ||||||
|  | Grid : Message : 163.364048 s :   M  		N			K		BATCH		Gflop/s / rank (coarse mrhs) | ||||||
|  | Grid : Message : 163.364049 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.438476 s : 16		8		16		256		0.565 | ||||||
|  | Grid : Message : 163.438944 s : 16		16		16		256		243.148 | ||||||
|  | Grid : Message : 163.439501 s : 16		32		16		256		440.347 | ||||||
|  | Grid : Message : 163.440003 s : 32		8		32		256		439.194 | ||||||
|  | Grid : Message : 163.440463 s : 32		16		32		256		847.334 | ||||||
|  | Grid : Message : 163.441051 s : 32		32		32		256		1430.893 | ||||||
|  | Grid : Message : 163.441679 s : 64		8		64		256		1242.757 | ||||||
|  | Grid : Message : 163.442354 s : 64		16		64		256		2196.689 | ||||||
|  | Grid : Message : 163.443196 s : 64		32		64		256		3697.458 | ||||||
|  | Grid : Message : 163.443200 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.443201 s :   M  		N			K		BATCH		Gflop/s / rank (block project) | ||||||
|  | Grid : Message : 163.443202 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.444013 s : 16		8		256		256		899.583 | ||||||
|  | Grid : Message : 163.444933 s : 16		16		256		256		1673.538 | ||||||
|  | Grid : Message : 163.446013 s : 16		32		256		256		2959.597 | ||||||
|  | Grid : Message : 163.446951 s : 32		8		256		256		1558.859 | ||||||
|  | Grid : Message : 163.447970 s : 32		16		256		256		2864.839 | ||||||
|  | Grid : Message : 163.449240 s : 32		32		256		256		4810.671 | ||||||
|  | Grid : Message : 163.450524 s : 64		8		256		256		2386.093 | ||||||
|  | Grid : Message : 163.451877 s : 64		16		256		256		4451.666 | ||||||
|  | Grid : Message : 163.453806 s : 64		32		256		256		5942.124 | ||||||
|  | Grid : Message : 163.453809 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.453810 s :   M  		N			K		BATCH		Gflop/s / rank (block promote) | ||||||
|  | Grid : Message : 163.453811 s : ---------------------------------------------------------- | ||||||
|  | Grid : Message : 163.454716 s : 8		256		16		256		799.867 | ||||||
|  | Grid : Message : 163.455690 s : 16		256		16		256		1584.625 | ||||||
|  | Grid : Message : 163.457209 s : 32		256		16		256		1949.422 | ||||||
|  | Grid : Message : 163.458254 s : 8		256		32		256		1389.417 | ||||||
|  | Grid : Message : 163.459339 s : 16		256		32		256		2668.344 | ||||||
|  | Grid : Message : 163.461158 s : 32		256		32		256		3234.162 | ||||||
|  | Grid : Message : 163.462566 s : 8		256		64		256		2150.925 | ||||||
|  | Grid : Message : 163.464066 s : 16		256		64		256		4012.488 | ||||||
|  | Grid : Message : 163.466272 s : 32		256		64		256		5154.786 | ||||||
|  | Grid : Message : 163.466276 s : ================================================================================== | ||||||
|  | Grid : Message : 163.466277 s : ================================================================================== | ||||||
|  | Grid : Message : 163.466278 s :  Communications benchmark  | ||||||
|  | Grid : Message : 163.466279 s : ================================================================================== | ||||||
|  | Grid : Message : 163.466280 s : ==================================================================================================== | ||||||
|  | Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions | ||||||
|  | Grid : Message : 163.466281 s : ==================================================================================================== | ||||||
|  | Grid : Message : 163.466281 s :  L  	 Ls  	bytes	 MB/s uni  		 MB/s bidi  | ||||||
|  | Grid : Message : 163.521339 s : 16	12	 4718592 	 122513.099		245026.198 | ||||||
|  | Grid : Message : 163.551417 s : 16	12	 4718592 	 125590.498		251180.996 | ||||||
|  | Grid : Message : 163.572339 s : 16	12	 4718592 	 180555.489		361110.977 | ||||||
|  | Grid : Message : 163.602810 s : 16	12	 4718592 	 123949.223		247898.447 | ||||||
|  | Grid : Message : 163.633041 s : 16	12	 4718592 	 124933.761		249867.523 | ||||||
|  | Grid : Message : 163.654084 s : 16	12	 4718592 	 179516.530		359033.061 | ||||||
|  | Grid : Message : 163.756280 s : 24	12	 15925248 	 127515.473		255030.946 | ||||||
|  | Grid : Message : 163.852651 s : 24	12	 15925248 	 132226.945		264453.890 | ||||||
|  | Grid : Message : 163.917510 s : 24	12	 15925248 	 196474.591		392949.183 | ||||||
|  | Grid : Message : 164.170390 s : 24	12	 15925248 	 128020.322		256040.644 | ||||||
|  | Grid : Message : 164.113321 s : 24	12	 15925248 	 132340.948		264681.896 | ||||||
|  | Grid : Message : 164.178314 s : 24	12	 15925248 	 196051.311		392102.622 | ||||||
|  | Grid : Message : 164.413983 s : 32	12	 37748736 	 129411.666		258823.333 | ||||||
|  | Grid : Message : 164.639218 s : 32	12	 37748736 	 134090.789		268181.577 | ||||||
|  | Grid : Message : 164.789675 s : 32	12	 37748736 	 200739.096		401478.191 | ||||||
|  | Grid : Message : 165.228910 s : 32	12	 37748736 	 129497.681		258995.363 | ||||||
|  | Grid : Message : 165.248096 s : 32	12	 37748736 	 134103.293		268206.586 | ||||||
|  | Grid : Message : 165.398958 s : 32	12	 37748736 	 200198.805		400397.611 | ||||||
|  | Grid : Message : 165.399411 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399413 s :  Per Node Summary table Ls=12 | ||||||
|  | Grid : Message : 165.399414 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399414 s :  L 		 Clover		 DWF4		 Staggered (GF/s per node) | ||||||
|  | Grid : Message : 165.399417 s : 8 		 154914.003 	 1386335.817 	 49737.127 | ||||||
|  | Grid : Message : 165.399423 s : 12 		 693556.579 	 4208495.611 	 229778.435 | ||||||
|  | Grid : Message : 165.399426 s : 16 		 1840587.280 	 6674673.647 	 608844.000 | ||||||
|  | Grid : Message : 165.399429 s : 24 		 3933599.545 	 8572660.656 	 1641161.613 | ||||||
|  | Grid : Message : 165.399432 s : 32 		 5082757.996 	 9771387.820 	 3085543.742 | ||||||
|  | Grid : Message : 165.399435 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399435 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399435 s :  Comparison point     result: 9172024.238 Mflop/s per node | ||||||
|  | Grid : Message : 165.399436 s :  Comparison point is 0.5*(9771387.820+8572660.656)  | ||||||
|  | Grid : Message : 165.399438 s : ================================================================================== | ||||||
|  | Grid : Message : 165.399438 s : ******************************************* | ||||||
|  | Grid : Message : 165.399438 s : ******* Grid Finalize                ****** | ||||||
|  | Grid : Message : 165.399438 s : ******************************************* | ||||||
systems/Frontier/benchmarks/benchusqcd.slurm (new executable file)
							| @@ -0,0 +1,38 @@ | |||||||
|  | #!/bin/bash -l | ||||||
|  | #SBATCH --job-name=bench | ||||||
|  | ##SBATCH --partition=small-g | ||||||
|  | ##SBATCH -q debug | ||||||
|  | #SBATCH --nodes=1 | ||||||
|  | #SBATCH --ntasks-per-node=8 | ||||||
|  | #SBATCH --cpus-per-task=7 | ||||||
|  | #SBATCH --gpus-per-node=8 | ||||||
|  | #SBATCH --time=00:30:00 | ||||||
|  | #SBATCH --account=phy157_dwf | ||||||
|  | #SBATCH --gpu-bind=none | ||||||
|  | #SBATCH --exclusive | ||||||
|  | #SBATCH --mem=0 | ||||||
|  |  | ||||||
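|  | # select_gpu: per-rank wrapper that pins each local MPI rank to one GCD (via HIP_VISIBLE_DEVICES) | ||||||
|  | # and to that GCD's nearest NUMA domain (via numactl) | ||||||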
|  | cat << EOF > select_gpu | ||||||
|  | #!/bin/bash | ||||||
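|  | # GPU_MAP/NUMA_MAP encode the Frontier node's GCD-to-NUMA affinity: GCDs 0-1 sit on NUMA 3, | ||||||
|  | # 2-3 on NUMA 1, 6-7 on NUMA 2 and 4-5 on NUMA 0, so each rank's memory stays local to its GPU | ||||||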
|  | export GPU_MAP=(0 1 2 3 7 6 5 4) | ||||||
|  | export NUMA_MAP=(3 3 1 1 2 2 0 0) | ||||||
|  | export GPU=\${GPU_MAP[\$SLURM_LOCALID]} | ||||||
|  | export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} | ||||||
|  | export HIP_VISIBLE_DEVICES=\$GPU | ||||||
|  | unset ROCR_VISIBLE_DEVICES | ||||||
|  | echo RANK \$SLURM_LOCALID using GPU \$GPU     | ||||||
|  | exec numactl -m \$NUMA -N \$NUMA \$* | ||||||
|  | EOF | ||||||
|  |  | ||||||
|  | chmod +x ./select_gpu | ||||||
|  |  | ||||||
|  | root=$HOME/Frontier/Grid/systems/Frontier/ | ||||||
|  | source ${root}/sourceme.sh | ||||||
|  |  | ||||||
|  | export OMP_NUM_THREADS=7 | ||||||
|  | export MPICH_GPU_SUPPORT_ENABLED=1 | ||||||
|  | #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM | ||||||
|  |  | ||||||
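|  | # One rank per GCD; --shm 4096 gives Grid a 4096 MB shared-memory segment for intra-node comms | ||||||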
|  | srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0  > Benchmark_usqcd.log | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -3,7 +3,7 @@ spack load c-lime | |||||||
| module load emacs  | module load emacs  | ||||||
| module load PrgEnv-gnu | module load PrgEnv-gnu | ||||||
| module load rocm | module load rocm | ||||||
| module load cray-mpich/8.1.23 | module load cray-mpich | ||||||
| module load gmp | module load gmp | ||||||
| module load cray-fftw | module load cray-fftw | ||||||
| module load craype-accel-amd-gfx90a | module load craype-accel-amd-gfx90a | ||||||
|   | |||||||
| @@ -2,11 +2,11 @@ | |||||||
|     --enable-comms=mpi \ |     --enable-comms=mpi \ | ||||||
|     --enable-simd=GPU \ |     --enable-simd=GPU \ | ||||||
|     --enable-shm=nvlink \ |     --enable-shm=nvlink \ | ||||||
|     --enable-gen-simd-width=64 \ |  | ||||||
|     --enable-accelerator=cuda \ |     --enable-accelerator=cuda \ | ||||||
|  |     --enable-gen-simd-width=64 \ | ||||||
|  |     --disable-gparity \ | ||||||
|     --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ |     --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ | ||||||
|     --enable-accelerator-cshift \ |  | ||||||
|     --disable-unified \ |     --disable-unified \ | ||||||
|     CXX=nvcc \ |     CXX=nvcc \ | ||||||
|     LDFLAGS="-cudart shared " \ |     LDFLAGS="-cudart shared -lcublas " \ | ||||||
|     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" |     CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611" | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1   | module load cuda/12.3  | ||||||
| #module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1 | module load ucx/1.15.0-cuda12.3   | ||||||
| export PREFIX=/home/tc002/tc002/shared/env/prefix/ | module load openmpi/4.1.5-cuda12.3 | ||||||
| export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH | source /home/dp207/dp207/shared/env/production/env-base.sh  | ||||||
|  | source /home/dp207/dp207/shared/env/production/env-gpu.sh  | ||||||
| unset SBATCH_EXPORT | unset SBATCH_EXPORT | ||||||
|  |  | ||||||
|   | |||||||
| @@ -142,7 +142,9 @@ int main (int argc, char ** argv) | |||||||
|   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl; |   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl; | ||||||
|   int iters; |   int iters; | ||||||
|  |  | ||||||
|  |   time_t now; | ||||||
|   time_t start = time(NULL); |   time_t start = time(NULL); | ||||||
|  |   UGrid->Broadcast(0,(void *)&start,sizeof(start)); | ||||||
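|  |   // Broadcast rank 0's clock so every rank shares one start time and exits the timed loops together | ||||||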
|  |  | ||||||
|   FlightRecorder::ContinueOnFail = 0; |   FlightRecorder::ContinueOnFail = 0; | ||||||
|   FlightRecorder::PrintEntireLog = 0; |   FlightRecorder::PrintEntireLog = 0; | ||||||
| @@ -162,9 +164,9 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; |     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||||
|     result_o = Zero(); |     result_o = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond();  | ||||||
|     mCG(src_o,result_o); |     mCG(src_o,result_o); | ||||||
|     t2=usecond(); |     t2=usecond();  | ||||||
|     iters = mCG.TotalInnerIterations; //Number of inner CG iterations |     iters = mCG.TotalInnerIterations; //Number of inner CG iterations | ||||||
|     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; |     flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; | ||||||
|     flops+= CGsiteflops*FrbGrid->gSites()*iters; |     flops+= CGsiteflops*FrbGrid->gSites()*iters; | ||||||
| @@ -176,7 +178,8 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|     iter ++; |     iter ++; | ||||||
|   } while (time(NULL) < (start + nsecs/10) ); |     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||||
|  |   } while (now < (start + nsecs/10) ); | ||||||
|      |      | ||||||
|   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; |   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; | ||||||
|   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); | ||||||
| @@ -189,7 +192,7 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; |     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||||
|     result_o_2 = Zero(); |     result_o_2 = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond();  | ||||||
|     CG(HermOpEO,src_o,result_o_2); |     CG(HermOpEO,src_o,result_o_2); | ||||||
|     t2=usecond(); |     t2=usecond(); | ||||||
|     iters = CG.IterationsToComplete; |     iters = CG.IterationsToComplete; | ||||||
| @@ -201,8 +204,9 @@ int main (int argc, char ** argv) | |||||||
|     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; |     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl; | ||||||
|     assert(FlightRecorder::ErrorCount()==0); |     assert(FlightRecorder::ErrorCount()==0); | ||||||
|     std::cout << " FlightRecorder is OK! "<<std::endl; |     std::cout << " FlightRecorder is OK! "<<std::endl; | ||||||
|  |     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); | ||||||
|     i++; |     i++; | ||||||
|   } while (time(NULL) < (start + nsecs) ); |   } while (now < (start + nsecs) ); | ||||||
|  |  | ||||||
|   LatticeFermionD diff_o(FrbGrid); |   LatticeFermionD diff_o(FrbGrid); | ||||||
|   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); |   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); | ||||||
|   | |||||||
tests/debug/Test_8888.cc (new file)
							| @@ -0,0 +1,118 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/debug/Test_8888.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2023 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <pboyle@bnl.gov> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
|  | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   const int Ls=8; | ||||||
|  |   const int nbasis = 40; | ||||||
|  |   const int cb = 0 ; | ||||||
|  |   RealD mass=0.01; | ||||||
|  |   RealD M5=1.8; | ||||||
|  |   RealD b=1.0; | ||||||
|  |   RealD c=0.0; | ||||||
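|  |   // Mobius with b=1, c=0 reduces to the plain Shamir domain wall kernel | ||||||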
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||||
|  | 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 								   GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   ///////////////////////// RNGs ///////////////////////////////// | ||||||
|  |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |   std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |   std::vector<int> cseeds({5,6,7,8}); | ||||||
|  |  | ||||||
|  |   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |  | ||||||
|  |   ///////////////////////// Configuration ///////////////////////////////// | ||||||
|  |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|  |   FieldMetaData header; | ||||||
|  |   std::string file("ckpoint_EODWF_lat.125"); | ||||||
|  |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|  |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
|  |   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||||
|  |  | ||||||
|  |   MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf); | ||||||
|  |  | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Fine Power method            "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   LatticeFermionD pm_src(FGrid); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<LatticeFermionD>       fPM; | ||||||
|  |   fPM(HermOp,pm_src); | ||||||
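|  |   // Power method estimates the largest eigenvalue of MdagM; it should lie within the Chebyshev window chosen below | ||||||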
|  |  | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Fine Lanczos  (poly, low)    "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |    | ||||||
|  |   int Nk=80; | ||||||
|  |   int Nm=Nk*3; | ||||||
|  |   int Nstop=8; | ||||||
|  |   int Nconv_test_interval=1; | ||||||
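|  |   // Seek Nk converged vectors, with a restart space of Nm = 3*Nk | ||||||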
|  |    | ||||||
|  |   //  Chebyshev<LatticeFermionD>      IRLChebyLo(0.2,64.0,201);  // 1 iter | ||||||
|  |   Chebyshev<LatticeFermionD>      IRLChebyLo(0.0,55.0,101);  // 1 iter | ||||||
|  |   FunctionHermOp<LatticeFermionD>    PolyOp(IRLChebyLo,HermOp); | ||||||
|  |   PlainHermOp<LatticeFermionD>          Op(HermOp); | ||||||
|  |  | ||||||
|  |   ImplicitlyRestartedLanczos IRL(PolyOp, | ||||||
|  | 				 Op, | ||||||
|  | 				 Nk, // sought vecs | ||||||
|  | 				 Nk, // sought vecs | ||||||
|  | 				 Nm, // spare vecs | ||||||
|  | 				 1.0e-8, | ||||||
|  | 				 10 // Max iterations | ||||||
|  | 				 ); | ||||||
|  |  | ||||||
|  |   int Nconv; | ||||||
|  |   std::vector<RealD>            eval(Nm); | ||||||
|  |   std::vector<LatticeFermionD>     evec(Nm,FGrid); | ||||||
|  |   LatticeFermionD     irl_src(FGrid); | ||||||
|  |   gaussian(RNG5,irl_src); // the Lanczos start vector must be non-zero | ||||||
|  |  | ||||||
|  |   IRL.calc(eval,evec,irl_src,Nconv); | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
| @@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf, | |||||||
| 		   GridParallelRNG *RNG5) | 		   GridParallelRNG *RNG5) | ||||||
| { | { | ||||||
|   LatticeFermion src   (FGrid); random(*RNG5,src); |   LatticeFermion src   (FGrid); random(*RNG5,src); | ||||||
|   LatticeFermion result(FGrid); result=Zero(); |   LatticeFermion result1(FGrid); result1=Zero(); | ||||||
|  |   LatticeFermion result2(FGrid); result2=Zero(); | ||||||
|  |   LatticeFermion result3(FGrid); result3=Zero(); | ||||||
|  |  | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); |   ConjugateGradient<LatticeFermion> CG(1.0e-8,10000); | ||||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); |   SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG); | ||||||
|   SchurSolver(Ddwf,src,result); |   SchurSolver(Ddwf,src,result1); | ||||||
|  |  | ||||||
|  |   SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG); | ||||||
|  |   SchurSolverSymm1(Ddwf,src,result2); | ||||||
|  |  | ||||||
|  |   SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG); | ||||||
|  |   SchurSolverSymm2(Ddwf,src,result3); | ||||||
|  |  | ||||||
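|  |   // All three red-black Schur factorisations solve the same system; the diffs below should sit at the 1e-8 CG tolerance | ||||||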
|  |   std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl;  | ||||||
|  |   result2=result2-result1; | ||||||
|  |   std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl;  | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl;  | ||||||
|  |   result3=result3-result1; | ||||||
|  |   std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl;  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -244,7 +244,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |  | ||||||
|    | #if 0   | ||||||
|   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); |   MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); | ||||||
|   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; |   typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; | ||||||
|    |    | ||||||
| @@ -307,7 +307,8 @@ int main (int argc, char ** argv) | |||||||
|     rh_res= Zero(); |     rh_res= Zero(); | ||||||
|     mrhsCG(MrhsCoarseOp,rh_src,rh_res); |     mrhsCG(MrhsCoarseOp,rh_src,rh_res); | ||||||
|   } |   } | ||||||
|    |  | ||||||
|  | #endif | ||||||
|   std::cout<<GridLogMessage<<std::endl; |   std::cout<<GridLogMessage<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<std::endl; |   std::cout<<GridLogMessage<<std::endl; | ||||||
|   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; |   std::cout<<GridLogMessage<<"*******************************************"<<std::endl; | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
|     /************************************************************************************* | /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
| @@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/Grid.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/lattice/PaddedCell.h> | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
| #include <Grid/stencil/GeneralLocalStencil.h> | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
| //#include <Grid/algorithms/GeneralCoarsenedMatrix.h> | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
| #include <Grid/algorithms/iterative/AdefGeneric.h> |  | ||||||
|  |  | ||||||
| using namespace std; | using namespace std; | ||||||
| using namespace Grid; | using namespace Grid; | ||||||
|  |  | ||||||
| template<class Coarsened> |  | ||||||
| void SaveOperator(Coarsened &Operator,std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   ScidacWriter WR(Operator.Grid()->IsBoss()); |  | ||||||
|   assert(Operator._A.size()==Operator.geom.npoint); |  | ||||||
|   WR.open(file); |  | ||||||
|   for(int p=0;p<Operator._A.size();p++){ |  | ||||||
|     auto tmp = Operator.Cell.Extract(Operator._A[p]); |  | ||||||
|     WR.writeScidacFieldRecord(tmp,record); |  | ||||||
|   } |  | ||||||
|   WR.close(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| template<class Coarsened> |  | ||||||
| void LoadOperator(Coarsened &Operator,std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   Grid::ScidacReader RD ; |  | ||||||
|   RD.open(file); |  | ||||||
|   assert(Operator._A.size()==Operator.geom.npoint); |  | ||||||
|   for(int p=0;p<Operator.geom.npoint;p++){ |  | ||||||
|     conformable(Operator._A[p].Grid(),Operator.CoarseGrid()); |  | ||||||
|     RD.readScidacFieldRecord(Operator._A[p],record); |  | ||||||
|   }     |  | ||||||
|   RD.close(); |  | ||||||
|   Operator.ExchangeCoarseLinks(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| template<class aggregation> |  | ||||||
| void SaveBasis(aggregation &Agg,std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   ScidacWriter WR(Agg.FineGrid->IsBoss()); |  | ||||||
|   WR.open(file); |  | ||||||
|   for(int b=0;b<Agg.subspace.size();b++){ |  | ||||||
|     WR.writeScidacFieldRecord(Agg.subspace[b],record); |  | ||||||
|   } |  | ||||||
|   WR.close(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| template<class aggregation> |  | ||||||
| void LoadBasis(aggregation &Agg, std::string file) |  | ||||||
| { |  | ||||||
| #ifdef HAVE_LIME |  | ||||||
|   emptyUserRecord record; |  | ||||||
|   ScidacReader RD ; |  | ||||||
|   RD.open(file); |  | ||||||
|   for(int b=0;b<Agg.subspace.size();b++){ |  | ||||||
|     RD.readScidacFieldRecord(Agg.subspace[b],record); |  | ||||||
|   }     |  | ||||||
|   RD.close(); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Field> class TestSolver : public LinearFunction<Field> { |  | ||||||
| public: |  | ||||||
|   TestSolver() {}; |  | ||||||
|   void operator() (const Field &in, Field &out){    out = Zero();  }      |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| RealD InverseApproximation(RealD x){ |  | ||||||
|   return 1.0/x; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Want Op in CoarsenOp to call MatPcDagMatPc | // Want Op in CoarsenOp to call MatPcDagMatPc | ||||||
| template<class Field> | template<class Field> | ||||||
| class HermOpAdaptor : public LinearOperatorBase<Field> | class HermOpAdaptor : public LinearOperatorBase<Field> | ||||||
| @@ -119,33 +48,37 @@ public: | |||||||
|   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; |   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||||
|   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } |   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||||
| }; | }; | ||||||
| template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> |  | ||||||
|  | template<class Field> class CGSmoother : public LinearFunction<Field> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   using LinearFunction<Field>::operator(); |   using LinearFunction<Field>::operator(); | ||||||
|   typedef LinearOperatorBase<Field> FineOperator; |   typedef LinearOperatorBase<Field> FineOperator; | ||||||
|   FineOperator   & _SmootherOperator; |   FineOperator   & _SmootherOperator; | ||||||
|   Chebyshev<Field> Cheby; |   int iters; | ||||||
|   ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) : |   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||||
|     _SmootherOperator(SmootherOperator), |     _SmootherOperator(SmootherOperator), | ||||||
|     Cheby(_lo,_hi,_ord,InverseApproximation) |     iters(_iters) | ||||||
|   { |   { | ||||||
|     std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl; |     std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl; | ||||||
|   }; |   }; | ||||||
|   void operator() (const Field &in, Field &out)  |   void operator() (const Field &in, Field &out)  | ||||||
|   { |   { | ||||||
|     Field tmp(in.Grid()); |     ConjugateGradient<Field>  CG(0.0,iters,false); // non-convergence is fine in a smoother | ||||||
|     tmp = in; |  | ||||||
|     Cheby(_SmootherOperator,tmp,out); |     out=Zero(); | ||||||
|  |  | ||||||
|  |     CG(_SmootherOperator,in,out); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
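Editor's note: the replacement smoother is a fixed-cost inner CG: with tolerance 0.0 and the convergence check disabled, it always runs exactly iters iterations. Later in this patch it is applied to a shifted fine operator; the wiring, for reference (the shift value mirrors the lo used below):

    RealD MirsShift = 2.0;   // plays the role of the old Chebyshev window's low edge
    ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
    CGSmoother<LatticeFermionD> CGsmooth(7,ShiftedFineHermOp);  // 7 CG iterations per application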
|  |  | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   const int Ls=24; |   const int Ls=24; | ||||||
|   const int nbasis = 40; |   const int nbasis = 60; | ||||||
|   const int cb = 0 ; |   const int cb = 0 ; | ||||||
|   RealD mass=0.00078; |   RealD mass=0.00078; | ||||||
|   RealD M5=1.8; |   RealD M5=1.8; | ||||||
| @@ -160,10 +93,12 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|   // Construct a coarsened grid with 4^4 cell |   // Construct a coarsened grid with 4^4 cell | ||||||
|  |   Coordinate Block({4,4,4,4}); | ||||||
|   Coordinate clatt = GridDefaultLatt(); |   Coordinate clatt = GridDefaultLatt(); | ||||||
|   for(int d=0;d<clatt.size();d++){ |   for(int d=0;d<clatt.size();d++){ | ||||||
|     clatt[d] = clatt[d]/4; |     clatt[d] = clatt[d]/Block[d]; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, |   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||||
| 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
| 							    GridDefaultMpi());; | 							    GridDefaultMpi());; | ||||||
| @@ -182,7 +117,7 @@ int main (int argc, char ** argv) | |||||||
|   LatticeGaugeField Umu(UGrid); |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|   FieldMetaData header; |   FieldMetaData header; | ||||||
|   std::string file("ckpoint_lat.4000"); |   std::string file("ckpoint_EODWF_lat.125"); | ||||||
|   NerscIO::readConfiguration(Umu,header,file); |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|   //////////////////////// Fermion action ////////////////////////////////// |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
| @@ -192,15 +127,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; |   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||||
|   HermFineMatrix FineHermOp(HermOpEO); |   HermFineMatrix FineHermOp(HermOpEO); | ||||||
|    |  | ||||||
|   LatticeFermion result(FrbGrid); result=Zero(); |  | ||||||
|  |  | ||||||
|   LatticeFermion    src(FrbGrid); random(RNG5,src); |  | ||||||
|  |  | ||||||
|   // Run power method on FineHermOp |  | ||||||
|   PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src); |  | ||||||
|  |  | ||||||
|   |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   ///////////// Coarse basis and Little Dirac Operator /////// |   ///////////// Coarse basis and Little Dirac Operator /////// | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| @@ -208,219 +135,170 @@ int main (int argc, char ** argv) | |||||||
|   typedef LittleDiracOperator::CoarseVector CoarseVector; |   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||||
|  |  | ||||||
|   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); |   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||||
|   NearestStencilGeometry5D geom_nn(Coarse5d); |  | ||||||
|    |  | ||||||
|   // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp |  | ||||||
|   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; |   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||||
|   Subspace Aggregates(Coarse5d,FrbGrid,cb); |   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Need to check about red-black grid coarsening |   // Need to check about red-black grid coarsening | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); |  | ||||||
|  |  | ||||||
|   bool load=false; |   int refine=1; | ||||||
|   if ( load ) { |     //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, | ||||||
|     LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); |     //    					0.0003,1.0e-5,2000); // Lo, tol, maxit | ||||||
|     LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); |     //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run | ||||||
|   } else { |   std::cout << "**************************************"<<std::endl; | ||||||
|     Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, |   std::cout << "Create Subspace"<<std::endl; | ||||||
| 				       95.0,0.1, |   std::cout << "**************************************"<<std::endl; | ||||||
| 				       //				     400,200,200 -- 48 iters |   Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||||
| 				       //				     600,200,200 -- 38 iters, 162s |  | ||||||
| 				       //				     600,200,100 -- 38 iters, 169s |  | ||||||
| 				       //				     600,200,50  -- 88 iters. 370s  |  | ||||||
| 				       800, |  | ||||||
| 				       200, |  | ||||||
| 				       100, |  | ||||||
| 				       0.0); |  | ||||||
|     LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); |  | ||||||
|     SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); |  | ||||||
|     SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Try projecting to one hop only |  | ||||||
|   LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d); |  | ||||||
|   LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n |  | ||||||
|  |  | ||||||
|   typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix; |   std::cout << "**************************************"<<std::endl; | ||||||
|   HermMatrix CoarseOp     (LittleDiracOp); |   std::cout << "Refine Subspace"<<std::endl; | ||||||
|   HermMatrix CoarseOpProj (LittleDiracOpProj); |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters | ||||||
|    |    | ||||||
|   ////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   // Build a coarse lanczos |   std::cout << "Coarsen after refine"<<std::endl; | ||||||
|   ////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter |   Aggregates.Orthogonalise(); | ||||||
|   FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp); |  | ||||||
|   PlainHermOp<CoarseVector>    IRLOp    (CoarseOp); |   std::cout << "**************************************"<<std::endl; | ||||||
|   int Nk=48; |   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||||
|   int Nm=64; |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true); | ||||||
|  |      | ||||||
|  |   const int nrhs=12; | ||||||
|  |      | ||||||
|  |   Coordinate mpi=GridDefaultMpi(); | ||||||
|  |   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||||
|  |   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||||
|  |   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||||
|  |      | ||||||
|  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||||
|  |   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||||
|  |  | ||||||
|  |   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||||
|  |   Chebyshev<CoarseVector>      IRLCheby(0.01,42.0,301);  // 1 iter | ||||||
|  |   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||||
|  |  | ||||||
|  |   CoarseVector pm_src(CoarseMrhs); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<CoarseVector>       cPM; | ||||||
|  |   cPM(MrhsCoarseOp,pm_src); | ||||||
|  |  | ||||||
|  |   int Nk=192; | ||||||
|  |   int Nm=384; | ||||||
|   int Nstop=Nk; |   int Nstop=Nk; | ||||||
|   ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20); |   int Nconv_test_interval=1; | ||||||
|  |    | ||||||
|  |   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||||
|  | 							  Coarse5d, | ||||||
|  | 							  CoarseMrhs, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  IRLCheby, | ||||||
|  | 							  Nstop, | ||||||
|  | 							  Nconv_test_interval, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  Nk, | ||||||
|  | 							  Nm, | ||||||
|  | 							  1e-5,10); | ||||||
|  |  | ||||||
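Editor's annotation of the constructor call above; the parameter roles are inferred from the call site rather than from class documentation:

    // ImplicitlyRestartedBlockLanczosCoarse(Op,          multi-RHS coarse operator
    //                                       Coarse5d,    single-RHS coarse grid
    //                                       CoarseMrhs,  packed multi-RHS grid
    //                                       nrhs,        block width
    //                                       IRLCheby,    spectral filter
    //                                       Nstop, Nconv_test_interval,
    //                                       nrhs, Nk, Nm,
    //                                       tol=1e-5, MaxRestarts=10)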
|   int Nconv; |   int Nconv; | ||||||
|   std::vector<RealD>            eval(Nm); |   std::vector<RealD>            eval(Nm); | ||||||
|   std::vector<CoarseVector>     evec(Nm,Coarse5d); |   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||||
|   CoarseVector c_src(Coarse5d); |   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||||
|   //c_src=1.0; |  | ||||||
|   random(CRNG,c_src); |  | ||||||
|  |  | ||||||
|   CoarseVector c_res(Coarse5d);  |  | ||||||
|   CoarseVector c_ref(Coarse5d);  |  | ||||||
|  |  | ||||||
|   PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src); |  | ||||||
|  |  | ||||||
|   IRL.calc(eval,evec,c_src,Nconv); |  | ||||||
|   DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   // Build a coarse space solver |   // Block projector for coarse/fine | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   int maxit=20000; |  | ||||||
|   ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false); |  | ||||||
|   ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false); |  | ||||||
|   ZeroGuesser<CoarseVector> CoarseZeroGuesser; |  | ||||||
|  |  | ||||||
|   //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser); |   std::cout << "**************************************"<<std::endl; | ||||||
|   HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser); |   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||||
|   c_res=Zero(); |   std::cout << "**************************************"<<std::endl; | ||||||
|   HPDSolve(c_src,c_res); c_ref = c_res; |   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||||
|   std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl; |   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||||
|   ////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Deflated (with real op EV's) solve for the projected coarse op |  | ||||||
|   // Work towards ADEF1 in the coarse space |  | ||||||
|   ////////////////////////////////////////////////////////////////////////// |  | ||||||
|   HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser); |  | ||||||
|   c_res=Zero(); |  | ||||||
|   HPDSolveProj(c_src,c_res); |  | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |  | ||||||
|   std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl; |  | ||||||
|   c_res = c_res - c_ref; |  | ||||||
|   std::cout << "Projected solver error "<<norm2(c_res)<<std::endl; |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   // Coarse ADEF1 with deflation space |   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||||
|   ////////////////////////////////////////////////////////////////////// |   std::cout << "**************************************"<<std::endl; | ||||||
|   ChebyshevSmoother<CoarseVector,HermMatrix > |   evec.resize(Nm,Coarse5d); | ||||||
|     CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence |   eval.resize(Nm); | ||||||
|     //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  // |   for(int r=0;r<nrhs;r++){ | ||||||
|   //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s |     random(CRNG,c_src[r]); | ||||||
|   //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s |  | ||||||
|   //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter |  | ||||||
|   //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter |  | ||||||
|   //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311 |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // CG, Cheby mode spacing 200,200 |  | ||||||
|   // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s |  | ||||||
|   // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s |  | ||||||
|   // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain |  | ||||||
|   // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s |  | ||||||
|   // HDCG 38 iters 162s |  | ||||||
|   // |  | ||||||
|   // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs  |  | ||||||
|   // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain |  | ||||||
|   // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s |  | ||||||
|   // HDCG 38 iters 169s |  | ||||||
|  |  | ||||||
|   TwoLevelADEF1defl<CoarseVector> |  | ||||||
|     cADEF1(1.0e-8, 500, |  | ||||||
| 	   CoarseOp, |  | ||||||
| 	   CoarseSmoother, |  | ||||||
| 	   evec,eval); |  | ||||||
|  |  | ||||||
|   c_res=Zero(); |  | ||||||
|   cADEF1(c_src,c_res); |  | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |  | ||||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; |  | ||||||
|   c_res = c_res - c_ref; |  | ||||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; |  | ||||||
|    |  | ||||||
|   //  cADEF1.Tolerance = 4.0e-2; |  | ||||||
|   //  cADEF1.Tolerance = 1.0e-1; |  | ||||||
|   cADEF1.Tolerance = 5.0e-2; |  | ||||||
|   c_res=Zero(); |  | ||||||
|   cADEF1(c_src,c_res); |  | ||||||
|   std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl; |  | ||||||
|   std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl; |  | ||||||
|   c_res = c_res - c_ref; |  | ||||||
|   std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl; |  | ||||||
|    |  | ||||||
|   ////////////////////////////////////////// |  | ||||||
|   // Build a smoother |  | ||||||
|   ////////////////////////////////////////// |  | ||||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499 |  | ||||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383 |  | ||||||
|   //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328 |  | ||||||
|   //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1 |  | ||||||
|   //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations |  | ||||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters |  | ||||||
|   //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters |  | ||||||
|  |  | ||||||
|   // |  | ||||||
|   // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40 |  | ||||||
|   // Need to measure cost of coarse space. |  | ||||||
|   // |  | ||||||
|   // -- i) Reduce coarse residual   -- 0.04 |  | ||||||
|   // -- ii) Lanczos on coarse space -- done |  | ||||||
|   // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and |  | ||||||
|   //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec |  | ||||||
|   // |  | ||||||
|   std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters |  | ||||||
|  |  | ||||||
|   //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) |  | ||||||
|   std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)   |  | ||||||
|  |  | ||||||
|   for(int l=0;l<los.size();l++){ |  | ||||||
|  |  | ||||||
|     RealD lo = los[l]; |  | ||||||
|  |  | ||||||
|     for(int o=0;o<ords.size();o++){ |  | ||||||
|  |  | ||||||
|       ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false); |  | ||||||
|       HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); |  | ||||||
|        |  | ||||||
|       //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case |  | ||||||
|       ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311 |  | ||||||
|  |  | ||||||
|       ////////////////////////////////////////// |  | ||||||
|       // Build a HDCG solver |  | ||||||
|       ////////////////////////////////////////// |  | ||||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> |  | ||||||
| 	HDCG(1.0e-8, 100, |  | ||||||
| 	     FineHermOp, |  | ||||||
| 	     Smoother, |  | ||||||
| 	     HPDSolveSloppy, |  | ||||||
| 	     HPDSolve, |  | ||||||
| 	     Aggregates); |  | ||||||
|  |  | ||||||
|       TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace> |  | ||||||
| 	HDCGdefl(1.0e-8, 100, |  | ||||||
| 		 FineHermOp, |  | ||||||
| 		 Smoother, |  | ||||||
| 		 cADEF1, |  | ||||||
| 		 HPDSolve, |  | ||||||
| 		 Aggregates); |  | ||||||
|        |  | ||||||
|       result=Zero(); |  | ||||||
|       HDCGdefl(src,result); |  | ||||||
|  |  | ||||||
|       result=Zero(); |  | ||||||
|       HDCG(src,result); |  | ||||||
|  |  | ||||||
|        |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Standard CG |   IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl); | ||||||
|   result=Zero(); |  | ||||||
|   CGfine(HermOpEO, src, result); |   /////////////////////// | ||||||
|  |   // Deflation guesser object | ||||||
|  |   /////////////////////// | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << " Reimport coarse evecs  "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||||
|  |   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||||
|  |        | ||||||
|  |   ////////////////////////// | ||||||
|  |   // Extra HDCG parameters | ||||||
|  |   ////////////////////////// | ||||||
|  |   int maxit=3000; | ||||||
|  |   ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false); | ||||||
|  |   RealD lo=2.0; | ||||||
|  |   int ord = 7; | ||||||
|  |  | ||||||
|  |   DoNothingGuesser<CoarseVector> DoNothing; | ||||||
|  |   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   // Mirs smoother | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   RealD MirsShift = lo; | ||||||
|  |   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||||
|  |   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||||
|  |  | ||||||
|  |   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||||
|  |     HDCGmrhs(1.0e-8, 500, | ||||||
|  | 	     FineHermOp, | ||||||
|  | 	     CGsmooth, | ||||||
|  | 	     HPDSolveMrhs,    // Used in M1 | ||||||
|  | 	     HPDSolveMrhs,          // Used in Vstart | ||||||
|  | 	     MrhsProjector, | ||||||
|  | 	     MrhsGuesser, | ||||||
|  | 	     CoarseMrhs); | ||||||
|  |      | ||||||
|  |   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||||
|  |   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||||
|    |    | ||||||
|  |   for(int r=0;r<nrhs;r++){ | ||||||
|  |     random(RNG5,src_mrhs[r]); | ||||||
|  |     res_mrhs[r]=Zero(); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   HDCGmrhs(src_mrhs,res_mrhs); | ||||||
|  |  | ||||||
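Editor's note: a hedged post-solve check that could be appended here (not in the patch) applies the fine operator to each returned solution and reports the true residual:

    for(int r=0;r<nrhs;r++){
      LatticeFermionD Ax(FrbGrid);
      FineHermOp.HermOp(res_mrhs[r],Ax);   // A x
      Ax = Ax - src_mrhs[r];               // A x - b
      std::cout << GridLogMessage << " rhs "<<r<<" true residual "
                << std::sqrt(norm2(Ax)/norm2(src_mrhs[r])) << std::endl;
    }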
|  |   // Standard CG | ||||||
|  | #if 1 | ||||||
|  |   { | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Calling red black CG"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |        | ||||||
|  |     LatticeFermion result(FrbGrid); result=Zero(); | ||||||
|  |     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||||
|  |     result=Zero(); | ||||||
|  |  | ||||||
|  |     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||||
|  |     CGfine(HermOpEO, src, result); | ||||||
|  |   } | ||||||
|  | #endif   | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|   return 0; |   return 0; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -145,7 +145,7 @@ int main (int argc, char ** argv) | |||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   const int Ls=24; |   const int Ls=24; | ||||||
|   const int nbasis = 60; |   const int nbasis = 62; | ||||||
|   const int cb = 0 ; |   const int cb = 0 ; | ||||||
|   RealD mass=0.00078; |   RealD mass=0.00078; | ||||||
|   RealD M5=1.8; |   RealD M5=1.8; | ||||||
| @@ -160,7 +160,7 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|   // Construct a coarsened grid with 4^4 cell |   // Construct a coarsened grid with 4^4 cell | ||||||
|   Coordinate Block({4,4,4,4}); |   Coordinate Block({4,4,6,4}); | ||||||
|   Coordinate clatt = GridDefaultLatt(); |   Coordinate clatt = GridDefaultLatt(); | ||||||
|   for(int d=0;d<clatt.size();d++){ |   for(int d=0;d<clatt.size();d++){ | ||||||
|     clatt[d] = clatt[d]/Block[d]; |     clatt[d] = clatt[d]/Block[d]; | ||||||
|   | |||||||
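Editor's note: the hunk above swaps the blocking from {4,4,4,4} to {4,4,6,4}; the coarse extent in each direction is the fine extent divided by the block. A worked example on a hypothetical 48.48.48.96 global lattice:

    // clatt[d] = GridDefaultLatt()[d] / Block[d]
    // fine {48,48,48,96} / Block {4,4,6,4}  ->  coarse {12,12,8,24}
    // each coarse site aggregates a 4x4x6x4 cell of fine sites,
    // with nbasis=62 vectors spanning the near-null space per cell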
396  tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc  Normal file
| @@ -0,0 +1,396 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./tests/Test_general_coarse_hdcg.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2023 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <pboyle@bnl.gov> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h> | ||||||
|  | #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h> | ||||||
|  | #include <Grid/algorithms/iterative/AdefMrhs.h> | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  |  | ||||||
|  | template<class aggregation> | ||||||
|  | void SaveBasis(aggregation &Agg,std::string file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacWriter WR(Agg.FineGrid->IsBoss()); | ||||||
|  |   WR.open(file); | ||||||
|  |   for(int b=0;b<Agg.subspace.size();b++){ | ||||||
|  |     WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||||
|  |     //    WR.writeScidacFieldRecord(Agg.subspace[b],record); | ||||||
|  |   } | ||||||
|  |   WR.close(); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | template<class aggregation> | ||||||
|  | void LoadBasis(aggregation &Agg, std::string file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacReader RD ; | ||||||
|  |   RD.open(file); | ||||||
|  |   for(int b=0;b<Agg.subspace.size();b++){ | ||||||
|  |     RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); | ||||||
|  |     //    RD.readScidacFieldRecord(Agg.subspace[b],record,0); | ||||||
|  |   }     | ||||||
|  |   RD.close(); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | template<class CoarseVector> | ||||||
|  | void SaveEigenvectors(std::vector<RealD>            &eval, | ||||||
|  | 		      std::vector<CoarseVector>     &evec, | ||||||
|  | 		      std::string evec_file, | ||||||
|  | 		      std::string eval_file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacWriter WR(evec[0].Grid()->IsBoss()); | ||||||
|  |   WR.open(evec_file); | ||||||
|  |   for(int b=0;b<evec.size();b++){ | ||||||
|  |     WR.writeScidacFieldRecord(evec[b],record,0,0); | ||||||
|  |   } | ||||||
|  |   WR.close(); | ||||||
|  |   XmlWriter WRx(eval_file); | ||||||
|  |   write(WRx,"evals",eval); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | template<class CoarseVector> | ||||||
|  | void LoadEigenvectors(std::vector<RealD>            &eval, | ||||||
|  | 		      std::vector<CoarseVector>     &evec, | ||||||
|  | 		      std::string evec_file, | ||||||
|  | 		      std::string eval_file) | ||||||
|  | { | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |     XmlReader RDx(eval_file); | ||||||
|  |     read(RDx,"evals",eval); | ||||||
|  |     emptyUserRecord record; | ||||||
|  |  | ||||||
|  |     Grid::ScidacReader RD ; | ||||||
|  |     RD.open(evec_file); | ||||||
|  |     assert(evec.size()==eval.size()); | ||||||
|  |     for(int k=0;k<eval.size();k++) { | ||||||
|  |       RD.readScidacFieldRecord(evec[k],record); | ||||||
|  |     } | ||||||
|  |     RD.close(); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Want Op in CoarsenOp to call MatPcDagMatPc | ||||||
|  | template<class Field> | ||||||
|  | class HermOpAdaptor : public LinearOperatorBase<Field> | ||||||
|  | { | ||||||
|  |   LinearOperatorBase<Field> & wrapped; | ||||||
|  | public: | ||||||
|  |   HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {}; | ||||||
|  |   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  } | ||||||
|  |   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); } | ||||||
|  |   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  } | ||||||
|  |   void OpDiag (const Field &in, Field &out)                  {    assert(0);  } | ||||||
|  |   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  } | ||||||
|  |   void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  }; | ||||||
|  |   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template<class Field> class CGSmoother : public LinearFunction<Field> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |   using LinearFunction<Field>::operator(); | ||||||
|  |   typedef LinearOperatorBase<Field> FineOperator; | ||||||
|  |   FineOperator   & _SmootherOperator; | ||||||
|  |   int iters; | ||||||
|  |   CGSmoother(int _iters, FineOperator &SmootherOperator) : | ||||||
|  |     _SmootherOperator(SmootherOperator), | ||||||
|  |     iters(_iters) | ||||||
|  |   { | ||||||
|  |     std::cout << GridLogMessage<<" Mirs smoother iterations "<<iters<<std::endl; | ||||||
|  |   }; | ||||||
|  |   void operator() (const Field &in, Field &out)  | ||||||
|  |   { | ||||||
|  |     ConjugateGradient<Field>  CG(0.0,iters,false); // non-convergence is fine in a smoother | ||||||
|  |  | ||||||
|  |     out=Zero(); | ||||||
|  |  | ||||||
|  |     CG(_SmootherOperator,in,out); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   const int Ls=24; | ||||||
|  |   const int nbasis = 60; | ||||||
|  |   const int cb = 0 ; | ||||||
|  |   RealD mass=0.00078; | ||||||
|  |   RealD M5=1.8; | ||||||
|  |   RealD b=1.5; | ||||||
|  |   RealD c=0.5; | ||||||
|  |  | ||||||
|  |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), | ||||||
|  | 								   GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 								   GridDefaultMpi()); | ||||||
|  |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|  |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|  |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |  | ||||||
|  |   // Construct a coarsened grid with 4^4 cell | ||||||
|  |   //  Coordinate Block({4,4,6,4}); | ||||||
|  |   Coordinate Block({4,4,4,4}); | ||||||
|  |   Coordinate clatt = GridDefaultLatt(); | ||||||
|  |   for(int d=0;d<clatt.size();d++){ | ||||||
|  |     clatt[d] = clatt[d]/Block[d]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, | ||||||
|  | 							    GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 							    GridDefaultMpi());; | ||||||
|  |   GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); | ||||||
|  |  | ||||||
|  |   ///////////////////////// RNGs ///////////////////////////////// | ||||||
|  |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |   std::vector<int> seeds5({5,6,7,8}); | ||||||
|  |   std::vector<int> cseeds({5,6,7,8}); | ||||||
|  |  | ||||||
|  |   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5); | ||||||
|  |   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |   GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); | ||||||
|  |  | ||||||
|  |   ///////////////////////// Configuration ///////////////////////////////// | ||||||
|  |   LatticeGaugeField Umu(UGrid); | ||||||
|  |  | ||||||
|  |   FieldMetaData header; | ||||||
|  |   std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250"); | ||||||
|  |   NerscIO::readConfiguration(Umu,header,file); | ||||||
|  |  | ||||||
|  |   //////////////////////// Fermion action ////////////////////////////////// | ||||||
|  |   MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); | ||||||
|  |  | ||||||
|  |   SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf); | ||||||
|  |  | ||||||
|  |   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix; | ||||||
|  |   HermFineMatrix FineHermOp(HermOpEO); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |   ///////////// Coarse basis and Little Dirac Operator /////// | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |   typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator; | ||||||
|  |   typedef LittleDiracOperator::CoarseVector CoarseVector; | ||||||
|  |  | ||||||
|  |   NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); | ||||||
|  |  | ||||||
|  |   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; | ||||||
|  |   Subspace Aggregates(Coarse5d,FrbGrid,cb); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |   // Need to check about red-black grid coarsening | ||||||
|  |   //////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |   //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60"); | ||||||
|  |   std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60"); | ||||||
|  |   std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2"); | ||||||
|  |   std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60"); | ||||||
|  |   std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); | ||||||
|  |   std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); | ||||||
|  |   bool load_agg=true; | ||||||
|  |   bool load_refine=true; | ||||||
|  |   bool load_mat=false; | ||||||
|  |   bool load_evec=false; | ||||||
|  |  | ||||||
|  |   int refine=1; | ||||||
|  |   if ( load_agg ) { | ||||||
|  |     if ( !(refine) || (!load_refine) ) {  | ||||||
|  |       LoadBasis(Aggregates,subspace_file); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);  | ||||||
|  |     SaveBasis(Aggregates,subspace_file); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if ( load_refine ) { | ||||||
|  |     std::cout << " Load Refine "<< refine_file <<std::endl; | ||||||
|  |     LoadBasis(Aggregates,refine_file); | ||||||
|  |   } else { | ||||||
|  |     Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters | ||||||
|  |     //    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters | ||||||
|  |     SaveBasis(Aggregates,refine_file); | ||||||
|  |   } | ||||||
|  |   Aggregates.Orthogonalise(); | ||||||
|  |    | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Building MultiRHS Coarse operator"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |      | ||||||
|  |   const int nrhs=12; | ||||||
|  |      | ||||||
|  |   Coordinate mpi=GridDefaultMpi(); | ||||||
|  |   Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); | ||||||
|  |   Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); | ||||||
|  |   Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); | ||||||
|  |      | ||||||
|  |   GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);  | ||||||
|  |   typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t; | ||||||
|  |   MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); | ||||||
|  |  | ||||||
|  |   /////////////////////// | ||||||
|  |   // Deflation guesser object | ||||||
|  |   /////////////////////// | ||||||
|  |   MultiRHSDeflation<CoarseVector> MrhsGuesser; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////// | ||||||
|  |   // Block projector for coarse/fine | ||||||
|  |   ////////////////////////////////////////// | ||||||
|  |   MultiRHSBlockProject<LatticeFermionD> MrhsProjector; | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Coarsen after refine"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "         Coarse Lanczos               "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |  | ||||||
|  |   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix; | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 4.4.6.4 | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,501);  // for 4.4.4.4 blocking 350 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0014,42.0,501);  // for 4.4.4.4 blocking 700 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.002,42.0,501);  // for 4.4.4.4 blocking 1226 evs | ||||||
|  |   //  Chebyshev<CoarseVector>      IRLCheby(0.0025,42.0,501);  // for 4.4.4.4 blocking 1059 evs | ||||||
|  |   //  3e-4,2); | ||||||
|  |   Chebyshev<CoarseVector>      IRLCheby(0.0018,42.0,301);  // for 4.4.4.4 blocking  // 790 evs | ||||||
|  |    | ||||||
|  |   MrhsHermMatrix MrhsCoarseOp     (mrhs); | ||||||
|  |  | ||||||
|  |   CoarseVector pm_src(CoarseMrhs); | ||||||
|  |   pm_src = ComplexD(1.0); | ||||||
|  |   PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src); | ||||||
|  |  | ||||||
|  |   //  int Nk=nrhs*30; // 4.4.6.4 | ||||||
|  |   //  int Nk=nrhs*80; | ||||||
|  |   int Nk=nrhs*60; // 720 | ||||||
|  |   int Nm=Nk*4;    // 2880 ; generally finishes at 1440 | ||||||
|  |   int Nstop=512; | ||||||
|  |   int Nconv_test_interval=1; | ||||||
|  |    | ||||||
|  |   ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp, | ||||||
|  | 							  Coarse5d, | ||||||
|  | 							  CoarseMrhs, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  IRLCheby, | ||||||
|  | 							  Nstop, | ||||||
|  | 							  Nconv_test_interval, | ||||||
|  | 							  nrhs, | ||||||
|  | 							  Nk, | ||||||
|  | 							  Nm, | ||||||
|  | 							  3e-4,2); | ||||||
|  |  | ||||||
|  |   std::vector<RealD>            eval(Nm); | ||||||
|  |   std::vector<CoarseVector>     evec(Nm,Coarse5d); | ||||||
|  |   std::vector<CoarseVector>     c_src(nrhs,Coarse5d); | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << " Recompute coarse evecs  "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   evec.resize(Nm,Coarse5d); | ||||||
|  |   eval.resize(Nm); | ||||||
|  |   for(int r=0;r<nrhs;r++){ | ||||||
|  |     random(CRNG,c_src[r]); | ||||||
|  |   } | ||||||
|  |   int Nconv; | ||||||
|  |   IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl); | ||||||
|  |   Nconv = eval.size(); | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << " import coarse evecs  "<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   MrhsGuesser.ImportEigenBasis(evec,eval); | ||||||
|  |  | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Calling mRHS HDCG"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); | ||||||
|  |   MrhsProjector.ImportBasis(Aggregates.subspace); | ||||||
|  |  | ||||||
|  |   ////////////////////////// | ||||||
|  |   // Extra HDCG parameters | ||||||
|  |   ////////////////////////// | ||||||
|  |   int maxit=3000; | ||||||
|  |   ConjugateGradient<CoarseVector>  CG(7.5e-2,maxit,false); | ||||||
|  |   RealD lo=2.0; | ||||||
|  |   int ord = 7; | ||||||
|  |  | ||||||
|  |   DoNothingGuesser<CoarseVector> DoNothing; | ||||||
|  |   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); | ||||||
|  |   HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   // Mirs smoother | ||||||
|  |   ///////////////////////////////////////////////// | ||||||
|  |   RealD MirsShift = lo; | ||||||
|  |   ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift); | ||||||
|  |   CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ; | ||||||
|  |    | ||||||
|  |    | ||||||
|  |   TwoLevelADEF2mrhs<LatticeFermion,CoarseVector> | ||||||
|  |     HDCGmrhs(1.0e-8, 500, | ||||||
|  | 	     FineHermOp, | ||||||
|  | 	     CGsmooth, | ||||||
|  | 	     HPDSolveMrhs,    // Used in M1 | ||||||
|  | 	     HPDSolveMrhs,          // Used in Vstart | ||||||
|  | 	     MrhsProjector, | ||||||
|  | 	     MrhsGuesser, | ||||||
|  | 	     CoarseMrhs); | ||||||
|  |      | ||||||
|  |   std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid); | ||||||
|  |   std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid); | ||||||
|  |    | ||||||
|  |   for(int r=0;r<nrhs;r++){ | ||||||
|  |     random(RNG5,src_mrhs[r]); | ||||||
|  |     res_mrhs[r]=Zero(); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   HDCGmrhs(src_mrhs,res_mrhs); | ||||||
|  |  | ||||||
|  |   // Standard CG | ||||||
|  | #if 0 | ||||||
|  |   { | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |   std::cout << "Calling red black CG"<<std::endl; | ||||||
|  |   std::cout << "**************************************"<<std::endl; | ||||||
|  |        | ||||||
|  |     LatticeFermion result(FrbGrid); result=Zero(); | ||||||
|  |     LatticeFermion    src(FrbGrid); random(RNG5,src); | ||||||
|  |     result=Zero(); | ||||||
|  |  | ||||||
|  |     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false); | ||||||
|  |     CGfine(HermOpEO, src, result); | ||||||
|  |   } | ||||||
|  | #endif   | ||||||
|  |   Grid_finalize(); | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
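Editor's note: the file defines a load_evec flag but, as committed, always recomputes the coarse eigenpairs. A hedged sketch of how the Save/LoadEigenvectors helpers defined at the top of this file would slot in around the IRL.calc call:

    if ( load_evec ) {
      LoadEigenvectors(eval,evec,evec_file,eval_file);
      Nconv = eval.size();
    } else {
      IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl);
      SaveEigenvectors(eval,evec,evec_file,eval_file);
    }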