mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-02 21:14:32 +00:00 
			
		
		
		
	Compare commits
	
		
			76 Commits
		
	
	
		
			ad14a82742
			...
			feature/ft
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					bffd30abec | ||
| 
						 | 
					da919949f9 | ||
| 
						 | 
					b12b4fdaff | ||
| 
						 | 
					557fa483ff | ||
| 
						 | 
					fc15d55df6 | ||
| 
						 | 
					53573d7d94 | ||
| 
						 | 
					bb3c177000 | ||
| 
						 | 
					a3322b470f | ||
| 
						 | 
					f8f408e7a9 | ||
| 
						 | 
					baac1127d0 | ||
| 
						 | 
					6f1328160c | ||
| 
						 | 
					04cf902791 | ||
| 
						 | 
					7a5b1c1a19 | ||
| 
						 | 
					18d2d7da4a | ||
| 
						 | 
					b461184797 | ||
| 
						 | 
					4563b39305 | ||
| 
						 | 
					c9d5674d5b | ||
| 
						 | 
					486412635a | ||
| 
						 | 
					8b23a1546a | ||
| 
						 | 
					a901e4e369 | ||
| 
						 | 
					804d9367d4 | ||
| 
						 | 
					41d8adca95 | ||
| 
						 | 
					059e8e5bb0 | ||
| 
						 | 
					b3ee8ded96 | ||
| 
						 | 
					cf3584ad15 | ||
| 
						 | 
					a66973163f | ||
| 
						 | 
					4502a8c8a1 | ||
| 
						 | 
					9c902e4c2d | ||
| 
						 | 
					f3eb36adcf | ||
| 
						 | 
					7c246606c1 | ||
| 
						 | 
					172c75029e | ||
| 
						 | 
					6ae52da571 | ||
| 
						 | 
					4ee9c68053 | ||
| 
						 | 
					a15b4378a3 | ||
| 
						 | 
					89fdd7f8dd | ||
| 
						 | 
					c328be24b7 | ||
| 
						 | 
					a73dc6dbf4 | ||
| 
						 | 
					eee2a2657f | ||
| 
						 | 
					12b8be7cb9 | ||
| 
						 | 
					63c223ea5d | ||
| 
						 | 
					2877fb4a2c | ||
| 
						 | 
					d299c86633 | ||
| 
						 | 
					6ce52092e8 | ||
| 
						 | 
					b5926c1d21 | ||
| 
						 | 
					9563238e9b | ||
| 
						 | 
					fb9b1d76ca | ||
| 
						 | 
					1739146599 | ||
| 
						 | 
					ed20b39ab3 | ||
| 
						 | 
					284fc05f15 | ||
| 
						 | 
					07a07b6fa3 | ||
| 
						 | 
					dc80b08969 | ||
| 
						 | 
					a49a161f8d | ||
| 
						 | 
					a6479ca50f | ||
| 
						 | 
					0e607a55e7 | ||
| 
						 | 
					c4b9f71357 | ||
| 
						 | 
					394e506aea | ||
| 
						 | 
					e19b26341b | ||
| 
						 | 
					cfe1b13225 | ||
| 
						 | 
					890c5ea1cd | ||
| 
						 | 
					a87378d3b6 | ||
| 
						 | 
					832fc08809 | ||
| 
						 | 
					9a1ad6a5eb | ||
| 
						 | 
					a90eafad24 | ||
| 
						 | 
					ccf147d6c1 | ||
| 
						 | 
					7aa12b446f | ||
| 
						 | 
					c293228102 | ||
| 
						 | 
					5c4c9f721a | ||
| 
						 | 
					057f86c1de | ||
| 
						 | 
					cd52e3cbc2 | ||
| 
						 | 
					24602e1259 | ||
| 
						 | 
					8a098889fc | ||
| 
						 | 
					461cd045c6 | ||
| 
						 | 
					fee65d7a75 | ||
| 
						 | 
					31f9971dbf | ||
| 
						 | 
					d87296f3e8 | ||
| 
						 | 
					be94cf1c6f | 
							
								
								
									
										1052
									
								
								BLAS_benchmark/BatchBlasBench.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1052
									
								
								BLAS_benchmark/BatchBlasBench.cc
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										2
									
								
								BLAS_benchmark/compile-command
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								BLAS_benchmark/compile-command
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,2 @@
 | 
			
		||||
 | 
			
		||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
 | 
			
		||||
@@ -30,9 +30,14 @@ directory
 | 
			
		||||
 | 
			
		||||
#include <type_traits>
 | 
			
		||||
#include <cassert>
 | 
			
		||||
#include <exception>
 | 
			
		||||
 | 
			
		||||
#define NAMESPACE_BEGIN(A) namespace A {
 | 
			
		||||
#define NAMESPACE_END(A)   }
 | 
			
		||||
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 | 
			
		||||
#define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
 | 
			
		||||
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  ); 
 | 
			
		||||
 | 
			
		||||
#define EXCEPTION_CHECK_BEGIN(A) try {
 | 
			
		||||
#define EXCEPTION_CHECK_END(A)   } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -89,9 +89,10 @@ public:
 | 
			
		||||
      gridblasHandle = theGridAccelerator;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_ONE_MKL
 | 
			
		||||
      cl::sycl::cpu_selector selector;
 | 
			
		||||
      cl::sycl::gpu_selector selector;
 | 
			
		||||
      cl::sycl::device selectedDevice { selector };
 | 
			
		||||
      gridblasHandle =new sycl::queue (selectedDevice);
 | 
			
		||||
      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
 | 
			
		||||
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 | 
			
		||||
#endif
 | 
			
		||||
      gridblasInit=1;
 | 
			
		||||
    }
 | 
			
		||||
@@ -207,6 +208,9 @@ public:
 | 
			
		||||
    assert(Bkn.size()==batchCount);
 | 
			
		||||
    assert(Cmn.size()==batchCount);
 | 
			
		||||
 | 
			
		||||
    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
 | 
			
		||||
    assert(OpB!=GridBLAS_OP_T);
 | 
			
		||||
 | 
			
		||||
    int lda = m; // m x k column major
 | 
			
		||||
    int ldb = k; // k x n column major
 | 
			
		||||
    int ldc = m; // m x b column major
 | 
			
		||||
@@ -266,26 +270,130 @@ public:
 | 
			
		||||
    assert(err==CUBLAS_STATUS_SUCCESS);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
    //MKL’s cblas_<T>gemm_batch & OneAPI
 | 
			
		||||
#warning "oneMKL implementation not built "
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
 | 
			
		||||
    // Need a default/reference implementation
 | 
			
		||||
    int sda = lda*k;
 | 
			
		||||
    int sdb = ldb*k;
 | 
			
		||||
    int sdc = ldc*n;
 | 
			
		||||
    for (int p = 0; p < batchCount; ++p) {
 | 
			
		||||
      for (int mm = 0; mm < m; ++mm) {
 | 
			
		||||
	for (int nn = 0; nn < n; ++nn) {
 | 
			
		||||
	  ComplexD c_mn(0.0);
 | 
			
		||||
	  for (int kk = 0; kk < k; ++kk)
 | 
			
		||||
	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
 | 
			
		||||
	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
 | 
			
		||||
      int64_t m64=m;
 | 
			
		||||
      int64_t n64=n;
 | 
			
		||||
      int64_t k64=k;
 | 
			
		||||
      int64_t lda64=lda;
 | 
			
		||||
      int64_t ldb64=ldb;
 | 
			
		||||
      int64_t ldc64=ldc;
 | 
			
		||||
      int64_t batchCount64=batchCount;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::transpose iOpA;
 | 
			
		||||
      oneapi::mkl::transpose iOpB;
 | 
			
		||||
      
 | 
			
		||||
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 | 
			
		||||
						  &iOpA,
 | 
			
		||||
						  &iOpB,
 | 
			
		||||
						  &m64,&n64,&k64,
 | 
			
		||||
						  (ComplexD *) &alpha_p[0],
 | 
			
		||||
						  (const ComplexD **)&Amk[0], (const int64_t *)&lda64,
 | 
			
		||||
						  (const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
 | 
			
		||||
						  (ComplexD *) &beta_p[0],
 | 
			
		||||
						  (ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
 | 
			
		||||
						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
 | 
			
		||||
      synchronise();
 | 
			
		||||
#if 0
 | 
			
		||||
      // This code was used to check the mat mul on Sunspot/OneMKL
 | 
			
		||||
      std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
 | 
			
		||||
      std::vector<ComplexD> A(m*k);  // pointer list to matrices
 | 
			
		||||
      std::vector<ComplexD> B(k*n);
 | 
			
		||||
      std::vector<ComplexD> C(m*n);
 | 
			
		||||
      //      int sda = lda*k;
 | 
			
		||||
      //      int sdb = ldb*k;
 | 
			
		||||
      //      int sdc = ldc*n;
 | 
			
		||||
      std::cerr << " Checking the GEMM results "<<std::endl;
 | 
			
		||||
      for (int p = 0; p < 1; ++p) {
 | 
			
		||||
	ComplexD * Amk_p;  // pointer list to matrices
 | 
			
		||||
	ComplexD * Bkn_p;  // pointer list to matrices
 | 
			
		||||
	ComplexD * Cmn_p;  // pointer list to matrices
 | 
			
		||||
	acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
 | 
			
		||||
	acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
 | 
			
		||||
	acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
 | 
			
		||||
	std::cerr << " p " << p << " copied pointers "<<std::endl;
 | 
			
		||||
	acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
 | 
			
		||||
	acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
 | 
			
		||||
	acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
 | 
			
		||||
	std::cerr << " p " << p << " copied matrices "<<std::endl;
 | 
			
		||||
	std::cerr << " C[0] "<<C[0]<<std::endl;
 | 
			
		||||
	std::cerr << " A[0] "<<A[0]<<std::endl;
 | 
			
		||||
	std::cerr << " B[0] "<<B[0]<<std::endl;
 | 
			
		||||
	std::cerr << " m "<<m<<std::endl;
 | 
			
		||||
	std::cerr << " n "<<n<<std::endl;
 | 
			
		||||
	std::cerr << " k "<<k<<std::endl;
 | 
			
		||||
	for (int mm = 0; mm < m; ++mm) {
 | 
			
		||||
	  for (int nn = 0; nn < n; ++nn) {
 | 
			
		||||
	    ComplexD c_mn(0.0);
 | 
			
		||||
	    for (int kk = 0; kk < k; ++kk) {
 | 
			
		||||
	      int idx_a, idx_b;
 | 
			
		||||
	      //    int lda = m; // m x k column major
 | 
			
		||||
	      //    int ldb = k; // k x n column major
 | 
			
		||||
	      //    int ldc = m; // m x b column major
 | 
			
		||||
	      if(OpA!=GridBLAS_OP_N) {
 | 
			
		||||
		idx_a =kk + mm*lda;
 | 
			
		||||
	      } else {
 | 
			
		||||
		idx_a =mm + kk*lda;
 | 
			
		||||
	      }
 | 
			
		||||
	      if(OpB!=GridBLAS_OP_N) {
 | 
			
		||||
		idx_b =nn + kk*ldb;
 | 
			
		||||
	      } else {
 | 
			
		||||
		idx_b =kk + nn*ldb;
 | 
			
		||||
	      }
 | 
			
		||||
	      //	      std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
 | 
			
		||||
 | 
			
		||||
	      ComplexD Ac = A[idx_a];
 | 
			
		||||
	      ComplexD Bc = B[idx_b];
 | 
			
		||||
	      if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
 | 
			
		||||
	      if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
 | 
			
		||||
	      
 | 
			
		||||
	      c_mn += Ac*Bc;
 | 
			
		||||
	    }
 | 
			
		||||
	    std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
    //    synchronise();
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
 | 
			
		||||
    // Need a default/reference implementation; use Eigen
 | 
			
		||||
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 | 
			
		||||
        });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 | 
			
		||||
	  } );
 | 
			
		||||
      } else { 
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
#endif
 | 
			
		||||
     RealD t1=usecond();
 | 
			
		||||
     RealD flops = 8.0*m*n*k*batchCount;
 | 
			
		||||
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
 | 
			
		||||
@@ -306,6 +414,9 @@ public:
 | 
			
		||||
    RealD t2=usecond();
 | 
			
		||||
    int32_t batchCount = Amk.size();
 | 
			
		||||
 | 
			
		||||
    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
 | 
			
		||||
    assert(OpB!=GridBLAS_OP_T);
 | 
			
		||||
 | 
			
		||||
    int lda = m; // m x k column major
 | 
			
		||||
    int ldb = k; // k x n column major
 | 
			
		||||
    int ldc = m; // m x b column major
 | 
			
		||||
@@ -366,26 +477,69 @@ public:
 | 
			
		||||
    assert(err==CUBLAS_STATUS_SUCCESS);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
    //MKL’s cblas_<T>gemm_batch & OneAPI
 | 
			
		||||
#warning "oneMKL implementation not built "
 | 
			
		||||
      int64_t m64=m;
 | 
			
		||||
      int64_t n64=n;
 | 
			
		||||
      int64_t k64=k;
 | 
			
		||||
      int64_t lda64=lda;
 | 
			
		||||
      int64_t ldb64=ldb;
 | 
			
		||||
      int64_t ldc64=ldc;
 | 
			
		||||
      int64_t batchCount64=batchCount;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::transpose iOpA;
 | 
			
		||||
      oneapi::mkl::transpose iOpB;
 | 
			
		||||
      
 | 
			
		||||
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 | 
			
		||||
						  &iOpA,
 | 
			
		||||
						  &iOpB,
 | 
			
		||||
						  &m64,&n64,&k64,
 | 
			
		||||
						  (ComplexF *) &alpha_p[0],
 | 
			
		||||
						  (const ComplexF **)&Amk[0], (const int64_t *)&lda64,
 | 
			
		||||
						  (const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
 | 
			
		||||
						  (ComplexF *) &beta_p[0],
 | 
			
		||||
						  (ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
 | 
			
		||||
						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
 | 
			
		||||
    synchronise();
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
 | 
			
		||||
    int sda = lda*k;
 | 
			
		||||
    int sdb = ldb*k;
 | 
			
		||||
    int sdc = ldc*n;
 | 
			
		||||
    ComplexF alphaf(real(alpha),imag(alpha));
 | 
			
		||||
    ComplexF betaf(real(beta),imag(beta));
 | 
			
		||||
    // Need a default/reference implementation
 | 
			
		||||
    for (int p = 0; p < batchCount; ++p) {
 | 
			
		||||
      for (int mm = 0; mm < m; ++mm) {
 | 
			
		||||
	for (int nn = 0; nn < n; ++nn) {
 | 
			
		||||
	  ComplexF c_mn(0.0);
 | 
			
		||||
	  for (int kk = 0; kk < k; ++kk)
 | 
			
		||||
	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
 | 
			
		||||
	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
 | 
			
		||||
	}
 | 
			
		||||
    // Need a default/reference implementation; use Eigen
 | 
			
		||||
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
 | 
			
		||||
	  } );
 | 
			
		||||
      } else { 
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
     RealD t1=usecond();
 | 
			
		||||
     RealD flops = 8.0*m*n*k*batchCount;
 | 
			
		||||
@@ -408,6 +562,9 @@ public:
 | 
			
		||||
    RealD t2=usecond();
 | 
			
		||||
    int32_t batchCount = Amk.size();
 | 
			
		||||
 | 
			
		||||
    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
 | 
			
		||||
    assert(OpB!=GridBLAS_OP_C);
 | 
			
		||||
 | 
			
		||||
    int lda = m; // m x k column major
 | 
			
		||||
    int ldb = k; // k x n column major
 | 
			
		||||
    int ldc = m; // m x b column major
 | 
			
		||||
@@ -467,24 +624,69 @@ public:
 | 
			
		||||
    assert(err==CUBLAS_STATUS_SUCCESS);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
    //MKL’s cblas_<T>gemm_batch & OneAPI
 | 
			
		||||
#warning "oneMKL implementation not built "
 | 
			
		||||
      int64_t m64=m;
 | 
			
		||||
      int64_t n64=n;
 | 
			
		||||
      int64_t k64=k;
 | 
			
		||||
      int64_t lda64=lda;
 | 
			
		||||
      int64_t ldb64=ldb;
 | 
			
		||||
      int64_t ldc64=ldc;
 | 
			
		||||
      int64_t batchCount64=batchCount;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::transpose iOpA;
 | 
			
		||||
      oneapi::mkl::transpose iOpB;
 | 
			
		||||
      
 | 
			
		||||
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 | 
			
		||||
						  &iOpA,
 | 
			
		||||
						  &iOpB,
 | 
			
		||||
						  &m64,&n64,&k64,
 | 
			
		||||
						  (float *) &alpha_p[0],
 | 
			
		||||
						  (const float **)&Amk[0], (const int64_t *)&lda64,
 | 
			
		||||
						  (const float **)&Bkn[0], (const int64_t *)&ldb64,
 | 
			
		||||
						  (float *) &beta_p[0],
 | 
			
		||||
						  (float **)&Cmn[0], (const int64_t *)&ldc64,
 | 
			
		||||
						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
 | 
			
		||||
      synchronise();
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
 | 
			
		||||
    int sda = lda*k;
 | 
			
		||||
    int sdb = ldb*k;
 | 
			
		||||
    int sdc = ldc*n;
 | 
			
		||||
    // Need a default/reference implementation
 | 
			
		||||
    for (int p = 0; p < batchCount; ++p) {
 | 
			
		||||
      for (int mm = 0; mm < m; ++mm) {
 | 
			
		||||
	for (int nn = 0; nn < n; ++nn) {
 | 
			
		||||
	  RealD c_mn(0.0);
 | 
			
		||||
	  for (int kk = 0; kk < k; ++kk)
 | 
			
		||||
	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
 | 
			
		||||
	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
 | 
			
		||||
	}
 | 
			
		||||
    // Need a default/reference implementation; use Eigen
 | 
			
		||||
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 | 
			
		||||
	  } );
 | 
			
		||||
      } else { 
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
     RealD t1=usecond();
 | 
			
		||||
     RealD flops = 2.0*m*n*k*batchCount;
 | 
			
		||||
@@ -495,7 +697,6 @@ public:
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Double precision real GEMM
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  void gemmBatched(GridBLASOperation_t OpA,
 | 
			
		||||
		   GridBLASOperation_t OpB,
 | 
			
		||||
		   int m,int n, int k,
 | 
			
		||||
@@ -508,6 +709,9 @@ public:
 | 
			
		||||
    RealD t2=usecond();
 | 
			
		||||
    int32_t batchCount = Amk.size();
 | 
			
		||||
 | 
			
		||||
    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
 | 
			
		||||
    assert(OpB!=GridBLAS_OP_C);
 | 
			
		||||
 | 
			
		||||
    int lda = m; // m x k column major
 | 
			
		||||
    int ldb = k; // k x n column major
 | 
			
		||||
    int ldc = m; // m x b column major
 | 
			
		||||
@@ -568,160 +772,124 @@ public:
 | 
			
		||||
    assert(err==CUBLAS_STATUS_SUCCESS);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
    /*
 | 
			
		||||
      int64_t m64=m;
 | 
			
		||||
      int64_t n64=n;
 | 
			
		||||
      int64_t k64=k;
 | 
			
		||||
      int64_t lda64=lda;
 | 
			
		||||
      int64_t ldb64=ldb;
 | 
			
		||||
      int64_t ldc64=ldc;
 | 
			
		||||
      int64_t batchCount64=batchCount;
 | 
			
		||||
      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
 | 
			
		||||
      onemkl::transpose::N,
 | 
			
		||||
      onemkl::transpose::N,
 | 
			
		||||
      &m64,&n64,&k64,
 | 
			
		||||
      (double *) &alpha_p[0],
 | 
			
		||||
      (double **)&Amk[0], lda,
 | 
			
		||||
      (double **)&Bkn[0], ldb,
 | 
			
		||||
      (double *) &beta_p[0],
 | 
			
		||||
      (double **)&Cmn[0], ldc,
 | 
			
		||||
      1,&batchCount64);
 | 
			
		||||
     */
 | 
			
		||||
    //MKL’s cblas_<T>gemm_batch & OneAPI
 | 
			
		||||
#warning "oneMKL implementation not built "
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::transpose iOpA;
 | 
			
		||||
      oneapi::mkl::transpose iOpB;
 | 
			
		||||
      
 | 
			
		||||
      if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
 | 
			
		||||
      if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
 | 
			
		||||
 | 
			
		||||
      oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 | 
			
		||||
						  &iOpA,
 | 
			
		||||
						  &iOpB,
 | 
			
		||||
						  &m64,&n64,&k64,
 | 
			
		||||
						  (double *) &alpha_p[0],
 | 
			
		||||
						  (const double **)&Amk[0], (const int64_t *)&lda64,
 | 
			
		||||
						  (const double **)&Bkn[0], (const int64_t *)&ldb64,
 | 
			
		||||
						  (double *) &beta_p[0],
 | 
			
		||||
						  (double **)&Cmn[0], (const int64_t *)&ldc64,
 | 
			
		||||
						  (int64_t)1,&batchCount64,std::vector<sycl::event>());
 | 
			
		||||
      synchronise();
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
 | 
			
		||||
    int sda = lda*k;
 | 
			
		||||
    int sdb = ldb*k;
 | 
			
		||||
    int sdc = ldc*n;
 | 
			
		||||
    // Need a default/reference implementation
 | 
			
		||||
    for (int p = 0; p < batchCount; ++p) {
 | 
			
		||||
      for (int mm = 0; mm < m; ++mm) {
 | 
			
		||||
	for (int nn = 0; nn < n; ++nn) {
 | 
			
		||||
	  RealD c_mn(0.0);
 | 
			
		||||
	  for (int kk = 0; kk < k; ++kk)
 | 
			
		||||
	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
 | 
			
		||||
	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
 | 
			
		||||
	}
 | 
			
		||||
    // Need a default/reference implementation; use Eigen
 | 
			
		||||
      if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 | 
			
		||||
	thread_for (p, batchCount, {
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 | 
			
		||||
	  Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
 | 
			
		||||
	  eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
 | 
			
		||||
	  });
 | 
			
		||||
      } else { 
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
     RealD t1=usecond();
 | 
			
		||||
     RealD flops = 2.0*m*n*k*batchCount;
 | 
			
		||||
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Strided case used by benchmark, but generally unused in Grid
 | 
			
		||||
  // Keep a code example in double complex, but don't generate the single and real variants for now
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
  void gemmStridedBatched(int m,int n, int k,
 | 
			
		||||
			  ComplexD alpha,
 | 
			
		||||
			  ComplexD* Amk,  // pointer list to matrices
 | 
			
		||||
			  ComplexD* Bkn,
 | 
			
		||||
			  ComplexD beta,
 | 
			
		||||
			  ComplexD* Cmn,
 | 
			
		||||
			  int batchCount)
 | 
			
		||||
  {
 | 
			
		||||
    // Use C-row major storage, so transpose calls
 | 
			
		||||
    int lda = m; // m x k column major
 | 
			
		||||
    int ldb = k; // k x n column major
 | 
			
		||||
    int ldc = m; // m x b column major
 | 
			
		||||
    int sda = m*k;
 | 
			
		||||
    int sdb = k*n;
 | 
			
		||||
    int sdc = m*n;
 | 
			
		||||
    deviceVector<ComplexD> alpha_p(1);
 | 
			
		||||
    deviceVector<ComplexD> beta_p(1);
 | 
			
		||||
    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
 | 
			
		||||
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
 | 
			
		||||
    //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
 | 
			
		||||
    //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
 | 
			
		||||
    //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
    auto err = hipblasZgemmStridedBatched(gridblasHandle,
 | 
			
		||||
					  HIPBLAS_OP_N,
 | 
			
		||||
					  HIPBLAS_OP_N,
 | 
			
		||||
					  m,n,k,
 | 
			
		||||
					  (hipblasDoubleComplex *) &alpha_p[0],
 | 
			
		||||
					  (hipblasDoubleComplex *) Amk, lda, sda,
 | 
			
		||||
					  (hipblasDoubleComplex *) Bkn, ldb, sdb,
 | 
			
		||||
					  (hipblasDoubleComplex *) &beta_p[0],
 | 
			
		||||
					  (hipblasDoubleComplex *) Cmn, ldc, sdc,
 | 
			
		||||
					  batchCount);
 | 
			
		||||
    assert(err==HIPBLAS_STATUS_SUCCESS);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
    cublasZgemmStridedBatched(gridblasHandle,
 | 
			
		||||
			      CUBLAS_OP_N,
 | 
			
		||||
			      CUBLAS_OP_N,
 | 
			
		||||
			      m,n,k,
 | 
			
		||||
			      (cuDoubleComplex *) &alpha_p[0],
 | 
			
		||||
			      (cuDoubleComplex *) Amk, lda, sda,
 | 
			
		||||
			      (cuDoubleComplex *) Bkn, ldb, sdb,
 | 
			
		||||
			      (cuDoubleComplex *) &beta_p[0],
 | 
			
		||||
			      (cuDoubleComplex *) Cmn, ldc, sdc,
 | 
			
		||||
			      batchCount);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
 | 
			
		||||
    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
 | 
			
		||||
						oneapi::mkl::transpose::N,
 | 
			
		||||
						oneapi::mkl::transpose::N,
 | 
			
		||||
						m,n,k,
 | 
			
		||||
						alpha,
 | 
			
		||||
						(const ComplexD *)Amk,lda,sda,
 | 
			
		||||
						(const ComplexD *)Bkn,ldb,sdb,
 | 
			
		||||
						beta,
 | 
			
		||||
						(ComplexD *)Cmn,ldc,sdc,
 | 
			
		||||
						batchCount);
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
 | 
			
		||||
     // Need a default/reference implementation
 | 
			
		||||
     for (int p = 0; p < batchCount; ++p) {
 | 
			
		||||
       for (int mm = 0; mm < m; ++mm) {
 | 
			
		||||
	 for (int nn = 0; nn < n; ++nn) {
 | 
			
		||||
	   ComplexD c_mn(0.0);
 | 
			
		||||
	   for (int kk = 0; kk < k; ++kk)
 | 
			
		||||
	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
 | 
			
		||||
	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
 | 
			
		||||
	 }
 | 
			
		||||
       }
 | 
			
		||||
     }
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class CComplex>
 | 
			
		||||
  double benchmark(int M, int N, int K, int BATCH)
 | 
			
		||||
  {
 | 
			
		||||
    int32_t N_A = M*K*BATCH;
 | 
			
		||||
    int32_t N_B = K*N*BATCH;
 | 
			
		||||
    int32_t N_C = M*N*BATCH;
 | 
			
		||||
    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
 | 
			
		||||
    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
 | 
			
		||||
    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
 | 
			
		||||
    ComplexD alpha(1.0);
 | 
			
		||||
    ComplexD beta (1.0);
 | 
			
		||||
    deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
 | 
			
		||||
    deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
 | 
			
		||||
    deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
 | 
			
		||||
    CComplex alpha(1.0);
 | 
			
		||||
    CComplex beta (1.0);
 | 
			
		||||
    RealD flops = 8.0*M*N*K*BATCH;
 | 
			
		||||
    int ncall=10;
 | 
			
		||||
    int ncall=1000;
 | 
			
		||||
    deviceVector<CComplex *> As(BATCH);
 | 
			
		||||
    deviceVector<CComplex *> Bs(BATCH);
 | 
			
		||||
    deviceVector<CComplex *> Cs(BATCH);
 | 
			
		||||
    for(int b = 0 ; b < BATCH;b++) {
 | 
			
		||||
      CComplex *ptr;
 | 
			
		||||
      ptr = &A[b*M*K];      acceleratorPut(As[b],ptr);
 | 
			
		||||
      ptr = &B[b*K*N];      acceleratorPut(Bs[b],ptr);
 | 
			
		||||
      ptr = &C[b*M*N];      acceleratorPut(Cs[b],ptr);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Warm up call
 | 
			
		||||
    gemmBatched(M,N,K,
 | 
			
		||||
		alpha,
 | 
			
		||||
		As, // m x k 
 | 
			
		||||
		Bs, // k x n
 | 
			
		||||
		beta, 
 | 
			
		||||
		Cs);
 | 
			
		||||
    synchronise();
 | 
			
		||||
 | 
			
		||||
    RealD t0 = usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      gemmStridedBatched(M,N,K,
 | 
			
		||||
			 alpha,
 | 
			
		||||
			 &A[0], // m x k 
 | 
			
		||||
			 &B[0], // k x n
 | 
			
		||||
			 beta, 
 | 
			
		||||
			 &C[0], // m x n
 | 
			
		||||
			 BATCH);
 | 
			
		||||
      gemmBatched(M,N,K,
 | 
			
		||||
		  alpha,
 | 
			
		||||
		  As, // m x k 
 | 
			
		||||
		  Bs, // k x n
 | 
			
		||||
		  beta, 
 | 
			
		||||
		  Cs);
 | 
			
		||||
      synchronise();
 | 
			
		||||
    }
 | 
			
		||||
    synchronise();
 | 
			
		||||
    RealD t1 = usecond();
 | 
			
		||||
    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
 | 
			
		||||
    RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
 | 
			
		||||
    flops = 8.0*M*N*K*BATCH*ncall;
 | 
			
		||||
    flops = flops/(t1-t0)/1.e3;
 | 
			
		||||
    return flops; // Returns gigaflops
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
@@ -279,11 +279,11 @@ public:
 | 
			
		||||
      Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
 | 
			
		||||
      diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
 | 
			
		||||
      _sort.push(eval2,Nm);
 | 
			
		||||
      //      Glog << "#Ritz value before shift: "<< std::endl;
 | 
			
		||||
      Glog << "#Ritz value before shift: "<< std::endl;
 | 
			
		||||
      for(int i=0; i<Nm; ++i){
 | 
			
		||||
	//        std::cout.precision(13);
 | 
			
		||||
	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
			
		||||
	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
 | 
			
		||||
	std::cout.precision(13);
 | 
			
		||||
	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
			
		||||
	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      //----------------------------------------------------------------------
 | 
			
		||||
@@ -297,7 +297,8 @@ public:
 | 
			
		||||
        
 | 
			
		||||
        unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
 | 
			
		||||
 | 
			
		||||
        for(int ip=Nk; ip<Nm; ++ip){ 
 | 
			
		||||
        for(int ip=Nk; ip<Nm; ++ip){
 | 
			
		||||
	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
 | 
			
		||||
          shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
 | 
			
		||||
        }
 | 
			
		||||
        
 | 
			
		||||
@@ -325,7 +326,7 @@ public:
 | 
			
		||||
        Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
 | 
			
		||||
        diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
 | 
			
		||||
        _sort.push(eval2,Nk);
 | 
			
		||||
	//        Glog << "#Ritz value after shift: "<< std::endl;
 | 
			
		||||
	Glog << "#Ritz value after shift: "<< std::endl;
 | 
			
		||||
        for(int i=0; i<Nk; ++i){
 | 
			
		||||
	  //          std::cout.precision(13);
 | 
			
		||||
	  //          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
			
		||||
@@ -467,10 +468,10 @@ public:
 | 
			
		||||
  
 | 
			
		||||
    // set initial vector
 | 
			
		||||
    for (int i=0; i<Nu; ++i) {
 | 
			
		||||
      //      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
 | 
			
		||||
      Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
 | 
			
		||||
      evec[i] = src[i];
 | 
			
		||||
      orthogonalize(evec[i],evec,i);
 | 
			
		||||
      //      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
 | 
			
		||||
      Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
//    exit(-43);
 | 
			
		||||
    
 | 
			
		||||
@@ -506,11 +507,11 @@ public:
 | 
			
		||||
      Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
 | 
			
		||||
      diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
 | 
			
		||||
      _sort.push(eval2,Nr);
 | 
			
		||||
      //      Glog << "#Ritz value: "<< std::endl;
 | 
			
		||||
      Glog << "#Ritz value: "<< std::endl;
 | 
			
		||||
      for(int i=0; i<Nr; ++i){
 | 
			
		||||
	//        std::cout.precision(13);
 | 
			
		||||
	//        std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
			
		||||
	//        std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
 | 
			
		||||
	std::cout.precision(13);
 | 
			
		||||
	std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 | 
			
		||||
	std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Convergence test
 | 
			
		||||
@@ -570,6 +571,7 @@ public:
 | 
			
		||||
      Glog << fname + " NOT converged ; Summary :\n";
 | 
			
		||||
    } else {
 | 
			
		||||
      Glog << fname + " CONVERGED ; Summary :\n";
 | 
			
		||||
      Nstop = Nconv_guess; // Just take them all
 | 
			
		||||
      // Sort convered eigenpairs.
 | 
			
		||||
      std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating
 | 
			
		||||
 | 
			
		||||
@@ -642,7 +644,7 @@ private:
 | 
			
		||||
      //      for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
 | 
			
		||||
      k_start +=mrhs;
 | 
			
		||||
    }
 | 
			
		||||
    //    Glog << "LinAlg "<< std::endl;
 | 
			
		||||
    Glog << "LinAlg "<< std::endl;
 | 
			
		||||
    
 | 
			
		||||
    if (b>0) {
 | 
			
		||||
      for (int u=0; u<Nu; ++u) {
 | 
			
		||||
@@ -676,7 +678,7 @@ private:
 | 
			
		||||
      }
 | 
			
		||||
      w_copy[u] = w[u];
 | 
			
		||||
    }
 | 
			
		||||
    //    Glog << "LinAlg done"<< std::endl;
 | 
			
		||||
    Glog << "LinAlg done"<< std::endl;
 | 
			
		||||
    
 | 
			
		||||
    // In block version, the steps 6 and 7 in Lanczos construction is
 | 
			
		||||
    // replaced by the QR decomposition of new basis block.
 | 
			
		||||
@@ -689,15 +691,15 @@ private:
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // re-orthogonalization for numerical stability
 | 
			
		||||
    //    Glog << "Gram Schmidt"<< std::endl;
 | 
			
		||||
    Glog << "Gram Schmidt"<< std::endl;
 | 
			
		||||
    orthogonalize(w,Nu,evec,R);
 | 
			
		||||
    // QR part
 | 
			
		||||
    for (int u=1; u<Nu; ++u) {
 | 
			
		||||
      orthogonalize(w[u],w,u);
 | 
			
		||||
    }
 | 
			
		||||
    //    Glog << "Gram Schmidt done "<< std::endl;
 | 
			
		||||
    Glog << "Gram Schmidt done "<< std::endl;
 | 
			
		||||
    
 | 
			
		||||
    //    Glog << "LinAlg "<< std::endl;
 | 
			
		||||
    Glog << "LinAlg "<< std::endl;
 | 
			
		||||
    for (int u=0; u<Nu; ++u) {
 | 
			
		||||
      //for (int v=0; v<Nu; ++v) {
 | 
			
		||||
      for (int v=u; v<Nu; ++v) {
 | 
			
		||||
@@ -714,7 +716,7 @@ private:
 | 
			
		||||
	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    //    Glog << "LinAlg done "<< std::endl;
 | 
			
		||||
    Glog << "LinAlg done "<< std::endl;
 | 
			
		||||
 | 
			
		||||
    if (b < Nm/Nu-1) {
 | 
			
		||||
      for (int u=0; u<Nu; ++u) {
 | 
			
		||||
@@ -779,7 +781,7 @@ private:
 | 
			
		||||
    
 | 
			
		||||
    for ( int u=0; u<Nu; ++u ) {
 | 
			
		||||
      for (int k=0; k<Nk; ++k ) {
 | 
			
		||||
//        Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
 | 
			
		||||
	//	Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
 | 
			
		||||
        BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
@@ -933,7 +935,7 @@ if (1){
 | 
			
		||||
         int Nu, int Nb, int Nk, int Nm,
 | 
			
		||||
         Eigen::MatrixXcd& M)
 | 
			
		||||
  {
 | 
			
		||||
    //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
 | 
			
		||||
    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
 | 
			
		||||
    assert( Nk%Nu == 0 && Nm%Nu == 0 );
 | 
			
		||||
    assert( Nk <= Nm );
 | 
			
		||||
    M = Eigen::MatrixXcd::Zero(Nk,Nk);
 | 
			
		||||
@@ -951,7 +953,7 @@ if (1){
 | 
			
		||||
        M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl; 
 | 
			
		||||
    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; 
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
 | 
			
		||||
@@ -961,7 +963,7 @@ if (1){
 | 
			
		||||
         int Nu, int Nb, int Nk, int Nm,
 | 
			
		||||
         Eigen::MatrixXcd& M)
 | 
			
		||||
  {
 | 
			
		||||
    //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
 | 
			
		||||
    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
 | 
			
		||||
    assert( Nk%Nu == 0 && Nm%Nu == 0 );
 | 
			
		||||
    assert( Nk <= Nm );
 | 
			
		||||
    
 | 
			
		||||
@@ -977,7 +979,7 @@ if (1){
 | 
			
		||||
        lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl; 
 | 
			
		||||
    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -986,7 +988,7 @@ if (1){
 | 
			
		||||
		            RealD Dsh,
 | 
			
		||||
		            Eigen::MatrixXcd& Qprod)
 | 
			
		||||
  {
 | 
			
		||||
    //Glog << "shiftedQRDecompEigen() begin" << '\n'; 
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() begin" << '\n'; 
 | 
			
		||||
    Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
 | 
			
		||||
    Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
 | 
			
		||||
    Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
 | 
			
		||||
@@ -1002,6 +1004,7 @@ if (1){
 | 
			
		||||
                        // lower triangular part used to represent series
 | 
			
		||||
                        // of Q sequence.
 | 
			
		||||
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; 
 | 
			
		||||
    // equivalent operation of Qprod *= Q
 | 
			
		||||
    //M = Eigen::MatrixXcd::Zero(Nm,Nm);
 | 
			
		||||
    
 | 
			
		||||
@@ -1022,6 +1025,7 @@ if (1){
 | 
			
		||||
    
 | 
			
		||||
    Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
 | 
			
		||||
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; 
 | 
			
		||||
    for (int i=0; i<Nm; ++i) {
 | 
			
		||||
      for (int j=0; j<Nm-(Nu+1); ++j) {
 | 
			
		||||
        for (int k=0; k<Nu+1+j; ++k) {
 | 
			
		||||
@@ -1029,6 +1033,7 @@ if (1){
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; 
 | 
			
		||||
    for (int i=0; i<Nm; ++i) {
 | 
			
		||||
      for (int j=Nm-(Nu+1); j<Nm; ++j) {
 | 
			
		||||
        for (int k=0; k<Nm; ++k) {
 | 
			
		||||
@@ -1036,6 +1041,7 @@ if (1){
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; 
 | 
			
		||||
    
 | 
			
		||||
    //static int ntimes = 2;
 | 
			
		||||
    //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
 | 
			
		||||
@@ -1061,11 +1067,13 @@ if (1){
 | 
			
		||||
        Mtmp(j,i) = conj(Mtmp(i,j));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; 
 | 
			
		||||
 | 
			
		||||
    for (int i=0; i<Nm; ++i) {
 | 
			
		||||
      Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; 
 | 
			
		||||
    M = Mtmp;
 | 
			
		||||
 | 
			
		||||
    //M = Q.adjoint()*(M*Q);
 | 
			
		||||
@@ -1077,7 +1085,7 @@ if (1){
 | 
			
		||||
    //  }
 | 
			
		||||
    //}
 | 
			
		||||
    
 | 
			
		||||
    //Glog << "shiftedQRDecompEigen() end" << endl; 
 | 
			
		||||
    Glog << "shiftedQRDecompEigen() end" <<std::endl; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void exampleQRDecompEigen(void)
 | 
			
		||||
 
 | 
			
		||||
@@ -499,6 +499,87 @@ namespace Grid {
 | 
			
		||||
      }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Site diagonal is identity, left preconditioned by Mee^inv
 | 
			
		||||
  // ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv  ) phi =  Mee_inv eta
 | 
			
		||||
  //
 | 
			
		||||
  // Solve:
 | 
			
		||||
  // ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag  Mee_inv eta
 | 
			
		||||
  //
 | 
			
		||||
  // Old notation e<->o
 | 
			
		||||
  //
 | 
			
		||||
  // Left precon by Moo^-1
 | 
			
		||||
  //  b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o =  [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1}  eta_o
 | 
			
		||||
  //                                   eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
 | 
			
		||||
 | 
			
		||||
    /////////////////////////////////////////////////////
 | 
			
		||||
    // Wrap the usual normal equations Schur trick
 | 
			
		||||
    /////////////////////////////////////////////////////
 | 
			
		||||
  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
 | 
			
		||||
      const bool _solnAsInitGuess = false)  
 | 
			
		||||
    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
 | 
			
		||||
 | 
			
		||||
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
 | 
			
		||||
    {
 | 
			
		||||
      GridBase *grid = _Matrix.RedBlackGrid();
 | 
			
		||||
      GridBase *fgrid= _Matrix.Grid();
 | 
			
		||||
 | 
			
		||||
      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
 | 
			
		||||
      
 | 
			
		||||
      Field   tmp(grid);
 | 
			
		||||
      Field  Mtmp(grid);
 | 
			
		||||
 | 
			
		||||
      pickCheckerboard(Even,src_e,src);
 | 
			
		||||
      pickCheckerboard(Odd ,src_o,src);
 | 
			
		||||
    
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
 | 
			
		||||
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
 | 
			
		||||
      Mtmp=src_o-Mtmp;                 
 | 
			
		||||
      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);     
 | 
			
		||||
      
 | 
			
		||||
      // get the right MpcDag
 | 
			
		||||
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
 | 
			
		||||
    {
 | 
			
		||||
      GridBase *grid = _Matrix.RedBlackGrid();
 | 
			
		||||
      GridBase *fgrid= _Matrix.Grid();
 | 
			
		||||
 | 
			
		||||
      Field   tmp(grid);
 | 
			
		||||
      Field   sol_e(grid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////
 | 
			
		||||
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 | 
			
		||||
      ///////////////////////////////////////////////////
 | 
			
		||||
      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even);
 | 
			
		||||
      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even);
 | 
			
		||||
      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even);
 | 
			
		||||
     
 | 
			
		||||
      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even);
 | 
			
		||||
      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd );
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
 | 
			
		||||
    {
 | 
			
		||||
      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
 | 
			
		||||
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
 | 
			
		||||
    };
 | 
			
		||||
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
 | 
			
		||||
    {
 | 
			
		||||
      SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
 | 
			
		||||
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Site diagonal is identity, right preconditioned by Mee^inv
 | 
			
		||||
  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
 | 
			
		||||
 
 | 
			
		||||
@@ -54,6 +54,9 @@ public:
 | 
			
		||||
    size_type bytes = __n*sizeof(_Tp);
 | 
			
		||||
    profilerAllocate(bytes);
 | 
			
		||||
    _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
 | 
			
		||||
    if ( (_Tp*)ptr == (_Tp *) NULL ) {
 | 
			
		||||
      printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
 | 
			
		||||
    }
 | 
			
		||||
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
@@ -100,6 +103,9 @@ public:
 | 
			
		||||
    size_type bytes = __n*sizeof(_Tp);
 | 
			
		||||
    profilerAllocate(bytes);
 | 
			
		||||
    _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
 | 
			
		||||
    if ( (_Tp*)ptr == (_Tp *) NULL ) {
 | 
			
		||||
      printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
 | 
			
		||||
    }
 | 
			
		||||
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
@@ -145,6 +151,9 @@ public:
 | 
			
		||||
    size_type bytes = __n*sizeof(_Tp);
 | 
			
		||||
    profilerAllocate(bytes);
 | 
			
		||||
    _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
 | 
			
		||||
    if ( (_Tp*)ptr == (_Tp *) NULL ) {
 | 
			
		||||
      printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
 | 
			
		||||
    }
 | 
			
		||||
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
uint64_t total_shared;
 | 
			
		||||
uint64_t total_device;
 | 
			
		||||
uint64_t total_host;;
 | 
			
		||||
 | 
			
		||||
#if defined(__has_feature)
 | 
			
		||||
#if __has_feature(leak_sanitizer)
 | 
			
		||||
#define ASAN_LEAK_CHECK
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef ASAN_LEAK_CHECK
 | 
			
		||||
#include <sanitizer/asan_interface.h>
 | 
			
		||||
#include <sanitizer/common_interface_defs.h>
 | 
			
		||||
#include <sanitizer/lsan_interface.h>
 | 
			
		||||
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
 | 
			
		||||
#else
 | 
			
		||||
#define LEAK_CHECK(A) { }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
void MemoryManager::DisplayMallinfo(void)
 | 
			
		||||
{
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
  struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
 | 
			
		||||
  
 | 
			
		||||
  mi = mallinfo();
 | 
			
		||||
 | 
			
		||||
  std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl;
 | 
			
		||||
  std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
  LEAK_CHECK();
 | 
			
		||||
 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void MemoryManager::PrintBytes(void)
 | 
			
		||||
{
 | 
			
		||||
  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
 | 
			
		||||
@@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  cuda_mem();
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
  DisplayMallinfo();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
 | 
			
		||||
 
 | 
			
		||||
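The comment in DisplayMallinfo above notes that mallinfo2 would be preferable but that glibc support is not uniform. A hedged sketch of a compile-time fallback (the helper name heapInUseBytes is hypothetical and not part of this commit):

#include <cstddef>
#ifdef __linux__
#include <malloc.h>                       // mallinfo / mallinfo2
#endif

// Hypothetical helper: prefer mallinfo2() (size_t counters, glibc >= 2.33),
// otherwise fall back to the legacy mallinfo(), whose 'int' counters can wrap.
static inline size_t heapInUseBytes(void)
{
#if defined(__linux__) && defined(__GLIBC_PREREQ)
#  if __GLIBC_PREREQ(2,33)
  struct mallinfo2 mi = mallinfo2();
#  else
  struct mallinfo  mi = mallinfo();
#  endif
  return (size_t) mi.uordblks;            // total allocated (in use) bytes
#else
  return 0;                               // nothing comparable to report
#endif
}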
@@ -211,6 +211,7 @@ private:
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 public:
 | 
			
		||||
  static void DisplayMallinfo(void);
 | 
			
		||||
  static void NotifyDeletion(void * CpuPtr);
 | 
			
		||||
  static void Print(void);
 | 
			
		||||
  static void PrintAll(void);
 | 
			
		||||
 
 | 
			
		||||
@@ -91,6 +91,7 @@ public:
 | 
			
		||||
  ////////////////////////////////////////////////////////////////
 | 
			
		||||
  virtual int CheckerBoarded(int dim)=0;
 | 
			
		||||
  virtual int CheckerBoard(const Coordinate &site)=0;
 | 
			
		||||
  virtual int CheckerDim(void){ return 0; };
 | 
			
		||||
  virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
 | 
			
		||||
  virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
 | 
			
		||||
  virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
 | 
			
		||||
 
 | 
			
		||||
@@ -60,6 +60,7 @@ public:
 | 
			
		||||
  int              _checker_dim;
 | 
			
		||||
  std::vector<int> _checker_board;
 | 
			
		||||
 | 
			
		||||
  virtual int CheckerDim(void){ return _checker_dim; };
 | 
			
		||||
  virtual int CheckerBoarded(int dim){
 | 
			
		||||
    if( dim==_checker_dim) return 1;
 | 
			
		||||
    else return 0;
 | 
			
		||||
 
 | 
			
		||||
@@ -236,11 +236,18 @@ public:
 | 
			
		||||
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
 | 
			
		||||
    vobj vtmp;
 | 
			
		||||
    vtmp = r;
 | 
			
		||||
#if 1
 | 
			
		||||
    auto me  = View(CpuWrite);
 | 
			
		||||
    thread_for(ss,me.size(),{
 | 
			
		||||
       me[ss]= r;
 | 
			
		||||
      });
 | 
			
		||||
#else    
 | 
			
		||||
    auto me  = View(AcceleratorWrite);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
	auto stmp=coalescedRead(vtmp);
 | 
			
		||||
	coalescedWrite(me[ss],stmp);
 | 
			
		||||
    });
 | 
			
		||||
#endif    
 | 
			
		||||
    me.ViewClose();
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 | 
			
		||||
  const uint64_t sites = grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  // Might make all code paths go this way.
 | 
			
		||||
#if 0
 | 
			
		||||
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  {
 | 
			
		||||
    autoView( left_v , left, AcceleratorRead);
 | 
			
		||||
    autoView( right_v,right, AcceleratorRead);
 | 
			
		||||
    // This code could read coalesce
 | 
			
		||||
    // GPU - SIMT lane compliance...
 | 
			
		||||
    accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
	auto x_l = left_v(ss);
 | 
			
		||||
	auto y_l = right_v(ss);
 | 
			
		||||
	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  deviceVector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
    
 | 
			
		||||
  {
 | 
			
		||||
@@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 | 
			
		||||
	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  // This is in single precision and fails some tests
 | 
			
		||||
  auto anrm = sumD(inner_tmp_v,sites);  
 | 
			
		||||
  nrm = anrm;
 | 
			
		||||
@@ -373,7 +356,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 | 
			
		||||
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 | 
			
		||||
#else
 | 
			
		||||
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  deviceVector<inner_t> inner_tmp;
 | 
			
		||||
  inner_tmp.resize(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
 
 | 
			
		||||
@@ -9,14 +9,18 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_objectD sobjD;
 | 
			
		||||
  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
 | 
			
		||||
  static Vector<sobj> mysum;
 | 
			
		||||
  mysum.resize(1);
 | 
			
		||||
  sobj *mysum_p = & mysum[0];
 | 
			
		||||
  sobj identity; zeroit(identity);
 | 
			
		||||
  mysum[0] = identity;
 | 
			
		||||
  sobj ret ; 
 | 
			
		||||
 | 
			
		||||
  Integer nsimd= vobj::Nsimd();
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
 | 
			
		||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
 | 
			
		||||
    auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
 | 
			
		||||
     cgh.parallel_for(cl::sycl::range<1>{osites},
 | 
			
		||||
		      Reduction,
 | 
			
		||||
		      [=] (cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
@@ -26,7 +30,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
 | 
			
		||||
   });
 | 
			
		||||
  theGridAccelerator->wait();
 | 
			
		||||
  ret = mysum[0];
 | 
			
		||||
  free(mysum,*theGridAccelerator);
 | 
			
		||||
  //  free(mysum,*theGridAccelerator);
 | 
			
		||||
  sobjD dret; convertType(dret,ret);
 | 
			
		||||
  return dret;
 | 
			
		||||
}
 | 
			
		||||
@@ -73,19 +77,23 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
 | 
			
		||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
 | 
			
		||||
{
 | 
			
		||||
  Word xorResult; xorResult = 0;
 | 
			
		||||
  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
 | 
			
		||||
  static Vector<Word> d_sum;
 | 
			
		||||
  d_sum.resize(1);
 | 
			
		||||
  Word *d_sum_p=&d_sum[0];
 | 
			
		||||
  Word identity;  identity=0;
 | 
			
		||||
  d_sum[0] = identity;
 | 
			
		||||
  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
 | 
			
		||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
 | 
			
		||||
    auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
 | 
			
		||||
     cgh.parallel_for(cl::sycl::range<1>{L},
 | 
			
		||||
		      Reduction,
 | 
			
		||||
		      [=] (cl::sycl::id<1> index, auto &sum) {
 | 
			
		||||
	 sum ^=vec[index];
 | 
			
		||||
	 sum^=vec[index];
 | 
			
		||||
     });
 | 
			
		||||
   });
 | 
			
		||||
  theGridAccelerator->wait();
 | 
			
		||||
  Word ret = d_sum[0];
 | 
			
		||||
  free(d_sum,*theGridAccelerator);
 | 
			
		||||
  //  free(d_sum,*theGridAccelerator);
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
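Both hunks above apply the same pattern: the malloc_shared scratch cell becomes a persistent buffer and the reduction is constructed with the initialize_to_identity property, so the accumulator no longer needs to be freed after every call or pre-seeded by hand. A minimal standalone sketch of that pattern, written against the SYCL 2020 sycl:: namespace rather than the cl::sycl:: alias used here (queue, data and sum pointers are placeholders, not Grid code):

#include <sycl/sycl.hpp>
#include <functional>
#include <cstddef>

// Reduce 'osites' doubles into *sum_p (USM memory readable on the host);
// the runtime initialises the accumulator to the identity of std::plus<>.
double reduce_sum(sycl::queue &q, const double *data, size_t osites, double *sum_p)
{
  const sycl::property_list props { sycl::property::reduction::initialize_to_identity() };
  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(sum_p, 0.0, std::plus<>(), props);
    cgh.parallel_for(sycl::range<1>{osites}, red,
                     [=](sycl::id<1> i, auto &sum) { sum += data[i]; });
  });
  q.wait();
  return *sum_p;
}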
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
#include <type_traits>
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_CUDA)
 | 
			
		||||
 | 
			
		||||
#include <cub/cub.cuh>
 | 
			
		||||
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_SYCL)
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  size_t subvol_size = e1*e2;
 | 
			
		||||
 | 
			
		||||
  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
 | 
			
		||||
  vobj vobj_zero;
 | 
			
		||||
  zeroit(vobj_zero);
 | 
			
		||||
  for (int r = 0; r<rd; r++) { 
 | 
			
		||||
    mysum[r] = vobj_zero; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  commVector<vobj> reduction_buffer(rd*subvol_size);    
 | 
			
		||||
 | 
			
		||||
  auto rb_p = &reduction_buffer[0];
 | 
			
		||||
 | 
			
		||||
  // autoView(Data_v, Data, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  //prepare reduction buffer 
 | 
			
		||||
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
 | 
			
		||||
  
 | 
			
		||||
      int n = s / e2;
 | 
			
		||||
      int b = s % e2;
 | 
			
		||||
      int so=r*ostride; // base offset for start of plane 
 | 
			
		||||
      int ss= so+n*stride+b;
 | 
			
		||||
 | 
			
		||||
      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
 | 
			
		||||
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
 | 
			
		||||
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 | 
			
		||||
          Reduction,
 | 
			
		||||
          [=](cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
              auto s = item[0];
 | 
			
		||||
              sum += rb_p[r*subvol_size+s];
 | 
			
		||||
          });
 | 
			
		||||
      });
 | 
			
		||||
      
 | 
			
		||||
     
 | 
			
		||||
  }
 | 
			
		||||
  theGridAccelerator->wait();
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
    lvSum[r] = mysum[r];
 | 
			
		||||
  }
 | 
			
		||||
  free(mysum,*theGridAccelerator);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
 | 
			
		||||
  typedef typename vobj::vector_type vector;
 | 
			
		||||
  const int words = sizeof(vobj)/sizeof(vector);
 | 
			
		||||
  const int osites = rd*e1*e2;
 | 
			
		||||
@@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 | 
			
		||||
	    buf[ss] = dat[ss*words+w];
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
 | 
			
		||||
      
 | 
			
		||||
    #if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
      sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
 | 
			
		||||
    #elif defined(GRID_SYCL)
 | 
			
		||||
      sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    for (int r = 0; r < rd; r++) {
 | 
			
		||||
      lvSum_ptr[w+words*r]=lvSum_small[r];
 | 
			
		||||
    }
 | 
			
		||||
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
 | 
			
		||||
  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
 | 
			
		||||
    if constexpr (sizeof(vobj) <= 256) { 
 | 
			
		||||
      sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
 | 
			
		||||
      #if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
        sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
      #elif defined (GRID_SYCL)
 | 
			
		||||
        sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
      #endif
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
      sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_SYCL)
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  size_t subvol_size = e1*e2;
 | 
			
		||||
 | 
			
		||||
  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
 | 
			
		||||
  vobj vobj_zero;
 | 
			
		||||
  zeroit(vobj_zero);
 | 
			
		||||
    
 | 
			
		||||
  commVector<vobj> reduction_buffer(rd*subvol_size);    
 | 
			
		||||
 | 
			
		||||
  auto rb_p = &reduction_buffer[0];
 | 
			
		||||
 | 
			
		||||
  autoView(Data_v, Data, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  //prepare reduction buffer 
 | 
			
		||||
  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
 | 
			
		||||
  
 | 
			
		||||
      int n = s / e2;
 | 
			
		||||
      int b = s % e2;
 | 
			
		||||
      int so=r*ostride; // base offset for start of plane 
 | 
			
		||||
      int ss= so+n*stride+b;
 | 
			
		||||
 | 
			
		||||
      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
 | 
			
		||||
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
 | 
			
		||||
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
 | 
			
		||||
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 | 
			
		||||
          Reduction,
 | 
			
		||||
          [=](cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
              auto s = item[0];
 | 
			
		||||
              sum += rb_p[r*subvol_size+s];
 | 
			
		||||
          });
 | 
			
		||||
      });
 | 
			
		||||
      theGridAccelerator->wait();
 | 
			
		||||
      lvSum[r] = mysum[0];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  free(mysum,*theGridAccelerator);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 | 
			
		||||
{
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 | 
			
		||||
{
 | 
			
		||||
  #if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
 | 
			
		||||
  
 | 
			
		||||
  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
  
 | 
			
		||||
  #elif defined(GRID_SYCL)
 | 
			
		||||
  
 | 
			
		||||
  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
  
 | 
			
		||||
  #else
 | 
			
		||||
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 | 
			
		||||
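For context, the routines above are the backend of Grid's sliceSum, which reduces a field over every hyperplane orthogonal to one direction. An illustrative use (field set up elsewhere), printing one number per time slice:

#include <Grid/Grid.h>
using namespace Grid;

// Sum a complex field over each time slice -- the operation the reduction
// kernels above implement -- and print one value per slice.
void printTimeSlices(const LatticeComplex &field)
{
  std::vector<TComplex> Ct;                // one scalar per t slice
  sliceSum(field, Ct, Tdir);               // sum over x,y,z for every t
  for (int t = 0; t < (int)Ct.size(); t++)
    std::cout << t << " " << TensorRemove(Ct[t]) << std::endl;
}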
 
 | 
			
		||||
@@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
    assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); 
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// remove and insert a half checkerboard
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
 | 
			
		||||
{
 | 
			
		||||
  half.Checkerboard() = cb;
 | 
			
		||||
 | 
			
		||||
  autoView( half_v, half, CpuWrite);
 | 
			
		||||
  autoView( full_v, full, CpuRead);
 | 
			
		||||
  thread_for(ss, full.Grid()->oSites(),{
 | 
			
		||||
    int cbos;
 | 
			
		||||
    Coordinate coor;
 | 
			
		||||
    full.Grid()->oCoorFromOindex(coor,ss);
 | 
			
		||||
    cbos=half.Grid()->CheckerBoard(coor);
 | 
			
		||||
 | 
			
		||||
    if (cbos==cb) {
 | 
			
		||||
      int ssh=half.Grid()->oIndex(coor);
 | 
			
		||||
      half_v[ssh] = full_v[ss];
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
  acceleratorPickCheckerboard(cb,half,full);
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
 | 
			
		||||
{
 | 
			
		||||
  int cb = half.Checkerboard();
 | 
			
		||||
  autoView( half_v , half, CpuRead);
 | 
			
		||||
  autoView( full_v , full, CpuWrite);
 | 
			
		||||
  thread_for(ss,full.Grid()->oSites(),{
 | 
			
		||||
 | 
			
		||||
    Coordinate coor;
 | 
			
		||||
    int cbos;
 | 
			
		||||
 | 
			
		||||
    full.Grid()->oCoorFromOindex(coor,ss);
 | 
			
		||||
    cbos=half.Grid()->CheckerBoard(coor);
 | 
			
		||||
      
 | 
			
		||||
    if (cbos==cb) {
 | 
			
		||||
      int ssh=half.Grid()->oIndex(coor);
 | 
			
		||||
      full_v[ss]=half_v[ssh];
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
  acceleratorSetCheckerboard(full,half);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 | 
			
		||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
 | 
			
		||||
{
 | 
			
		||||
  half.Checkerboard() = cb;
 | 
			
		||||
  autoView(half_v, half, AcceleratorWrite);
 | 
			
		||||
@@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
 | 
			
		||||
  unsigned long ndim_half          = half.Grid()->_ndimension;
 | 
			
		||||
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
 | 
			
		||||
  Coordinate ostride_half          = half.Grid()->_ostride;
 | 
			
		||||
  int checker_dim_half             = half.Grid()->CheckerDim();
 | 
			
		||||
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
 | 
			
		||||
    
 | 
			
		||||
    Coordinate coor;
 | 
			
		||||
@@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 | 
			
		||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
 | 
			
		||||
{
 | 
			
		||||
  int cb = half.Checkerboard();
 | 
			
		||||
  autoView(half_v , half, AcceleratorRead);
 | 
			
		||||
@@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
 | 
			
		||||
  unsigned long ndim_half          = half.Grid()->_ndimension;
 | 
			
		||||
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
 | 
			
		||||
  Coordinate ostride_half          = half.Grid()->_ostride;
 | 
			
		||||
  int checker_dim_half             = half.Grid()->CheckerDim();
 | 
			
		||||
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
 | 
			
		||||
 | 
			
		||||
    Coordinate coor;
 | 
			
		||||
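Since pickCheckerboard and setCheckerboard now simply forward to the accelerator implementations above, call sites are unchanged. A self-contained sketch of the usual round trip (field type and seeds are illustrative only):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);

  LatticeComplex full(FGrid), recon(FGrid);
  LatticeComplex half(FrbGrid);

  GridParallelRNG RNG(FGrid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  random(RNG, full);
  recon = Zero();

  pickCheckerboard(Even, half, full);   // even sites of 'full' -> 'half' (accelerator kernel)
  setCheckerboard(recon, half);         // scatter them back into the even sites of 'recon'

  Grid_finalize();
  return 0;
}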
 
 | 
			
		||||
@@ -90,6 +90,7 @@ public:
 | 
			
		||||
        exit(1);
 | 
			
		||||
      }
 | 
			
		||||
      Parameters.StartingType = arg;
 | 
			
		||||
      std::cout <<GridLogMessage << " GenericHMCrunner --StartingType "<<arg<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
 | 
			
		||||
@@ -97,6 +98,7 @@ public:
 | 
			
		||||
      std::vector<int> ivec(0);
 | 
			
		||||
      GridCmdOptionIntVector(arg, ivec);
 | 
			
		||||
      Parameters.StartTrajectory = ivec[0];
 | 
			
		||||
      std::cout <<GridLogMessage << " GenericHMCrunner --StartingTrajectory "<<ivec[0]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
 | 
			
		||||
@@ -104,6 +106,7 @@ public:
 | 
			
		||||
      std::vector<int> ivec(0);
 | 
			
		||||
      GridCmdOptionIntVector(arg, ivec);
 | 
			
		||||
      Parameters.Trajectories = ivec[0];
 | 
			
		||||
      std::cout << GridLogMessage<<" GenericHMCrunner Command Line --Trajectories "<<ivec[0]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
 | 
			
		||||
@@ -111,6 +114,7 @@ public:
 | 
			
		||||
      std::vector<int> ivec(0);
 | 
			
		||||
      GridCmdOptionIntVector(arg, ivec);
 | 
			
		||||
      Parameters.NoMetropolisUntil = ivec[0];
 | 
			
		||||
      std::cout << GridLogMessage<<" GenericHMCrunner --Thermalizations "<<ivec[0]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
 | 
			
		||||
      arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
 | 
			
		||||
 
 | 
			
		||||
@@ -137,9 +137,11 @@ public:
 | 
			
		||||
 | 
			
		||||
      double start_force = usecond();
 | 
			
		||||
 | 
			
		||||
      MemoryManager::Print();
 | 
			
		||||
      as[level].actions.at(a)->deriv_timer_start();
 | 
			
		||||
      as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta
 | 
			
		||||
      as[level].actions.at(a)->deriv_timer_stop();
 | 
			
		||||
      MemoryManager::Print();
 | 
			
		||||
 | 
			
		||||
      auto name = as[level].actions.at(a)->action_name();
 | 
			
		||||
 | 
			
		||||
@@ -246,7 +248,11 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  virtual ~Integrator() {}
 | 
			
		||||
  virtual ~Integrator()
 | 
			
		||||
  {
 | 
			
		||||
    // Pain in the ass to clean up the Level pointers
 | 
			
		||||
    // Guido's design is at fault as per comment above in constructor
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  virtual std::string integrator_name() = 0;
 | 
			
		||||
  
 | 
			
		||||
@@ -460,6 +466,7 @@ public:
 | 
			
		||||
    for (int level = 0; level < as.size(); ++level) {
 | 
			
		||||
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 | 
			
		||||
 | 
			
		||||
	MemoryManager::Print();
 | 
			
		||||
        // get gauge field from the SmearingPolicy and
 | 
			
		||||
        // based on the boolean is_smeared in actionID
 | 
			
		||||
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 | 
			
		||||
@@ -468,6 +475,7 @@ public:
 | 
			
		||||
   	        as[level].actions.at(actionID)->S_timer_stop();
 | 
			
		||||
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
 | 
			
		||||
        H += Hterm;
 | 
			
		||||
	MemoryManager::Print();
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      as[level].apply(S_hireps, Representations, level, H);
 | 
			
		||||
 
 | 
			
		||||
@@ -32,7 +32,9 @@ private:
 | 
			
		||||
  //  Smear_Stout<Gimpl> *StoutSmearing;
 | 
			
		||||
  //  std::vector<GaugeField> SmearedSet;
 | 
			
		||||
  
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
 | 
			
		||||
  std::vector<LatticeLorentzComplex> masks;
 | 
			
		||||
  std::vector<int> cbs;
 | 
			
		||||
 | 
			
		||||
  typedef typename SU3Adjoint::AMatrix AdjMatrix;
 | 
			
		||||
  typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField;
 | 
			
		||||
@@ -147,6 +149,25 @@ private:
 | 
			
		||||
    }
 | 
			
		||||
    pokeLorentz(Fdet, Fdet_pol, nu);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void Compute_MpInvJx_dNxxdSy(int cb,
 | 
			
		||||
			       const GaugeLinkField &PlaqL,
 | 
			
		||||
			       const GaugeLinkField &PlaqR,
 | 
			
		||||
			       AdjMatrixField MpInvJx,
 | 
			
		||||
			       AdjVectorField &Fdet2 )
 | 
			
		||||
  {
 | 
			
		||||
    GaugeLinkField PlaqLeo(UrbGrid);
 | 
			
		||||
    GaugeLinkField PlaqReo(UrbGrid);
 | 
			
		||||
    AdjMatrixField MpInvJxeo(UrbGrid);
 | 
			
		||||
    AdjVectorField Fdet2eo(UrbGrid);
 | 
			
		||||
    pickCheckerboard(cb,PlaqLeo,PlaqL);
 | 
			
		||||
    pickCheckerboard(cb,PlaqReo,PlaqR);
 | 
			
		||||
    pickCheckerboard(cb,MpInvJxeo,MpInvJx);
 | 
			
		||||
    Fdet2eo.Checkerboard()=cb;
 | 
			
		||||
    Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
 | 
			
		||||
    setCheckerboard(Fdet2,Fdet2eo);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
 | 
			
		||||
  {
 | 
			
		||||
    GaugeLinkField UtaU(PlaqL.Grid());
 | 
			
		||||
@@ -278,8 +299,9 @@ public:
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Mask the gauge field
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    int cb = cbs[smr];
 | 
			
		||||
    auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    Umsk = U;
 | 
			
		||||
    ApplyMask(Umsk,smr);
 | 
			
		||||
    Utmp = peekLorentz(Umsk,mu);
 | 
			
		||||
@@ -442,7 +464,7 @@ public:
 | 
			
		||||
    AdjMatrixField MpInvJx_nu(grid);
 | 
			
		||||
    MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
 | 
			
		||||
 | 
			
		||||
    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
 | 
			
		||||
    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
 | 
			
		||||
    Fdet2_mu=FdetV;
 | 
			
		||||
    Fdet1_mu=Zero();
 | 
			
		||||
    
 | 
			
		||||
@@ -499,7 +521,7 @@ public:
 | 
			
		||||
 | 
			
		||||
	time=-usecond();
 | 
			
		||||
	PlaqR=(-1.0)*PlaqR;
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
 | 
			
		||||
	Fdet2_nu = FdetV;
 | 
			
		||||
	time+=usecond();
 | 
			
		||||
	std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
 | 
			
		||||
@@ -520,7 +542,7 @@ public:
 | 
			
		||||
	
 | 
			
		||||
 | 
			
		||||
	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Fdet2_nu = Fdet2_nu+FdetV;
 | 
			
		||||
	
 | 
			
		||||
	///////////////// -ve nu /////////////////
 | 
			
		||||
@@ -539,7 +561,7 @@ public:
 | 
			
		||||
	Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
 | 
			
		||||
 | 
			
		||||
	MpInvJx_nu = Cshift(MpInvJx,nu,1);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Fdet2_nu = Fdet2_nu+FdetV;
 | 
			
		||||
	
 | 
			
		||||
	// x==
 | 
			
		||||
@@ -560,7 +582,7 @@ public:
 | 
			
		||||
 | 
			
		||||
	MpInvJx_nu = Cshift(MpInvJx,mu,-1);
 | 
			
		||||
	MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Fdet2_nu = Fdet2_nu+FdetV;
 | 
			
		||||
 | 
			
		||||
	/////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -589,7 +611,7 @@ public:
 | 
			
		||||
 | 
			
		||||
	MpInvJx_nu = Cshift(MpInvJx,nu,-1);
 | 
			
		||||
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Fdet2_mu = Fdet2_mu+FdetV;
 | 
			
		||||
 | 
			
		||||
	//  __
 | 
			
		||||
@@ -609,7 +631,7 @@ public:
 | 
			
		||||
 | 
			
		||||
	MpInvJx_nu = Cshift(MpInvJx,nu,1);
 | 
			
		||||
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
 | 
			
		||||
	Fdet2_mu = Fdet2_mu+FdetV;
 | 
			
		||||
	
 | 
			
		||||
      }
 | 
			
		||||
@@ -931,6 +953,10 @@ private:
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  /* Standard constructor */
 | 
			
		||||
  virtual ~SmearedConfigurationMasked()
 | 
			
		||||
  {
 | 
			
		||||
    delete UrbGrid;
 | 
			
		||||
  }
 | 
			
		||||
  SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
 | 
			
		||||
    : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
 | 
			
		||||
  {
 | 
			
		||||
@@ -939,7 +965,6 @@ public:
 | 
			
		||||
    // was resized in base class
 | 
			
		||||
    assert(this->SmearedSet.size()==Nsmear);
 | 
			
		||||
    
 | 
			
		||||
    GridRedBlackCartesian * UrbGrid;
 | 
			
		||||
    UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
 | 
			
		||||
    LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
 | 
			
		||||
    LatticeComplex tmp(_UGrid);
 | 
			
		||||
@@ -947,10 +972,11 @@ public:
 | 
			
		||||
    for (unsigned int i = 0; i < this->smearingLevels; ++i) {
 | 
			
		||||
 | 
			
		||||
      masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
 | 
			
		||||
 | 
			
		||||
      int mu= (i/2) %Nd;
 | 
			
		||||
      int cb= (i%2);
 | 
			
		||||
      LatticeComplex tmpcb(UrbGrid);
 | 
			
		||||
 | 
			
		||||
      cbs.push_back(cb);
 | 
			
		||||
	
 | 
			
		||||
      masks[i]=Zero();
 | 
			
		||||
      ////////////////////
 | 
			
		||||
@@ -962,7 +988,6 @@ public:
 | 
			
		||||
      PokeIndex<LorentzIndex>(masks[i],tmp, mu);
 | 
			
		||||
	
 | 
			
		||||
    }
 | 
			
		||||
    delete UrbGrid;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  virtual void smeared_force(GaugeField &SigmaTilde) 
 | 
			
		||||
 
 | 
			
		||||
@@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in,
 | 
			
		||||
  int hNNm1= NNm1/2;
 | 
			
		||||
  RealD sqrt_2 = sqrt(2.0);
 | 
			
		||||
  Complex ci(0.0,1.0);
 | 
			
		||||
  for(int su2Index=0;su2Index<hNNm1;su2Index++){
 | 
			
		||||
    int i1, i2;
 | 
			
		||||
    su2SubGroupIndex(i1, i2, su2Index);
 | 
			
		||||
    int ax = su2Index*2;
 | 
			
		||||
    int ay = su2Index*2+1;
 | 
			
		||||
    accelerator_for(ss,grid->oSites(),1,{
 | 
			
		||||
 | 
			
		||||
  const int nsimd=  Matrix::Nsimd();
 | 
			
		||||
  accelerator_for(ss,grid->oSites(),nsimd,{
 | 
			
		||||
      for(int su2Index=0;su2Index<hNNm1;su2Index++){
 | 
			
		||||
	int i1, i2;
 | 
			
		||||
	su2SubGroupIndex(i1, i2, su2Index);
 | 
			
		||||
	int ax = su2Index*2;
 | 
			
		||||
	int ay = su2Index*2+1;
 | 
			
		||||
	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
 | 
			
		||||
	// trace( Ta x Ci in)
 | 
			
		||||
	// Bet I need to move to real part with mult by -i
 | 
			
		||||
	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
 | 
			
		||||
	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
 | 
			
		||||
      });
 | 
			
		||||
  }
 | 
			
		||||
  for(int diagIndex=0;diagIndex<N-1;diagIndex++){
 | 
			
		||||
    int k = diagIndex + 1; // diagIndex starts from 0
 | 
			
		||||
    int a = NNm1+diagIndex;
 | 
			
		||||
    RealD scale = 1.0/sqrt(2.0*k*(k+1));
 | 
			
		||||
    accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
 | 
			
		||||
	auto tmp = in_v[ss]()()(0,0);
 | 
			
		||||
	coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2))));
 | 
			
		||||
	coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1))));
 | 
			
		||||
      }
 | 
			
		||||
      for(int diagIndex=0;diagIndex<N-1;diagIndex++){
 | 
			
		||||
	int k = diagIndex + 1; // diagIndex starts from 0
 | 
			
		||||
	int a = NNm1+diagIndex;
 | 
			
		||||
	RealD scale = 1.0/sqrt(2.0*k*(k+1));
 | 
			
		||||
	auto tmp = in_v(ss)()()(0,0);
 | 
			
		||||
	for(int i=1;i<k;i++){
 | 
			
		||||
	  tmp=tmp+in_v[ss]()()(i,i);
 | 
			
		||||
	  tmp=tmp+in_v(ss)()()(i,i);
 | 
			
		||||
	}
 | 
			
		||||
	tmp = tmp - in_v[ss]()()(k,k)*k;
 | 
			
		||||
	out_v[ss]()()(a,b) =imag(tmp) * scale;
 | 
			
		||||
      });
 | 
			
		||||
    }
 | 
			
		||||
	tmp = tmp - in_v(ss)()()(k,k)*k;
 | 
			
		||||
	coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale);
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
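The rewrite above follows Grid's SIMT access idiom: inside accelerator_for a view indexed with operator() (here in_v(ss)) returns the per-lane value of a site, and results go back through coalescedWrite. A minimal sketch of the same idiom on a generic field (function name illustrative, not part of the commit):

// Per-site add on the accelerator using the coalesced read/write idiom.
template<class vobj>
void addFields(Lattice<vobj> &out, const Lattice<vobj> &x, const Lattice<vobj> &y)
{
  conformable(x, y);
  out.Checkerboard() = x.Checkerboard();
  autoView(out_v, out, AcceleratorWrite);
  autoView(x_v  , x  , AcceleratorRead);
  autoView(y_v  , y  , AcceleratorRead);
  accelerator_for(ss, x.Grid()->oSites(), vobj::Nsimd(), {
      coalescedWrite(out_v[ss], x_v(ss) + y_v(ss));   // per-lane add, lane write-back
  });
}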
 
 | 
			
		||||
@@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Map a su2 subgroup number to the pair of rows that are non zero
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
 | 
			
		||||
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
 | 
			
		||||
  assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
 | 
			
		||||
 | 
			
		||||
  int spare = su2_index;
 | 
			
		||||
 
 | 
			
		||||
@@ -99,6 +99,8 @@ using std::log;
 | 
			
		||||
using std::exp;
 | 
			
		||||
using std::sin;
 | 
			
		||||
using std::cos;
 | 
			
		||||
using std::asin;
 | 
			
		||||
using std::acos;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
accelerator_inline RealF    conjugate(const RealF  & r){ return r; }
 | 
			
		||||
 
 | 
			
		||||
@@ -460,3 +460,9 @@ void vprefetch(const iMatrix<v, N> &vv) {
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
template<class vec> struct sycl::is_device_copyable<Grid::iScalar<vec> > : public std::true_type {};
 | 
			
		||||
template<class vec,int N> struct sycl::is_device_copyable<Grid::iVector<vec,N> > : public std::true_type {};
 | 
			
		||||
template<class vec,int N> struct sycl::is_device_copyable<Grid::iMatrix<vec,N> > : public std::true_type {};
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -210,8 +210,8 @@ void acceleratorInit(void)
 | 
			
		||||
  cl::sycl::gpu_selector selector;
 | 
			
		||||
  cl::sycl::device selectedDevice { selector };
 | 
			
		||||
  theGridAccelerator = new sycl::queue (selectedDevice);
 | 
			
		||||
  //  theCopyAccelerator = new sycl::queue (selectedDevice);
 | 
			
		||||
  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
 | 
			
		||||
  theCopyAccelerator = new sycl::queue (selectedDevice);
 | 
			
		||||
  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
			
		||||
  zeInit(0);
 | 
			
		||||
 
 | 
			
		||||
@@ -539,12 +539,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
 | 
			
		||||
  acceleratorCopySynchronise();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// CPU Target - No accelerator just thread instead
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
@@ -553,7 +547,6 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
 | 
			
		||||
 | 
			
		||||
#undef GRID_SIMT
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
inline void acceleratorMem(void)
 | 
			
		||||
{
 | 
			
		||||
  /*
 | 
			
		||||
@@ -656,6 +649,12 @@ accelerator_inline void acceleratorFence(void)
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
 | 
			
		||||
  acceleratorCopySynchronise();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class T> void acceleratorPut(T& dev,T&host)
 | 
			
		||||
{
 | 
			
		||||
  acceleratorCopyToDevice(&host,&dev,sizeof(T));
 | 
			
		||||
 
 | 
			
		||||
@@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global)
 | 
			
		||||
}
 | 
			
		||||
void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  if(LoggingMode == LoggingModeNone) return;
 | 
			
		||||
 | 
			
		||||
  if ( ChecksumCommsSend ){
 | 
			
		||||
  uint64_t *ubuf = (uint64_t *)buf;
 | 
			
		||||
  if(LoggingMode == LoggingModeNone) return;
 | 
			
		||||
  
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
 | 
			
		||||
  if(LoggingMode == LoggingModePrint) {
 | 
			
		||||
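The send-side log above folds the buffer with a 64-bit XOR on the device (svm_xor). A host-side reference of the same fold, e.g. for cross-checking the device result, is straightforward:

#include <cstdint>
#include <cstddef>

// Fold a buffer as 64-bit words; any single-bit corruption flips the result.
uint64_t xorChecksum(const void *buf, size_t bytes)
{
  const uint64_t *w = static_cast<const uint64_t *>(buf);
  uint64_t x = 0;
  for (size_t i = 0; i < bytes / sizeof(uint64_t); i++) x ^= w[i];
  return x;
}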
 
HMC/ComputeWilsonFlow.cc (new file, 238 lines)
@@ -0,0 +1,238 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: HMC/ComputeWilsonFlow.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
 | 
			
		||||
Author: Shuhei Yamamoto <syamamoto@bnl.gov>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution
 | 
			
		||||
directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#include <string>
 | 
			
		||||
 | 
			
		||||
namespace Grid{
 | 
			
		||||
  struct WFParameters: Serializable {
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
 | 
			
		||||
            int, steps,
 | 
			
		||||
            double, step_size,
 | 
			
		||||
            int, meas_interval,
 | 
			
		||||
	    double, maxTau, // for the adaptive algorithm
 | 
			
		||||
	    int, meas_interval_density,
 | 
			
		||||
	    std::string, path); 
 | 
			
		||||
       
 | 
			
		||||
 | 
			
		||||
    template <class ReaderClass >
 | 
			
		||||
    WFParameters(Reader<ReaderClass>& Reader){
 | 
			
		||||
      read(Reader, "WilsonFlow", *this);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct ConfParameters: Serializable {
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters,
 | 
			
		||||
	   std::string, conf_path,
 | 
			
		||||
           std::string, conf_prefix,
 | 
			
		||||
	   std::string, conf_smr_prefix,
 | 
			
		||||
           std::string, rng_prefix,
 | 
			
		||||
	   int, StartConfiguration,
 | 
			
		||||
	   int, EndConfiguration,
 | 
			
		||||
           int, Skip);
 | 
			
		||||
  
 | 
			
		||||
    template <class ReaderClass >
 | 
			
		||||
    ConfParameters(Reader<ReaderClass>& Reader){
 | 
			
		||||
      read(Reader, "Configurations", *this);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class T> void writeFile(T& in, std::string const fname){  
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
 | 
			
		||||
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
 | 
			
		||||
  Grid::emptyUserRecord record;
 | 
			
		||||
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
 | 
			
		||||
  WR.open(fname);
 | 
			
		||||
  WR.writeScidacFieldRecord(in,record,0);
 | 
			
		||||
  WR.close();
 | 
			
		||||
#endif
 | 
			
		||||
  // What is the appropriate way to throw error?
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  using namespace Grid;
 | 
			
		||||
  
 | 
			
		||||
  Grid_init(&argc, &argv);
 | 
			
		||||
  GridLogLayout();
 | 
			
		||||
 | 
			
		||||
  auto latt_size   = GridDefaultLatt();
 | 
			
		||||
  auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
 | 
			
		||||
  auto mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian               Grid(latt_size, simd_layout, mpi_layout);
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> seeds({1, 2, 3, 4, 5});
 | 
			
		||||
  GridSerialRNG sRNG;
 | 
			
		||||
  GridParallelRNG pRNG(&Grid);
 | 
			
		||||
  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu(&Grid), Uflow(&Grid);
 | 
			
		||||
  
 | 
			
		||||
  typedef Grid::XmlReader       Serialiser;
 | 
			
		||||
  Serialiser Reader("input.xml", false, "root");
 | 
			
		||||
  WFParameters WFPar(Reader);
 | 
			
		||||
  ConfParameters CPar(Reader);
 | 
			
		||||
  CheckpointerParameters CPPar(CPar.conf_path+CPar.conf_prefix, CPar.conf_path+CPar.conf_smr_prefix, CPar.conf_path+CPar.rng_prefix);
 | 
			
		||||
  NerscHmcCheckpointer<PeriodicGimplR> CPNersc(CPPar);
 | 
			
		||||
 | 
			
		||||
  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
 | 
			
		||||
 | 
			
		||||
  CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
 | 
			
		||||
 | 
			
		||||
  std::cout << std::setprecision(15);
 | 
			
		||||
  std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  std::string file_pre  = WFPar.path;
 | 
			
		||||
  std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
 | 
			
		||||
 | 
			
		||||
  WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
 | 
			
		||||
  WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
 | 
			
		||||
    
 | 
			
		||||
    typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
 | 
			
		||||
    typedef typename PeriodicGimplR::ComplexField ComplexField;
 | 
			
		||||
    
 | 
			
		||||
    assert(Nd == 4);
 | 
			
		||||
 | 
			
		||||
    // NOTE:
 | 
			
		||||
    // Ideally, turn the following into methods of the appropriate class
 | 
			
		||||
    /////////////   Compute Energy Density via Clover Leaf    /////////////////////////////////////////////////
 | 
			
		||||
    ///// Taken from qcd/smearing/WilsonFlow.h
 | 
			
		||||
    //         For plq, use static sitePlaquette from class WilsonLoops in Grid/qcd/utils/WilsonLoops.h and divide it by #faces=(1.0 * Nd * (Nd - 1)) / 2.0, ncol=3
 | 
			
		||||
    //E = 1/2 tr( F_munu F_munu )
 | 
			
		||||
    //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
 | 
			
		||||
    //F_01 F_02 F_03   F_12 F_13  F_23
 | 
			
		||||
    GaugeMat F(U.Grid());
 | 
			
		||||
    //LatticeComplexD R(U.Grid());
 | 
			
		||||
    ComplexField R(U.Grid());
 | 
			
		||||
    R = Zero();
 | 
			
		||||
  
 | 
			
		||||
    for(int mu=0;mu<3;mu++){
 | 
			
		||||
      for(int nu=mu+1;nu<4;nu++){
 | 
			
		||||
	WilsonLoops<PeriodicGimplR>::FieldStrength(F, U, mu, nu);
 | 
			
		||||
	R = R + trace(F*F);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    R = (-1.0) * R;
 | 
			
		||||
    
 | 
			
		||||
    //// Taken from qcd/utils/WilsonLoops.h
 | 
			
		||||
    
 | 
			
		||||
    // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
 | 
			
		||||
    GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid());
 | 
			
		||||
    WilsonLoops<PeriodicGimplR>::FieldStrength(Bx, U, Ydir, Zdir);
 | 
			
		||||
    WilsonLoops<PeriodicGimplR>::FieldStrength(By, U, Zdir, Xdir);
 | 
			
		||||
    WilsonLoops<PeriodicGimplR>::FieldStrength(Bz, U, Xdir, Ydir);
 | 
			
		||||
 | 
			
		||||
    // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z)
 | 
			
		||||
    GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid());
 | 
			
		||||
    WilsonLoops<PeriodicGimplR>::FieldStrength(Ex, U, Tdir, Xdir);
 | 
			
		||||
    WilsonLoops<PeriodicGimplR>::FieldStrength(Ey, U, Tdir, Ydir);
 | 
			
		||||
    WilsonLoops<PeriodicGimplR>::FieldStrength(Ez, U, Tdir, Zdir);
 | 
			
		||||
 | 
			
		||||
    double coeff = 8.0/(32.0*M_PI*M_PI);
 | 
			
		||||
    ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
 | 
			
		||||
    //ComplexField qfield Plq(U.Grid());
 | 
			
		||||
    //WilsonLoops<PeriodicGimplR>::sitePlaquette(Plq, U);
 | 
			
		||||
    //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
 | 
			
		||||
    //Plq = coeff * Plq;
 | 
			
		||||
 | 
			
		||||
    int tau = std::round(t);
 | 
			
		||||
    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
 | 
			
		||||
    writeFile(R,efile);
 | 
			
		||||
    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
 | 
			
		||||
    writeFile(qfield,tfile);
 | 
			
		||||
 | 
			
		||||
    RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
 | 
			
		||||
    RealD T = real( sum(qfield) );
 | 
			
		||||
    Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
 | 
			
		||||
    RealD E0 = real(peekSite(R,scoor));
 | 
			
		||||
    RealD T0 = real(peekSite(qfield,scoor));
 | 
			
		||||
    std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  "
 | 
			
		||||
	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
 | 
			
		||||
    
 | 
			
		||||
  });
 | 
			
		||||
  
 | 
			
		||||
  int t=WFPar.maxTau;
 | 
			
		||||
  WF.smear(Uflow, Umu);
 | 
			
		||||
 | 
			
		||||
  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
 | 
			
		||||
  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
 | 
			
		||||
  RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // t
 | 
			
		||||
  RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow);
 | 
			
		||||
  std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<< GridLogMessage << " Admissibility check:\n";
 | 
			
		||||
  const double sp_adm = 0.067;                // admissible threshold
 | 
			
		||||
  const double pl_adm = 1.0-sp_adm/Nc;
 | 
			
		||||
  std::cout << GridLogMessage << "   (pl_adm =" << pl_adm << ")\n";
 | 
			
		||||
 | 
			
		||||
  // Need min and reduce min for this function
 | 
			
		||||
  //double sp_max = NC_*(1.0-stpl.plaq_min(U,pl_adm));
 | 
			
		||||
  double sp_ave = Nc*(1.0-WFlow_plaq);
 | 
			
		||||
 | 
			
		||||
  //std::cout<< GridLogMessage << "   sp_max = "        << sp_max <<"\n";
 | 
			
		||||
  std::cout<< GridLogMessage << "   sp_ave = "        << sp_ave <<"\n";
 | 
			
		||||
  std::cout<< GridLogMessage << "   (sp_admissible = "<< sp_adm <<")\n";
 | 
			
		||||
  //std::cout<< GridLogMessage << "   sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n";
 | 
			
		||||
  std::cout<< GridLogMessage << "   sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n";
 | 
			
		||||
  }
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}  // main
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
Input file example
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
JSON
 | 
			
		||||
 | 
			
		||||
{
 | 
			
		||||
    "WilsonFlow":{
 | 
			
		||||
	"steps": 200,
 | 
			
		||||
	"step_size": 0.01,
 | 
			
		||||
	"meas_interval": 50,
 | 
			
		||||
  "maxTau": 2.0
 | 
			
		||||
    },
 | 
			
		||||
    "Configurations":{
 | 
			
		||||
	"conf_prefix": "ckpoint_lat",
 | 
			
		||||
	"rng_prefix": "ckpoint_rng",
 | 
			
		||||
	"StartConfiguration": 3000,
 | 
			
		||||
	"EndConfiguration": 3000,
 | 
			
		||||
	"Skip": 5
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
*/
 | 
			
		||||
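For orientation, the two densities the measurement callback above writes out are the standard clover-leaf observables (up to the sign conventions of FieldStrength):

% Energy density and topological charge density from the clover-leaf F_{mu nu},
% as accumulated in R and qfield in the callback above.
E(x) = \tfrac{1}{2}\,\mathrm{tr}\,F_{\mu\nu}(x)F_{\mu\nu}(x)
     = \sum_{\mu<\nu}\mathrm{tr}\,F_{\mu\nu}(x)F_{\mu\nu}(x) ,
\qquad
q(x) = \frac{1}{32\pi^{2}}\,\epsilon_{\mu\nu\rho\sigma}\,
       \mathrm{tr}\,F_{\mu\nu}(x)F_{\rho\sigma}(x)
     = \frac{8}{32\pi^{2}}\,\mathrm{tr}\!\left(B_{x}E_{x}+B_{y}E_{y}+B_{z}E_{z}\right).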
@@ -58,7 +58,7 @@ int main(int argc, char **argv) {
 | 
			
		||||
  HMCparameters HMCparams;
 | 
			
		||||
  HMCparams.StartTrajectory  = 0;
 | 
			
		||||
  HMCparams.Trajectories     = 200;
 | 
			
		||||
  HMCparams.NoMetropolisUntil=  20;
 | 
			
		||||
  HMCparams.NoMetropolisUntil=  0;
 | 
			
		||||
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
 | 
			
		||||
  HMCparams.StartingType     =std::string("ColdStart");
 | 
			
		||||
  HMCparams.MD = MD;
 | 
			
		||||
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
 | 
			
		||||
  CheckpointerParameters CPparams;
 | 
			
		||||
  CPparams.config_prefix = "ckpoint_EODWF_lat";
 | 
			
		||||
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
 | 
			
		||||
  CPparams.saveInterval  = 10;
 | 
			
		||||
  CPparams.saveInterval  = 1;
 | 
			
		||||
  CPparams.format        = "IEEE64BIG";
 | 
			
		||||
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
 | 
			
		||||
 | 
			
		||||
@@ -186,6 +186,8 @@ int main(int argc, char **argv) {
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////
 | 
			
		||||
  // HMC parameters are serialisable
 | 
			
		||||
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file                                                                     
 | 
			
		||||
  TheHMC.initializeGaugeFieldAndRNGs(U);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
 | 
			
		||||
  TheHMC.Run();  // no smearing
 | 
			
		||||
 
 | 
			
		||||
@@ -30,11 +30,13 @@ directory
 | 
			
		||||
#include <string>
 | 
			
		||||
 | 
			
		||||
template <class T> void readFile(T& out, std::string const fname){
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  Grid::emptyUserRecord record;
 | 
			
		||||
  Grid::ScidacReader RD;
 | 
			
		||||
  RD.open(fname);
 | 
			
		||||
  RD.readScidacFieldRecord(out,record);
 | 
			
		||||
  RD.close();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -31,11 +31,13 @@ directory
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
template <class T> void writeFile(T& out, std::string const fname){
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  emptyUserRecord record;
 | 
			
		||||
  ScidacWriter WR(out.Grid()->IsBoss());
 | 
			
		||||
  WR.open(fname);
 | 
			
		||||
  WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
 | 
			
		||||
  WR.close();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
 
 | 
			
		||||
@@ -261,23 +261,25 @@ public:
 | 
			
		||||
    fprintf(FP,"\n\n");
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  template<class CComplex>
 | 
			
		||||
  static void BLAS(void)
 | 
			
		||||
  {
 | 
			
		||||
    //int nbasis, int nrhs, int coarseVol
 | 
			
		||||
    int  basis[] = { 16,32,64 };
 | 
			
		||||
    int  rhs[]   = { 8,16,32 };
 | 
			
		||||
    int  vol  = 4*4*4*4;
 | 
			
		||||
    int  rhs[]   = { 8,12,16 };
 | 
			
		||||
    int  vol  = 8*8*8*8;
 | 
			
		||||
    int  blk  = 4*4*4*4;
 | 
			
		||||
 | 
			
		||||
    GridBLAS blas;
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    int fpbits = sizeof(CComplex)*4;
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "= batched GEMM fp"<<fpbits<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
    fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
 | 
			
		||||
    fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank fp%d\n",fpbits);
 | 
			
		||||
 | 
			
		||||
    for(int b=0;b<3;b++){
 | 
			
		||||
    for(int r=0;r<3;r++){
 | 
			
@@ -285,7 +287,7 @@ public:
      int N=rhs[r];
      int K=basis[b];
      int BATCH=vol;
      double p=blas.benchmark(M,N,K,BATCH);
      double p=blas.benchmark<CComplex>(M,N,K,BATCH);

      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
@@ -299,9 +301,9 @@ public:
    for(int r=0;r<3;r++){
      int M=basis[b];
      int N=rhs[r];
      int K=vol;
      int K=blk;
      int BATCH=vol;
      double p=blas.benchmark(M,N,K,BATCH);
      double p=blas.benchmark<CComplex>(M,N,K,BATCH);

      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3)
@@ -313,10 +315,10 @@ public:
    for(int b=0;b<3;b++){
    for(int r=0;r<3;r++){
      int M=rhs[r];
      int N=vol;
      int N=blk;
      int K=basis[b];
      int BATCH=vol;
      double p=blas.benchmark(M,N,K,BATCH);
      double p=blas.benchmark<CComplex>(M,N,K,BATCH);

      fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
      std::cout<<GridLogMessage<<std::setprecision(3)
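The three loops above sweep the GEMM shapes used in multi-RHS coarse-grid work, batching over the coarse volume. As a hedged reference for the reported rate (this is not GridBLAS::benchmark itself), a complex M x N x K GEMM costs roughly 8*M*N*K real floating-point operations, so the bookkeeping looks like:

// Hedged sketch of the Gflop/s accounting for a batch of BATCH complex
// M x N x K GEMMs, counting 8*M*N*K flops per GEMM (a complex multiply-add
// is 4 real multiplies + 4 real adds).
double GemmGflops(int M, int N, int K, int BATCH, double seconds)
{
  double flops = 8.0 * (double)M * (double)N * (double)K * (double)BATCH;
  return flops / seconds / 1.0e9;
}
// e.g. GemmGflops(64, 12, 64, 8*8*8*8, t) for one of the shapes swept above.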
			
@@ -867,6 +869,7 @@ int main (int argc, char ** argv)
  int do_memory=1;
  int do_comms =1;
  int do_blas  =1;
  int do_dslash=1;

  int sel=4;
  std::vector<int> L_list({8,12,16,24,32});
@@ -877,6 +880,7 @@ int main (int argc, char ** argv)
  std::vector<double> staggered;

  int Ls=1;
  if (do_dslash){
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -901,6 +905,7 @@ int main (int argc, char ** argv)
    staggered.push_back(result);
  }

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -909,8 +914,33 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  int NN=NN_global;
  if(do_dslash){
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
    fprintf(FP,"Per node summary table\n");
    fprintf(FP,"\n");
    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
    fprintf(FP,"\n");
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
      fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
    }
    fprintf(FP,"\n");
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
    std::cout<<std::setprecision(3);
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
@@ -918,15 +948,6 @@ int main (int argc, char ** argv)
    Benchmark::Memory();
  }

  if ( do_blas ) {
#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::BLAS();
#endif
  }

  if ( do_su4 ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
@@ -941,28 +962,14 @@ int main (int argc, char ** argv)
    Benchmark::Comms();
  }

  if ( do_blas ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
    fprintf(FP,"Per node summary table\n");
    fprintf(FP,"\n");
    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
    fprintf(FP,"\n");
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
      fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
    }
    fprintf(FP,"\n");

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
    std::cout<<std::setprecision(3);
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    Benchmark::BLAS<ComplexD>();
    Benchmark::BLAS<ComplexF>();
  }

  Grid_finalize();
  fclose(FP);
}
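A note on units in the per-node summary written to FP above (a hedged reading, not stated explicitly in the code): the stored dslash results are Mflop/s figures, so the tabulated value is

    GF/s per node = (Mflop/s) / NN / 1000

For example, the 8^4 Frontier run recorded later in this diff reports 154914.0 Deo mflop/s per node with NN = 1 node, i.e. 154914.0 / 1 / 1000 ≈ 155, which matches the "8 , 155, 1386, 50" row of Benchmark_usqcd.csv.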
			
systems/Aurora/benchmarks/bench1.pbs (Normal file, 67 lines added)

@@ -0,0 +1,67 @@
#!/bin/bash

#PBS -q debug
#PBS -l select=1
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../sourceme.sh
module load pti-gpu

#cat $PBS_NODEFILE

export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 2 nodes, 24 ranks
#
CMD="mpiexec -np 12 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
#$CMD | tee 1node.comms

CMD="mpiexec -np 1 -ppn 1  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
#$CMD | tee 1tile.dwf

CMD="mpiexec -np 12 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1node.32.32.32.48.dwf

CMD="mpiexec -np 12 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.64.32.96.dwf

CMD="mpiexec -np 12 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.32.32.48.dwf
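As a cross-check on the launch line actually executed above (the uncommented $CMD), the numbers are mutually consistent: --mpi 2.2.1.3 needs 2*2*1*3 = 12 ranks, matching -np 12 -ppn 12 on the single node requested by "#PBS -l select=1", and --grid 32.32.32.48 then gives each rank a local volume of 16.16.32.16 (32/2, 32/2, 32/1, 48/3).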
			
@@ -1,10 +1,8 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -q workq
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
@@ -13,11 +11,13 @@
cd $PBS_O_WORKDIR

source ../sourceme.sh
module load pti-gpu

export OMP_NUM_THREADS=3
#cat $PBS_NODEFILE

export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 2 nodes, 24 ranks
#
CMD="mpiexec -np 24 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD | tee 2node.comms

#$CMD

CMD="mpiexec -np 24 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.32.32.64.48.dwf

#$CMD

CMD="mpiexec -np 1 -ppn 1  -envall \
CMD="mpiexec -np 24 -ppn 12  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.64.64.64.96.dwf

$CMD

CMD="mpiexec -np 1 -ppn 1  -envall \
	     ./gpu_tile_compact.sh \
	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"

$CMD
@@ -1,33 +1,34 @@
#!/bin/bash

export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
#export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)

export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )

export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}

#export GRID_MPICH_NIC_BIND=$NIC
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id

unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ZE_AFFINITY_MASK=$gpu_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero

#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "

numactl -m $NUMA -N $NUMAP  "$@"
if [ $PALS_RANKID = "0" ]
then
#    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@"
#    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
    numactl -m $NUMA -N $NUMA  "$@"
else
    numactl -m $NUMA -N $NUMA  "$@"
fi
@@ -1,16 +1,18 @@

export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi-auto \
	--enable-debug \
	--disable-gparity \
	--disable-fermion-reps \
	--with-lime=$CLIME \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-accelerator-aware-mpi=no\
	--enable-accelerator-aware-mpi=yes\
	--enable-unified=no \
	MPICXX=mpicxx \
	CXX=icpx \
	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
	CXX=icpx
systems/Aurora/config-command-leak (Normal file, 23 lines added)

@@ -0,0 +1,23 @@
source ~/spack/share/spack/setup-env.sh
spack load c-lime
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH

../../configure \
	--enable-debug \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi-auto \
	--disable-gparity \
	--disable-fermion-reps \
	--with-lime=$CLIME \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-accelerator-aware-mpi=yes\
	--enable-unified=no \
	MPICXX=mpicxx \
	CXX=icpx \
	LDFLAGS="-fiopenmp -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl  -lsycl -Xarch_host -fsanitize=leak -fsycl-device-code-split=per_kernel" \
	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -Xarch_host  -fsycl -fsanitize=leak "
systems/Aurora/config-command-sanitize (Normal file, 22 lines added)

@@ -0,0 +1,22 @@
# -fsycl-targets=spir64_gen -Xs\" -device pvc \"
# -fsycl-targets=intel_gpu_pvc_vg,intel_gpu_pvc
# -fsycl-targets=intel_gpu_pvc

unset DEVICE
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl -Xarch_host -fsanitize=address"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions -Xarch_host -fsanitize=address  -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi-auto \
	--enable-debug \
	--disable-gparity \
	--disable-fermion-reps \
	--with-lime=$CLIME \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-accelerator-aware-mpi=yes\
	--enable-unified=no \
	MPICXX=mpicxx \
	CXX=icpx
@@ -1,12 +1,22 @@
source ~/spack/share/spack/setup-env.sh
spack load c-lime
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
#spack load libefence
#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
#spack load gperftools
export TCMALLOC=/home/paboyle/gperftools/install
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
export INTELGT_AUTO_ATTACH_DISABLE=1

#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
#module load oneapi/release/2023.12.15.001
#module use /soft/modulefiles
#module load intel_compute_runtime/release/agama-devel-682.22

module use /soft/modulefiles
module load intel_compute_runtime/release/agama-devel-682.22

export FI_CXI_DEFAULT_CQ_SIZE=131072
export FI_CXI_CQ_FILL_PERCENT=20

export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
#export FI_CXI_DEFAULT_CQ_SIZE=131072
#export FI_CXI_CQ_FILL_PERCENT=20
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"

#
@@ -14,13 +24,17 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
# -ftarget-register-alloc-mode=pvc:small
# -ftarget-register-alloc-mode=pvc:large
# -ftarget-register-alloc-mode=pvc:auto
#
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1

export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128

#source ~/spack/share/spack/setup-env.sh
#spack load gperftools
#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH

export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
systems/Frontier/benchmarks/Benchmark_usqcd.csv (Normal file, 76 lines added)

@@ -0,0 +1,76 @@
Memory Bandwidth

Bytes, GB/s per node
6291456, 379.297050
100663296, 3754.674992
509607936, 6521.472413
1610612736, 8513.456479
3932160000, 9018.901766


GEMM

 M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 0.564958
16, 16, 16, 256, 243.148058
16, 32, 16, 256, 440.346877
32, 8, 32, 256, 439.194136
32, 16, 32, 256, 847.334141
32, 32, 32, 256, 1430.892623
64, 8, 64, 256, 1242.756741
64, 16, 64, 256, 2196.689493
64, 32, 64, 256, 3697.458072
16, 8, 256, 256, 899.582627
16, 16, 256, 256, 1673.537756
16, 32, 256, 256, 2959.597089
32, 8, 256, 256, 1558.858630
32, 16, 256, 256, 2864.839445
32, 32, 256, 256, 4810.671254
64, 8, 256, 256, 2386.092942
64, 16, 256, 256, 4451.665937
64, 32, 256, 256, 5942.124095
8, 256, 16, 256, 799.867271
16, 256, 16, 256, 1584.624888
32, 256, 16, 256, 1949.422338
8, 256, 32, 256, 1389.417474
16, 256, 32, 256, 2668.344493
32, 256, 32, 256, 3234.162120
8, 256, 64, 256, 2150.925128
16, 256, 64, 256, 4012.488132
32, 256, 64, 256, 5154.785521


Communications

Packet bytes, direction, GB/s per node
4718592, 1, 245.026198
4718592, 2, 251.180996
4718592, 3, 361.110977
4718592, 5, 247.898447
4718592, 6, 249.867523
4718592, 7, 359.033061
15925248, 1, 255.030946
15925248, 2, 264.453890
15925248, 3, 392.949183
15925248, 5, 256.040644
15925248, 6, 264.681896
15925248, 7, 392.102622
37748736, 1, 258.823333
37748736, 2, 268.181577
37748736, 3, 401.478191
37748736, 5, 258.995363
37748736, 6, 268.206586
37748736, 7, 400.397611


Per node summary table

L , Wilson, DWF4, Staggered, GF/s per node

8 , 155, 1386, 50
12 , 694, 4208, 230
16 , 1841, 6675, 609
24 , 3934, 8573, 1641
32 , 5083, 9771, 3086
systems/Frontier/benchmarks/Benchmark_usqcd.log (Normal file, 702 lines added)

@@ -0,0 +1,702 @@
		||||
RANK 1 using GPU 1
 | 
			
		||||
RANK 5 using GPU 6
 | 
			
		||||
RANK 0 using GPU 0
 | 
			
		||||
RANK 2 using GPU 2
 | 
			
		||||
RANK 3 using GPU 3
 | 
			
		||||
RANK 6 using GPU 5
 | 
			
		||||
RANK 7 using GPU 4
 | 
			
		||||
RANK 4 using GPU 7
 | 
			
		||||
world_rank 0 has 1 devices
 | 
			
		||||
AcceleratorHipInit: ========================
 | 
			
		||||
AcceleratorHipInit: Device Number    : 0
 | 
			
		||||
AcceleratorHipInit: ========================
 | 
			
		||||
AcceleratorHipInit: Device identifier: AMD Instinct MI250X
 | 
			
		||||
AcceleratorHipInit:   totalGlobalMem: 68702699520 
 | 
			
		||||
AcceleratorHipInit:   isMultiGpuBoard: 0 
 | 
			
		||||
AcceleratorHipInit:   warpSize: 64 
 | 
			
		||||
AcceleratorHipInit: using default device 
 | 
			
		||||
AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding 
 | 
			
		||||
AcceleratorHipInit: Configure options --enable-setdevice=no 
 | 
			
		||||
local rank 0 device 0 bus id: 0000:c1:00.0
 | 
			
		||||
AcceleratorHipInit: ================================================
 | 
			
		||||
SharedMemoryMpi:  World communicator of size 8
 | 
			
		||||
SharedMemoryMpi:  Node  communicator of size 8
 | 
			
		||||
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 4294967296bytes at 0x7ff651800000 - 7ff7517fffff for comms buffers 
 | 
			
		||||
Setting up IPC
 | 
			
		||||
 | 
			
		||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 | 
			
		||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 | 
			
		||||
__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 | 
			
		||||
__|_                                    _|__
 | 
			
		||||
__|_   GGGG    RRRR    III    DDDD      _|__
 | 
			
		||||
__|_  G        R   R    I     D   D     _|__
 | 
			
		||||
__|_  G        R   R    I     D    D    _|__
 | 
			
		||||
__|_  G  GG    RRRR     I     D    D    _|__
 | 
			
		||||
__|_  G   G    R  R     I     D   D     _|__
 | 
			
		||||
__|_   GGGG    R   R   III    DDDD      _|__
 | 
			
		||||
__|_                                    _|__
 | 
			
		||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 | 
			
		||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 | 
			
		||||
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
Current Grid git commit hash=9a1ad6a5eb29a369d74784e7483c60e578323d76: (HEAD -> develop, origin/develop, origin/HEAD) clean
 | 
			
		||||
 | 
			
		||||
Grid : Message : ================================================ 
 | 
			
		||||
Grid : Message : MPI is initialised and logging filters activated 
 | 
			
		||||
Grid : Message : ================================================ 
 | 
			
		||||
Grid : Message : This rank is running on host frontier01320
 | 
			
		||||
Grid : Message : Requested 4294967296 byte stencil comms buffers 
 | 
			
		||||
Grid : Message : MemoryManager Cache 54962159616 bytes 
 | 
			
		||||
Grid : Message : MemoryManager::Init() setting up
 | 
			
		||||
Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
 | 
			
		||||
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
 | 
			
		||||
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
 | 
			
		||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 | 
			
		||||
Grid : Message : MemoryManager::Init() Using hipMalloc
 | 
			
		||||
Grid : Message : 0.293720 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.293790 s : = Grid is setup to use 1 threads
 | 
			
		||||
Grid : Message : 0.293800 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.293810 s : Grid Default Decomposition patterns
 | 
			
		||||
Grid : Message : 0.293810 s : 	OpenMP threads : 1
 | 
			
		||||
Grid : Message : 0.293820 s : 	MPI tasks      : 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.293870 s : 	vReal          : 512bits ; 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.293890 s : 	vRealF         : 512bits ; 2 2 2 2 
 | 
			
		||||
Grid : Message : 0.293910 s : 	vRealD         : 512bits ; 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.293920 s : 	vComplex       : 512bits ; 1 1 2 2 
 | 
			
		||||
Grid : Message : 0.293930 s : 	vComplexF      : 512bits ; 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.293960 s : 	vComplexD      : 512bits ; 1 1 2 2 
 | 
			
		||||
Grid : Message : 0.293970 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.293980 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.293990 s :  Clover dslash 4D vectorised (temporarily Wilson)
 | 
			
		||||
Grid : Message : 0.294000 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.301330 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.301360 s : Benchmark DWF on 8^4 local volume 
 | 
			
		||||
Grid : Message : 0.301370 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 0.301380 s : * Global volume  : 8 16 16 16 
 | 
			
		||||
Grid : Message : 0.301410 s : * Ls             : 1
 | 
			
		||||
Grid : Message : 0.301420 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 0.301430 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 0.301440 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 0.301450 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.301460 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 0.301470 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.345030 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 0.158302 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.158310 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 0.158311 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 0.158312 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 0.158313 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.240681 s : Deo FlopsPerSite is 1344
 | 
			
		||||
Grid : Message : 0.240711 s : Deo mflop/s =   154914.0 (130.8) 139367.7-159565.9
 | 
			
		||||
Grid : Message : 0.240715 s : Deo mflop/s per rank   19364.3
 | 
			
		||||
Grid : Message : 0.240716 s : Deo mflop/s per node   154914.0
 | 
			
		||||
Grid : Message : 0.240718 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.240719 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 0.240719 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 0.240719 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 0.240719 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.315028 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 0.315033 s : Deo mflop/s =   151459.5 (142.0) 131856.9-157286.4
 | 
			
		||||
Grid : Message : 0.315036 s : Deo mflop/s per rank   18932.4
 | 
			
		||||
Grid : Message : 0.315037 s : Deo mflop/s per node   151459.5
 | 
			
		||||
Grid : Message : 0.315038 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.315040 s : 8^4 x 1 Deo Best  mflop/s        =   154914.0 ; 154914.0 per node 
 | 
			
		||||
Grid : Message : 0.315042 s : 8^4 x 1 Deo Worst mflop/s        =   151459.5 ; 151459.5 per node 
 | 
			
		||||
Grid : Message : 0.315043 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 0.315043 s : 154914.0 ; 151459.5 ; 
 | 
			
		||||
Grid : Message : 0.315044 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.316507 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.316510 s : Benchmark DWF on 12^4 local volume 
 | 
			
		||||
Grid : Message : 0.316511 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 0.316512 s : * Global volume  : 12 24 24 24 
 | 
			
		||||
Grid : Message : 0.316515 s : * Ls             : 1
 | 
			
		||||
Grid : Message : 0.316516 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 0.316517 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 0.316518 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 0.316518 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.316519 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 0.316520 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.327883 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 0.786395 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.786404 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 0.786405 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 0.786406 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 0.786406 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.871646 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 0.871659 s : Deo mflop/s =   684982.2 (632.4) 609162.5-714594.5
 | 
			
		||||
Grid : Message : 0.871663 s : Deo mflop/s per rank   85622.8
 | 
			
		||||
Grid : Message : 0.871664 s : Deo mflop/s per node   684982.2
 | 
			
		||||
Grid : Message : 0.871665 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.871665 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 0.871665 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 0.871665 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 0.871665 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.953697 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 0.953702 s : Deo mflop/s =   693556.6 (576.5) 663552.0-719204.7
 | 
			
		||||
Grid : Message : 0.953705 s : Deo mflop/s per rank   86694.6
 | 
			
		||||
Grid : Message : 0.953706 s : Deo mflop/s per node   693556.6
 | 
			
		||||
Grid : Message : 0.953707 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.953708 s : 12^4 x 1 Deo Best  mflop/s        =   693556.6 ; 693556.6 per node 
 | 
			
		||||
Grid : Message : 0.953710 s : 12^4 x 1 Deo Worst mflop/s        =   684982.2 ; 684982.2 per node 
 | 
			
		||||
Grid : Message : 0.953712 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 0.953712 s : 684982.2 ; 693556.6 ; 
 | 
			
		||||
Grid : Message : 0.953713 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.957609 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.957613 s : Benchmark DWF on 16^4 local volume 
 | 
			
		||||
Grid : Message : 0.957614 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 0.957615 s : * Global volume  : 16 32 32 32 
 | 
			
		||||
Grid : Message : 0.957620 s : * Ls             : 1
 | 
			
		||||
Grid : Message : 0.957621 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 0.957622 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 0.957623 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 0.957623 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 0.957624 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 0.957625 s : ==================================================================================
 | 
			
		||||
Grid : Message : 0.985828 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 2.379761 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.379772 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 2.379773 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 2.379774 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 2.379775 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.486712 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 2.486725 s : Deo mflop/s =   1803226.1 (1139.4) 1646362.3-1864135.1
 | 
			
		||||
Grid : Message : 2.486729 s : Deo mflop/s per rank   225403.3
 | 
			
		||||
Grid : Message : 2.486731 s : Deo mflop/s per node   1803226.1
 | 
			
		||||
Grid : Message : 2.486732 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.486732 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 2.486732 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 2.486732 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 2.486732 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.584407 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 2.584412 s : Deo mflop/s =   1840587.3 (1119.6) 1779401.7-1914791.0
 | 
			
		||||
Grid : Message : 2.584415 s : Deo mflop/s per rank   230073.4
 | 
			
		||||
Grid : Message : 2.584416 s : Deo mflop/s per node   1840587.3
 | 
			
		||||
Grid : Message : 2.584417 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.584418 s : 16^4 x 1 Deo Best  mflop/s        =   1840587.3 ; 1840587.3 per node 
 | 
			
		||||
Grid : Message : 2.584420 s : 16^4 x 1 Deo Worst mflop/s        =   1803226.1 ; 1803226.1 per node 
 | 
			
		||||
Grid : Message : 2.584422 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 2.584422 s : 1803226.1 ; 1840587.3 ; 
 | 
			
		||||
Grid : Message : 2.584423 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.592858 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.592862 s : Benchmark DWF on 24^4 local volume 
 | 
			
		||||
Grid : Message : 2.592863 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 2.592864 s : * Global volume  : 24 48 48 48 
 | 
			
		||||
Grid : Message : 2.592869 s : * Ls             : 1
 | 
			
		||||
Grid : Message : 2.592870 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 2.592871 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 2.592872 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 2.592872 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 2.592873 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 2.592874 s : ==================================================================================
 | 
			
		||||
Grid : Message : 2.715623 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 9.608838 s : ==================================================================================
 | 
			
		||||
Grid : Message : 9.608852 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 9.608853 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 9.608854 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 9.608855 s : ==================================================================================
 | 
			
		||||
Grid : Message : 9.870294 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 9.870309 s : Deo mflop/s =   3861903.3 (1708.9) 3511078.3-3937368.2
 | 
			
		||||
Grid : Message : 9.870313 s : Deo mflop/s per rank   482737.9
 | 
			
		||||
Grid : Message : 9.870314 s : Deo mflop/s per node   3861903.3
 | 
			
		||||
Grid : Message : 9.870315 s : ==================================================================================
 | 
			
		||||
Grid : Message : 9.870316 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 9.870316 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 9.870317 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 9.870317 s : ==================================================================================
 | 
			
		||||
Grid : Message : 10.101619 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 10.101624 s : Deo mflop/s =   3933599.5 (1412.7) 3835758.7-4008152.3
 | 
			
		||||
Grid : Message : 10.101627 s : Deo mflop/s per rank   491699.9
 | 
			
		||||
Grid : Message : 10.101628 s : Deo mflop/s per node   3933599.5
 | 
			
		||||
Grid : Message : 10.101629 s : ==================================================================================
 | 
			
		||||
Grid : Message : 10.101629 s : 24^4 x 1 Deo Best  mflop/s        =   3933599.5 ; 3933599.5 per node 
 | 
			
		||||
Grid : Message : 10.101631 s : 24^4 x 1 Deo Worst mflop/s        =   3861903.3 ; 3861903.3 per node 
 | 
			
		||||
Grid : Message : 10.101633 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 10.101633 s : 3861903.3 ; 3933599.5 ; 
 | 
			
		||||
Grid : Message : 10.101634 s : ==================================================================================
 | 
			
		||||
Grid : Message : 10.139642 s : ==================================================================================
 | 
			
		||||
Grid : Message : 10.139652 s : Benchmark DWF on 32^4 local volume 
 | 
			
		||||
Grid : Message : 10.139653 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 10.139654 s : * Global volume  : 32 64 64 64 
 | 
			
		||||
Grid : Message : 10.139661 s : * Ls             : 1
 | 
			
		||||
Grid : Message : 10.139661 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 10.139662 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 10.139662 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 10.139662 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 10.139663 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 10.139663 s : ==================================================================================
 | 
			
		||||
Grid : Message : 10.502161 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 32.211092 s : ==================================================================================
 | 
			
		||||
Grid : Message : 32.211107 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 32.211108 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 32.211109 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 32.211110 s : ==================================================================================
 | 
			
		||||
Grid : Message : 32.841718 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 32.841732 s : Deo mflop/s =   4988499.9 (2722.5) 4244837.8-5120022.3
 | 
			
		||||
Grid : Message : 32.841736 s : Deo mflop/s per rank   623562.5
 | 
			
		||||
Grid : Message : 32.841737 s : Deo mflop/s per node   4988499.9
 | 
			
		||||
Grid : Message : 32.841738 s : ==================================================================================
 | 
			
		||||
Grid : Message : 32.841739 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 32.841739 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 32.841740 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 32.841740 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.407434 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 33.407442 s : Deo mflop/s =   5082758.0 (1883.1) 4971027.0-5205119.6
 | 
			
		||||
Grid : Message : 33.407446 s : Deo mflop/s per rank   635344.7
 | 
			
		||||
Grid : Message : 33.407447 s : Deo mflop/s per node   5082758.0
 | 
			
		||||
Grid : Message : 33.407448 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.407448 s : 32^4 x 1 Deo Best  mflop/s        =   5082758.0 ; 5082758.0 per node 
 | 
			
		||||
Grid : Message : 33.407450 s : 32^4 x 1 Deo Worst mflop/s        =   4988499.9 ; 4988499.9 per node 
 | 
			
		||||
Grid : Message : 33.407452 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 33.407452 s : 4988499.9 ; 5082758.0 ; 
 | 
			
		||||
Grid : Message : 33.407453 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.506785 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.506798 s :  Domain wall dslash 4D vectorised
 | 
			
		||||
Grid : Message : 33.506799 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.530686 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.530689 s : Benchmark DWF on 8^4 local volume 
 | 
			
		||||
Grid : Message : 33.530690 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 33.530691 s : * Global volume  : 8 16 16 16 
 | 
			
		||||
Grid : Message : 33.530698 s : * Ls             : 12
 | 
			
		||||
Grid : Message : 33.530699 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 33.530700 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 33.530701 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 33.530702 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 33.530703 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 33.530704 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.545465 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 33.752384 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.752397 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 33.752398 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 33.752399 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 33.752400 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.851964 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 33.851977 s : Deo mflop/s =   1383287.7 (849.8) 1321205.8-1420651.4
 | 
			
		||||
Grid : Message : 33.851981 s : Deo mflop/s per rank   172911.0
 | 
			
		||||
Grid : Message : 33.851983 s : Deo mflop/s per node   1383287.7
 | 
			
		||||
Grid : Message : 33.851984 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.851984 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 33.851984 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 33.851984 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 33.851984 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.949235 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 33.949240 s : Deo mflop/s =   1386335.8 (734.6) 1341325.6-1428330.6
 | 
			
		||||
Grid : Message : 33.949243 s : Deo mflop/s per rank   173292.0
 | 
			
		||||
Grid : Message : 33.949244 s : Deo mflop/s per node   1386335.8
 | 
			
		||||
Grid : Message : 33.949245 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.949245 s : 8^4 x 12 Deo Best  mflop/s        =   1386335.8 ; 1386335.8 per node 
 | 
			
		||||
Grid : Message : 33.949247 s : 8^4 x 12 Deo Worst mflop/s        =   1383287.7 ; 1383287.7 per node 
 | 
			
		||||
Grid : Message : 33.949249 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 33.949249 s : 1383287.7 ; 1386335.8 ; 
 | 
			
		||||
Grid : Message : 33.949250 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.952789 s : ==================================================================================
 | 
			
		||||
Grid : Message : 33.952793 s : Benchmark DWF on 12^4 local volume 
 | 
			
		||||
Grid : Message : 33.952794 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 33.952795 s : * Global volume  : 12 24 24 24 
 | 
			
		||||
Grid : Message : 33.952800 s : * Ls             : 12
 | 
			
		||||
Grid : Message : 33.952801 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 33.952802 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 33.952803 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 33.952803 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 33.952804 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 33.952805 s : ==================================================================================
 | 
			
		||||
Grid : Message : 34.362200 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 34.969821 s : ==================================================================================
 | 
			
		||||
Grid : Message : 34.969832 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 34.969833 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 34.969834 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 34.969835 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.135545 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 35.135558 s : Deo mflop/s =   4208495.6 (2165.0) 4053699.5-4315228.5
 | 
			
		||||
Grid : Message : 35.135562 s : Deo mflop/s per rank   526062.0
 | 
			
		||||
Grid : Message : 35.135563 s : Deo mflop/s per node   4208495.6
 | 
			
		||||
Grid : Message : 35.135564 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.135565 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 35.135565 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 35.135565 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 35.135565 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.299710 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 35.299715 s : Deo mflop/s =   4156968.7 (1450.2) 4053699.5-4219939.5
 | 
			
		||||
Grid : Message : 35.299718 s : Deo mflop/s per rank   519621.1
 | 
			
		||||
Grid : Message : 35.299719 s : Deo mflop/s per node   4156968.7
 | 
			
		||||
Grid : Message : 35.299721 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.299721 s : 12^4 x 12 Deo Best  mflop/s        =   4208495.6 ; 4208495.6 per node 
 | 
			
		||||
Grid : Message : 35.299723 s : 12^4 x 12 Deo Worst mflop/s        =   4156968.7 ; 4156968.7 per node 
 | 
			
		||||
Grid : Message : 35.299725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 35.299725 s : 4208495.6 ; 4156968.7 ; 
 | 
			
		||||
Grid : Message : 35.299726 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.309687 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.309693 s : Benchmark DWF on 16^4 local volume 
 | 
			
		||||
Grid : Message : 35.309694 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 35.309695 s : * Global volume  : 16 32 32 32 
 | 
			
		||||
Grid : Message : 35.309701 s : * Ls             : 12
 | 
			
		||||
Grid : Message : 35.309702 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 35.309703 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 35.309704 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 35.309704 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 35.309705 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 35.309706 s : ==================================================================================
 | 
			
		||||
Grid : Message : 35.448780 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 38.468764 s : ==================================================================================
 | 
			
		||||
Grid : Message : 38.468777 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 38.468778 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 38.468779 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 38.468780 s : ==================================================================================
 | 
			
		||||
Grid : Message : 38.801024 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 38.801040 s : Deo mflop/s =   6674673.6 (2168.6) 6484445.4-6797200.1
 | 
			
		||||
Grid : Message : 38.801044 s : Deo mflop/s per rank   834334.2
 | 
			
		||||
Grid : Message : 38.801045 s : Deo mflop/s per node   6674673.6
 | 
			
		||||
Grid : Message : 38.801046 s : ==================================================================================
 | 
			
		||||
Grid : Message : 38.801047 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 38.801048 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 38.801049 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 38.801049 s : ==================================================================================
 | 
			
		||||
Grid : Message : 39.129777 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 39.129783 s : Deo mflop/s =   6560128.4 (2117.4) 6405846.1-6679081.3
 | 
			
		||||
Grid : Message : 39.129786 s : Deo mflop/s per rank   820016.1
 | 
			
		||||
Grid : Message : 39.129787 s : Deo mflop/s per node   6560128.4
 | 
			
		||||
Grid : Message : 39.129788 s : ==================================================================================
 | 
			
		||||
Grid : Message : 39.129788 s : 16^4 x 12 Deo Best  mflop/s        =   6674673.6 ; 6674673.6 per node 
 | 
			
		||||
Grid : Message : 39.129790 s : 16^4 x 12 Deo Worst mflop/s        =   6560128.4 ; 6560128.4 per node 
 | 
			
		||||
Grid : Message : 39.129792 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 39.129793 s : 6674673.6 ; 6560128.4 ; 
 | 
			
		||||
Grid : Message : 39.129795 s : ==================================================================================
 | 
			
		||||
Grid : Message : 39.161251 s : ==================================================================================
 | 
			
		||||
Grid : Message : 39.161265 s : Benchmark DWF on 24^4 local volume 
 | 
			
		||||
Grid : Message : 39.161266 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 39.161267 s : * Global volume  : 24 48 48 48 
 | 
			
		||||
Grid : Message : 39.161274 s : * Ls             : 12
 | 
			
		||||
Grid : Message : 39.161275 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 39.161276 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 39.161277 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 39.161277 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 39.161278 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 39.161279 s : ==================================================================================
 | 
			
		||||
Grid : Message : 39.911996 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 54.971914 s : ==================================================================================
 | 
			
		||||
Grid : Message : 54.971928 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 54.971929 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 54.971930 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 54.971931 s : ==================================================================================
 | 
			
		||||
Grid : Message : 56.309445 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 56.309462 s : Deo mflop/s =   8572660.7 (1374.9) 8483366.4-8644399.6
 | 
			
		||||
Grid : Message : 56.309467 s : Deo mflop/s per rank   1071582.6
 | 
			
		||||
Grid : Message : 56.309468 s : Deo mflop/s per node   8572660.7
 | 
			
		||||
Grid : Message : 56.309469 s : ==================================================================================
 | 
			
		||||
Grid : Message : 56.309471 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 56.309472 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 56.309473 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 56.309474 s : ==================================================================================
 | 
			
		||||
Grid : Message : 57.640707 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 57.640714 s : Deo mflop/s =   8200141.3 (1445.8) 8113545.6-8286307.9
 | 
			
		||||
Grid : Message : 57.640717 s : Deo mflop/s per rank   1025017.7
 | 
			
		||||
Grid : Message : 57.640718 s : Deo mflop/s per node   8200141.3
 | 
			
		||||
Grid : Message : 57.640719 s : ==================================================================================
 | 
			
		||||
Grid : Message : 57.640720 s : 24^4 x 12 Deo Best  mflop/s        =   8572660.7 ; 8572660.7 per node 
 | 
			
		||||
Grid : Message : 57.640723 s : 24^4 x 12 Deo Worst mflop/s        =   8200141.3 ; 8200141.3 per node 
 | 
			
		||||
Grid : Message : 57.640725 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 57.640725 s : 8572660.7 ; 8200141.3 ; 
 | 
			
		||||
Grid : Message : 57.640727 s : ==================================================================================
 | 
			
		||||
Grid : Message : 57.806175 s : ==================================================================================
 | 
			
		||||
Grid : Message : 57.806190 s : Benchmark DWF on 32^4 local volume 
 | 
			
		||||
Grid : Message : 57.806191 s : * Nc             : 3
 | 
			
		||||
Grid : Message : 57.806192 s : * Global volume  : 32 64 64 64 
 | 
			
		||||
Grid : Message : 57.806200 s : * Ls             : 12
 | 
			
		||||
Grid : Message : 57.806200 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 57.806200 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 57.806200 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 57.806200 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 57.806201 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 57.806201 s : ==================================================================================
 | 
			
		||||
Grid : Message : 60.313153 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 107.830286 s : ==================================================================================
 | 
			
		||||
Grid : Message : 107.830306 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 107.830307 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 107.830308 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 107.830309 s : ==================================================================================
 | 
			
		||||
Grid : Message : 111.479603 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 111.479625 s : Deo mflop/s =   9771387.8 (1000.8) 9688589.9-9830800.0
 | 
			
		||||
Grid : Message : 111.479629 s : Deo mflop/s per rank   1221423.5
 | 
			
		||||
Grid : Message : 111.479630 s : Deo mflop/s per node   9771387.8
 | 
			
		||||
Grid : Message : 111.479631 s : ==================================================================================
 | 
			
		||||
Grid : Message : 111.479631 s : * Using UNROLLED WilsonKernels
 | 
			
		||||
Grid : Message : 111.479631 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 111.479631 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 111.479631 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.406559 s : Deo FlopsPerSite is 1344.0
 | 
			
		||||
Grid : Message : 115.406573 s : Deo mflop/s =   8785297.3 (1739.6) 8628282.5-8911307.5
 | 
			
		||||
Grid : Message : 115.406576 s : Deo mflop/s per rank   1098162.2
 | 
			
		||||
Grid : Message : 115.406577 s : Deo mflop/s per node   8785297.3
 | 
			
		||||
Grid : Message : 115.406578 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.406578 s : 32^4 x 12 Deo Best  mflop/s        =   9771387.8 ; 9771387.8 per node 
 | 
			
		||||
Grid : Message : 115.406580 s : 32^4 x 12 Deo Worst mflop/s        =   8785297.3 ; 8785297.3 per node 
 | 
			
		||||
Grid : Message : 115.406581 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 115.406581 s : 9771387.8 ; 8785297.3 ; 
 | 
			
		||||
Grid : Message : 115.406582 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.918888 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.918902 s :  Improved Staggered dslash 4D vectorised
 | 
			
		||||
Grid : Message : 115.918903 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.920344 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.920346 s : Benchmark ImprovedStaggered on 8^4 local volume 
 | 
			
		||||
Grid : Message : 115.920347 s : * Global volume  : 8 16 16 16 
 | 
			
		||||
Grid : Message : 115.920354 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 115.920355 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 115.920356 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 115.920357 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 115.920376 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 115.920377 s : ==================================================================================
 | 
			
		||||
Grid : Message : 115.923522 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 116.904870 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.904950 s : * Using GENERIC Nc StaggeredKernels
 | 
			
		||||
Grid : Message : 116.904960 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 116.904970 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.288979 s : Deo mflop/s =   49708.9 (22.9) 44075.3-50609.3
 | 
			
		||||
Grid : Message : 116.289000 s : Deo mflop/s per rank   6213.6
 | 
			
		||||
Grid : Message : 116.289002 s : Deo mflop/s per node   49708.9
 | 
			
		||||
Grid : Message : 116.289003 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.289004 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 116.289005 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.481632 s : Deo mflop/s =   49737.1 (13.5) 48517.0-50338.0
 | 
			
		||||
Grid : Message : 116.481639 s : Deo mflop/s per rank   6217.1
 | 
			
		||||
Grid : Message : 116.481640 s : Deo mflop/s per node   49737.1
 | 
			
		||||
Grid : Message : 116.481641 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.481642 s : 8^4  Deo Best  mflop/s        =   49737.1 ; 49737.1 per node 
 | 
			
		||||
Grid : Message : 116.481644 s : 8^4  Deo Worst mflop/s        =   49708.9 ; 49708.9 per node 
 | 
			
		||||
Grid : Message : 116.481646 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 116.481646 s : 49708.9 ; 49737.1 ; 
 | 
			
		||||
Grid : Message : 116.481647 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.483458 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.483461 s : Benchmark ImprovedStaggered on 12^4 local volume 
 | 
			
		||||
Grid : Message : 116.483462 s : * Global volume  : 12 24 24 24 
 | 
			
		||||
Grid : Message : 116.483465 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 116.483466 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 116.483466 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 116.483466 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 116.483467 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 116.483468 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.489279 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 116.945016 s : ==================================================================================
 | 
			
		||||
Grid : Message : 116.945025 s : * Using GENERIC Nc StaggeredKernels
 | 
			
		||||
Grid : Message : 116.945026 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 116.945027 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.159821 s : Deo mflop/s =   229778.4 (89.5) 223656.1-233547.5
 | 
			
		||||
Grid : Message : 117.159835 s : Deo mflop/s per rank   28722.3
 | 
			
		||||
Grid : Message : 117.159837 s : Deo mflop/s per node   229778.4
 | 
			
		||||
Grid : Message : 117.159838 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.159838 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 117.159838 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.371102 s : Deo mflop/s =   229516.6 (61.8) 225781.1-233547.5
 | 
			
		||||
Grid : Message : 117.371109 s : Deo mflop/s per rank   28689.6
 | 
			
		||||
Grid : Message : 117.371110 s : Deo mflop/s per node   229516.6
 | 
			
		||||
Grid : Message : 117.371111 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.371111 s : 12^4  Deo Best  mflop/s        =   229778.4 ; 229778.4 per node 
 | 
			
		||||
Grid : Message : 117.371113 s : 12^4  Deo Worst mflop/s        =   229516.6 ; 229516.6 per node 
 | 
			
		||||
Grid : Message : 117.371115 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 117.371115 s : 229778.4 ; 229516.6 ; 
 | 
			
		||||
Grid : Message : 117.371116 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.373669 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.373673 s : Benchmark ImprovedStaggered on 16^4 local volume 
 | 
			
		||||
Grid : Message : 117.373674 s : * Global volume  : 16 32 32 32 
 | 
			
		||||
Grid : Message : 117.373678 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 117.373679 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 117.373679 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 117.373679 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 117.373680 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 117.373681 s : ==================================================================================
 | 
			
		||||
Grid : Message : 117.386495 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 118.755695 s : ==================================================================================
 | 
			
		||||
Grid : Message : 118.755706 s : * Using GENERIC Nc StaggeredKernels
 | 
			
		||||
Grid : Message : 118.755707 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 118.755708 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.178990 s : Deo mflop/s =   608844.0 (126.1) 596065.5-615608.7
 | 
			
		||||
Grid : Message : 119.179160 s : Deo mflop/s per rank   76105.5
 | 
			
		||||
Grid : Message : 119.179180 s : Deo mflop/s per node   608844.0
 | 
			
		||||
Grid : Message : 119.179190 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.179200 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 119.179200 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.271093 s : Deo mflop/s =   605259.7 (188.7) 591372.1-614349.7
 | 
			
		||||
Grid : Message : 119.271101 s : Deo mflop/s per rank   75657.5
 | 
			
		||||
Grid : Message : 119.271103 s : Deo mflop/s per node   605259.7
 | 
			
		||||
Grid : Message : 119.271104 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.271105 s : 16^4  Deo Best  mflop/s        =   608844.0 ; 608844.0 per node 
 | 
			
		||||
Grid : Message : 119.271107 s : 16^4  Deo Worst mflop/s        =   605259.7 ; 605259.7 per node 
 | 
			
		||||
Grid : Message : 119.271109 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 119.271109 s : 608844.0 ; 605259.7 ; 
 | 
			
		||||
Grid : Message : 119.271110 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.275303 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.275308 s : Benchmark ImprovedStaggered on 24^4 local volume 
 | 
			
		||||
Grid : Message : 119.275309 s : * Global volume  : 24 48 48 48 
 | 
			
		||||
Grid : Message : 119.275315 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 119.275316 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 119.275317 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 119.275317 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 119.275318 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 119.275319 s : ==================================================================================
 | 
			
		||||
Grid : Message : 119.328765 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 126.866160 s : ==================================================================================
 | 
			
		||||
Grid : Message : 126.866270 s : * Using GENERIC Nc StaggeredKernels
 | 
			
		||||
Grid : Message : 126.866280 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 126.866290 s : ==================================================================================
 | 
			
		||||
Grid : Message : 126.604376 s : Deo mflop/s =   1641161.6 (335.5) 1619660.5-1663961.9
 | 
			
		||||
Grid : Message : 126.604392 s : Deo mflop/s per rank   205145.2
 | 
			
		||||
Grid : Message : 126.604394 s : Deo mflop/s per node   1641161.6
 | 
			
		||||
Grid : Message : 126.604395 s : ==================================================================================
 | 
			
		||||
Grid : Message : 126.604396 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 126.604396 s : ==================================================================================
 | 
			
		||||
Grid : Message : 127.829420 s : Deo mflop/s =   1620972.4 (344.9) 1602593.4-1644174.3
 | 
			
		||||
Grid : Message : 127.829520 s : Deo mflop/s per rank   202621.6
 | 
			
		||||
Grid : Message : 127.829530 s : Deo mflop/s per node   1620972.4
 | 
			
		||||
Grid : Message : 127.829540 s : ==================================================================================
 | 
			
		||||
Grid : Message : 127.829550 s : 24^4  Deo Best  mflop/s        =   1641161.6 ; 1641161.6 per node 
 | 
			
		||||
Grid : Message : 127.829570 s : 24^4  Deo Worst mflop/s        =   1620972.4 ; 1620972.4 per node 
 | 
			
		||||
Grid : Message : 127.829590 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 127.829590 s : 1641161.6 ; 1620972.4 ; 
 | 
			
		||||
Grid : Message : 127.829600 s : ==================================================================================
 | 
			
		||||
Grid : Message : 127.107891 s : ==================================================================================
 | 
			
		||||
Grid : Message : 127.107903 s : Benchmark ImprovedStaggered on 32^4 local volume 
 | 
			
		||||
Grid : Message : 127.107904 s : * Global volume  : 32 64 64 64 
 | 
			
		||||
Grid : Message : 127.107912 s : * ranks          : 8
 | 
			
		||||
Grid : Message : 127.107913 s : * nodes          : 1
 | 
			
		||||
Grid : Message : 127.107914 s : * ranks/node     : 8
 | 
			
		||||
Grid : Message : 127.107914 s : * ranks geom     : 1 2 2 2 
 | 
			
		||||
Grid : Message : 127.107915 s : * Using 1 threads
 | 
			
		||||
Grid : Message : 127.107916 s : ==================================================================================
 | 
			
		||||
Grid : Message : 127.257116 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 148.527930 s : ==================================================================================
 | 
			
		||||
Grid : Message : 148.527941 s : * Using GENERIC Nc StaggeredKernels
 | 
			
		||||
Grid : Message : 148.527942 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 148.527943 s : ==================================================================================
 | 
			
		||||
Grid : Message : 149.401625 s : Deo mflop/s =   3085543.7 (956.0) 2934476.4-3115147.4
 | 
			
		||||
Grid : Message : 149.401643 s : Deo mflop/s per rank   385693.0
 | 
			
		||||
Grid : Message : 149.401645 s : Deo mflop/s per node   3085543.7
 | 
			
		||||
Grid : Message : 149.401646 s : ==================================================================================
 | 
			
		||||
Grid : Message : 149.401647 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 149.401648 s : ==================================================================================
 | 
			
		||||
Grid : Message : 150.204533 s : Deo mflop/s =   3053468.5 (343.9) 3030688.8-3077255.0
 | 
			
		||||
Grid : Message : 150.204540 s : Deo mflop/s per rank   381683.6
 | 
			
		||||
Grid : Message : 150.204541 s : Deo mflop/s per node   3053468.5
 | 
			
		||||
Grid : Message : 150.204542 s : ==================================================================================
 | 
			
		||||
Grid : Message : 150.204543 s : 32^4  Deo Best  mflop/s        =   3085543.7 ; 3085543.7 per node 
 | 
			
		||||
Grid : Message : 150.204545 s : 32^4  Deo Worst mflop/s        =   3053468.5 ; 3053468.5 per node 
 | 
			
		||||
Grid : Message : 150.204547 s : G/S/C ; G/O/C ; G/S/S ; G/O/S 
 | 
			
		||||
Grid : Message : 150.204547 s : 3085543.7 ; 3053468.5 ; 
 | 
			
		||||
Grid : Message : 150.204548 s : ==================================================================================
 | 
			
		||||
Grid : Message : 150.292848 s : ==================================================================================
Grid : Message : 150.292864 s :  Summary table Ls=12
Grid : Message : 150.292866 s : ==================================================================================
Grid : Message : 150.292866 s : L 		 Clover 		 DWF4 		 Staggered
Grid : Message : 150.292867 s : 8 		 154914.0 		 1386335.8 		 49737.1
Grid : Message : 150.292880 s : 12 		 693556.6 		 4208495.6 		 229778.4
Grid : Message : 150.292882 s : 16 		 1840587.3 		 6674673.6 		 608844.0
Grid : Message : 150.292884 s : 24 		 3933599.5 		 8572660.7 		 1641161.6
Grid : Message : 150.292886 s : 32 		 5082758.0 		 9771387.8 		 3085543.7
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.292888 s :  Memory benchmark 
Grid : Message : 150.292888 s : ==================================================================================
Grid : Message : 150.295495 s : ==================================================================================
Grid : Message : 150.295497 s : = Benchmarking a*x + y bandwidth
Grid : Message : 150.295498 s : ==================================================================================
Grid : Message : 150.295499 s :   L  		bytes			GB/s		Gflop/s		 seconds		GB/s / node
Grid : Message : 150.295500 s : ----------------------------------------------------------
Grid : Message : 160.682233 s : 8		6291456.000   		379.297		31.608		10.367		379.297
Grid : Message : 161.851979 s : 16		100663296.000   		3754.675		312.890		1.047		3754.675
Grid : Message : 162.458098 s : 24		509607936.000   		6521.472		543.456		0.603		6521.472
Grid : Message : 162.924116 s : 32		1610612736.000   		8513.456		709.455		0.462		8513.456
Grid : Message : 163.363877 s : 40		3932160000.000   		9018.902		751.575		0.436		9018.902
Grid : Message : 163.363976 s : ==================================================================================
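A quick consistency check on the a*x + y table above (inferred from the printed numbers, not taken from the benchmark source): every row satisfies GB/s divided by Gflop/s = 12, e.g. 379.297 / 31.608 = 12.0, which is exactly the ratio expected for a double-precision axpy that moves 24 bytes per element (read x, read y, write the result) while performing 2 floating-point operations (one multiply, one add).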
Grid : Message : 163.363978 s :  Batched BLAS benchmark 
Grid : Message : 163.363979 s : ==================================================================================
hipblasCreate
Grid : Message : 163.364046 s : ==================================================================================
Grid : Message : 163.364048 s : = batched GEMM (double precision) 
Grid : Message : 163.364048 s : ==================================================================================
Grid : Message : 163.364048 s :   M  		N			K		Gflop/s / rank (coarse mrhs)
Grid : Message : 163.364049 s : ----------------------------------------------------------
Grid : Message : 163.438476 s : 16		8		16		256		0.565
Grid : Message : 163.438944 s : 16		16		16		256		243.148
Grid : Message : 163.439501 s : 16		32		16		256		440.347
Grid : Message : 163.440003 s : 32		8		32		256		439.194
Grid : Message : 163.440463 s : 32		16		32		256		847.334
Grid : Message : 163.441051 s : 32		32		32		256		1430.893
Grid : Message : 163.441679 s : 64		8		64		256		1242.757
Grid : Message : 163.442354 s : 64		16		64		256		2196.689
Grid : Message : 163.443196 s : 64		32		64		256		3697.458
Grid : Message : 163.443200 s : ----------------------------------------------------------
Grid : Message : 163.443201 s :   M  		N			K		Gflop/s / rank (block project)
Grid : Message : 163.443202 s : ----------------------------------------------------------
Grid : Message : 163.444013 s : 16		8		256		256		899.583
Grid : Message : 163.444933 s : 16		16		256		256		1673.538
Grid : Message : 163.446013 s : 16		32		256		256		2959.597
Grid : Message : 163.446951 s : 32		8		256		256		1558.859
Grid : Message : 163.447970 s : 32		16		256		256		2864.839
Grid : Message : 163.449240 s : 32		32		256		256		4810.671
Grid : Message : 163.450524 s : 64		8		256		256		2386.093
Grid : Message : 163.451877 s : 64		16		256		256		4451.666
Grid : Message : 163.453806 s : 64		32		256		256		5942.124
Grid : Message : 163.453809 s : ----------------------------------------------------------
Grid : Message : 163.453810 s :   M  		N			K		Gflop/s / rank (block promote)
Grid : Message : 163.453811 s : ----------------------------------------------------------
Grid : Message : 163.454716 s : 8		256		16		256		799.867
Grid : Message : 163.455690 s : 16		256		16		256		1584.625
Grid : Message : 163.457209 s : 32		256		16		256		1949.422
Grid : Message : 163.458254 s : 8		256		32		256		1389.417
Grid : Message : 163.459339 s : 16		256		32		256		2668.344
Grid : Message : 163.461158 s : 32		256		32		256		3234.162
Grid : Message : 163.462566 s : 8		256		64		256		2150.925
Grid : Message : 163.464066 s : 16		256		64		256		4012.488
Grid : Message : 163.466272 s : 32		256		64		256		5154.786
Grid : Message : 163.466276 s : ==================================================================================
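The Gflop/s figures in the batched GEMM tables follow the standard flop accounting for a batch of matrix multiplications. The fourth number printed in each row (256) appears to be the batch count, and whether the benchmark counts real (2*M*N*K) or complex (8*M*N*K) operations per matrix is not visible from the log, so the sketch below only illustrates the accounting; it is not the benchmark's own code.

  // Minimal sketch of batched-GEMM throughput accounting (illustration only).
  // Assumptions: the fourth column above is the batch count; real-arithmetic
  // flop counting (use 8.0*M*N*K instead for a complex GEMM); 'seconds' is a
  // hypothetical timing inserted purely for the example.
  #include <cstdint>
  #include <cstdio>
  int main() {
    const int64_t M = 64, N = 32, K = 64, batch = 256; // last row of the "coarse mrhs" table
    const double  seconds = 1.0e-4;                    // hypothetical elapsed time
    const double  flops   = 2.0 * M * N * K * batch;   // 2*M*N*K per real GEMM in the batch
    std::printf("%.1f Gflop/s\n", flops / seconds * 1.0e-9);
    return 0;
  }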
Grid : Message : 163.466277 s : ==================================================================================
Grid : Message : 163.466278 s :  Communications benchmark 
Grid : Message : 163.466279 s : ==================================================================================
Grid : Message : 163.466280 s : ====================================================================================================
Grid : Message : 163.466280 s : = Benchmarking threaded STENCIL halo exchange in 3 dimensions
Grid : Message : 163.466281 s : ====================================================================================================
Grid : Message : 163.466281 s :  L  	 Ls  	bytes	 MB/s uni  		 MB/s bidi 
Grid : Message : 163.521339 s : 16	12	 4718592 	 122513.099		245026.198
Grid : Message : 163.551417 s : 16	12	 4718592 	 125590.498		251180.996
Grid : Message : 163.572339 s : 16	12	 4718592 	 180555.489		361110.977
Grid : Message : 163.602810 s : 16	12	 4718592 	 123949.223		247898.447
Grid : Message : 163.633041 s : 16	12	 4718592 	 124933.761		249867.523
Grid : Message : 163.654084 s : 16	12	 4718592 	 179516.530		359033.061
Grid : Message : 163.756280 s : 24	12	 15925248 	 127515.473		255030.946
Grid : Message : 163.852651 s : 24	12	 15925248 	 132226.945		264453.890
Grid : Message : 163.917510 s : 24	12	 15925248 	 196474.591		392949.183
Grid : Message : 164.170390 s : 24	12	 15925248 	 128020.322		256040.644
Grid : Message : 164.113321 s : 24	12	 15925248 	 132340.948		264681.896
Grid : Message : 164.178314 s : 24	12	 15925248 	 196051.311		392102.622
Grid : Message : 164.413983 s : 32	12	 37748736 	 129411.666		258823.333
Grid : Message : 164.639218 s : 32	12	 37748736 	 134090.789		268181.577
Grid : Message : 164.789675 s : 32	12	 37748736 	 200739.096		401478.191
Grid : Message : 165.228910 s : 32	12	 37748736 	 129497.681		258995.363
Grid : Message : 165.248096 s : 32	12	 37748736 	 134103.293		268206.586
Grid : Message : 165.398958 s : 32	12	 37748736 	 200198.805		400397.611
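For orientation, the message sizes in the bytes column are consistent with one face of single-precision five-dimensional fermion data: a Wilson spinor is 4 x 3 complex singles = 96 bytes per site, and L^3 * Ls * 96 reproduces the table exactly (16^3 * 12 * 96 = 4718592, 24^3 * 12 * 96 = 15925248, 32^3 * 12 * 96 = 37748736). The bidirectional figure is simply twice the unidirectional one, e.g. 2 * 122513.099 = 245026.198. This is a consistency check inferred from the printed numbers, not a statement taken from the benchmark source.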
Grid : Message : 165.399411 s : ==================================================================================
Grid : Message : 165.399413 s :  Per Node Summary table Ls=12
Grid : Message : 165.399414 s : ==================================================================================
Grid : Message : 165.399414 s :  L 		 Clover		 DWF4		 Staggered (GF/s per node)
Grid : Message : 165.399417 s : 8 		 154914.003 	 1386335.817 	 49737.127
Grid : Message : 165.399423 s : 12 		 693556.579 	 4208495.611 	 229778.435
Grid : Message : 165.399426 s : 16 		 1840587.280 	 6674673.647 	 608844.000
Grid : Message : 165.399429 s : 24 		 3933599.545 	 8572660.656 	 1641161.613
Grid : Message : 165.399432 s : 32 		 5082757.996 	 9771387.820 	 3085543.742
Grid : Message : 165.399435 s : ==================================================================================
Grid : Message : 165.399435 s : ==================================================================================
Grid : Message : 165.399435 s :  Comparison point     result: 9172024.238 Mflop/s per node
Grid : Message : 165.399436 s :  Comparison point is 0.5*(9771387.820+8572660.656) 
Grid : Message : 165.399438 s : ==================================================================================
Grid : Message : 165.399438 s : *******************************************
Grid : Message : 165.399438 s : ******* Grid Finalize                ******
Grid : Message : 165.399438 s : *******************************************
systems/Frontier/benchmarks/benchusqcd.slurm (new executable file, 38 lines)
@@ -0,0 +1,38 @@
#!/bin/bash -l
#SBATCH --job-name=bench
##SBATCH --partition=small-g
##SBATCH -q debug
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:30:00
#SBATCH --account=phy157_dwf
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0

cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU    
exec numactl -m \$NUMA -N \$NUMA \$*
EOF

chmod +x ./select_gpu

root=$HOME/Frontier/Grid/systems/Frontier/
source ${root}/sourceme.sh

export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

srun ./select_gpu ./Benchmark_usqcd --grid 32.32.32.32 --mpi 1.2.2.2 --accelerator-threads 8 --comms-overlap --shm 4096 --shm-mpi 0 --grid $vol  > Benchmark_usqcd.log

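Two remarks on the script above (observations about the file as committed, not part of the commit itself): it is submitted in the usual Slurm way, e.g. sbatch benchusqcd.slurm, once the root= path points at the local checkout of systems/Frontier; and the trailing --grid $vol on the srun line expands to nothing because vol is never set in the script, so the earlier --grid 32.32.32.32 is the geometry that actually takes effect.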
@@ -3,7 +3,7 @@ spack load c-lime
module load emacs 
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load cray-mpich
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
@@ -2,11 +2,11 @@
    --enable-comms=mpi \
    --enable-simd=GPU \
    --enable-shm=nvlink \
    --enable-gen-simd-width=64 \
    --enable-accelerator=cuda \
    --enable-gen-simd-width=64 \
    --disable-gparity \
    --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
    LDFLAGS="-cudart shared -lcublas " \
    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared --diag-suppress 177,550,611"
@@ -1,6 +1,7 @@
module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1  
#module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
export PREFIX=/home/tc002/tc002/shared/env/prefix/
export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
module load cuda/12.3 
module load ucx/1.15.0-cuda12.3  
module load openmpi/4.1.5-cuda12.3
source /home/dp207/dp207/shared/env/production/env-base.sh 
source /home/dp207/dp207/shared/env/production/env-gpu.sh 
unset SBATCH_EXPORT

@@ -142,7 +142,9 @@ int main (int argc, char ** argv)
  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
  int iters;

  time_t now;
  time_t start = time(NULL);
  UGrid->Broadcast(0,(void *)&start,sizeof(start));

  FlightRecorder::ContinueOnFail = 0;
  FlightRecorder::PrintEntireLog = 0;
@@ -162,9 +164,9 @@ int main (int argc, char ** argv)
    }
    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
    result_o = Zero();
    t1=usecond();
    t1=usecond(); 
    mCG(src_o,result_o);
    t2=usecond();
    t2=usecond(); 
    iters = mCG.TotalInnerIterations; //Number of inner CG iterations
    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
@@ -176,7 +178,8 @@ int main (int argc, char ** argv)

    std::cout << " FlightRecorder is OK! "<<std::endl;
    iter ++;
  } while (time(NULL) < (start + nsecs/10) );
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
  } while (now < (start + nsecs/10) );

  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
@@ -189,7 +192,7 @@ int main (int argc, char ** argv)
    }
    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
    result_o_2 = Zero();
    t1=usecond();
    t1=usecond(); 
    CG(HermOpEO,src_o,result_o_2);
    t2=usecond();
    iters = CG.IterationsToComplete;
@@ -201,8 +204,9 @@ int main (int argc, char ** argv)
    std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
    assert(FlightRecorder::ErrorCount()==0);
    std::cout << " FlightRecorder is OK! "<<std::endl;
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
    i++;
  } while (time(NULL) < (start + nsecs) );
  } while (now < (start + nsecs) );

  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
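The hunks above appear to change how the timed do/while loops decide when to stop: rather than each rank calling time(NULL) independently in the loop condition, the current time is broadcast from rank 0 via UGrid->Broadcast and every rank tests the same value of now, so all ranks run the same number of solves and leave the loop together. A minimal sketch of the same pattern in plain MPI (illustration only; the test itself uses Grid's Broadcast wrapper, and nsecs here is a hypothetical time budget):

  // Rank-0 wall-clock broadcast so that every rank agrees on loop termination.
  #include <mpi.h>
  #include <ctime>
  int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    const long nsecs = 60;                          // hypothetical time budget in seconds
    std::time_t start = std::time(nullptr);
    MPI_Bcast(&start, (int)sizeof(start), MPI_BYTE, 0, MPI_COMM_WORLD);
    std::time_t now;
    do {
      // ... one solve would go here ...
      now = std::time(nullptr);
      MPI_Bcast(&now, (int)sizeof(now), MPI_BYTE, 0, MPI_COMM_WORLD); // all ranks use rank 0's clock
    } while (now < start + nsecs);
    MPI_Finalize();
    return 0;
  }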
							
								
								
									
tests/debug/Test_8888.cc (new file, 118 lines)
@@ -0,0 +1,118 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_general_coarse_hdcg.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2023
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pboyle@bnl.gov>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  const int Ls=8;
 | 
			
		||||
  const int nbasis = 40;
 | 
			
		||||
  const int cb = 0 ;
 | 
			
		||||
  RealD mass=0.01;
 | 
			
		||||
  RealD M5=1.8;
 | 
			
		||||
  RealD b=1.0;
 | 
			
		||||
  RealD c=0.0;
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 | 
			
		||||
								   GridDefaultSimd(Nd,vComplex::Nsimd()),
 | 
			
		||||
								   GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////// RNGs /////////////////////////////////
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
  std::vector<int> cseeds({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////// Configuration /////////////////////////////////
 | 
			
		||||
  LatticeGaugeField Umu(UGrid);
 | 
			
		||||
 | 
			
		||||
  FieldMetaData header;
 | 
			
		||||
  std::string file("ckpoint_EODWF_lat.125");
 | 
			
		||||
  NerscIO::readConfiguration(Umu,header,file);
 | 
			
		||||
 | 
			
		||||
  //////////////////////// Fermion action //////////////////////////////////
 | 
			
		||||
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
 | 
			
		||||
 | 
			
		||||
  MdagMLinearOperator<MobiusFermionD, LatticeFermion> HermOp(Ddwf);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  std::cout << "**************************************"<<std::endl;
 | 
			
		||||
  std::cout << "         Fine Power method            "<<std::endl;
 | 
			
		||||
  std::cout << "**************************************"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionD pm_src(FGrid);
 | 
			
		||||
  pm_src = ComplexD(1.0);
 | 
			
		||||
  PowerMethod<LatticeFermionD>       fPM;
 | 
			
		||||
  fPM(HermOp,pm_src);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  std::cout << "**************************************"<<std::endl;
 | 
			
		||||
  std::cout << "         Fine Lanczos  (poly, low)    "<<std::endl;
 | 
			
		||||
  std::cout << "**************************************"<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
  int Nk=80;
 | 
			
		||||
  int Nm=Nk*3;
 | 
			
		||||
  int Nstop=8;
 | 
			
		||||
  int Nconv_test_interval=1;
 | 
			
		||||
  
 | 
			
		||||
  //  Chebyshev<LatticeFermionD>      IRLChebyLo(0.2,64.0,201);  // 1 iter
 | 
			
		||||
  Chebyshev<LatticeFermionD>      IRLChebyLo(0.0,55.0,101);  // 1 iter
 | 
			
		||||
  FunctionHermOp<LatticeFermionD>    PolyOp(IRLChebyLo,HermOp);
 | 
			
		||||
  PlainHermOp<LatticeFermionD>          Op(HermOp);
 | 
			
		||||
 | 
			
		||||
  ImplicitlyRestartedLanczos IRL(PolyOp,
 | 
			
		||||
				 Op,
 | 
			
		||||
				 Nk, // sought vecs
 | 
			
		||||
				 Nk, // sought vecs
 | 
			
		||||
				 Nm, // spare vecs
 | 
			
		||||
				 1.0e-8,
 | 
			
		||||
				 10 // Max iterations
 | 
			
		||||
				 );
 | 
			
		||||
 | 
			
		||||
  int Nconv;
 | 
			
		||||
  std::vector<RealD>            eval(Nm);
 | 
			
		||||
  std::vector<LatticeFermionD>     evec(Nm,FGrid);
 | 
			
		||||
  LatticeFermionD     irl_src(FGrid);
 | 
			
		||||
 | 
			
		||||
  IRL.calc(eval,evec,irl_src,Nconv);
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
@@ -392,9 +392,27 @@ void  TestCGschur(What & Ddwf,
 | 
			
		||||
		   GridParallelRNG *RNG5)
 | 
			
		||||
{
 | 
			
		||||
  LatticeFermion src   (FGrid); random(*RNG5,src);
 | 
			
		||||
  LatticeFermion result(FGrid); result=Zero();
 | 
			
		||||
  LatticeFermion result1(FGrid); result1=Zero();
 | 
			
		||||
  LatticeFermion result2(FGrid); result2=Zero();
 | 
			
		||||
  LatticeFermion result3(FGrid); result3=Zero();
 | 
			
		||||
 | 
			
		||||
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
 | 
			
		||||
  SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
 | 
			
		||||
  SchurSolver(Ddwf,src,result);
 | 
			
		||||
  SchurSolver(Ddwf,src,result1);
 | 
			
		||||
 | 
			
		||||
  SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolverSymm1(CG);
 | 
			
		||||
  SchurSolverSymm1(Ddwf,src,result2);
 | 
			
		||||
 | 
			
		||||
  SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolverSymm2(CG);
 | 
			
		||||
  SchurSolverSymm2(Ddwf,src,result3);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Standard " <<norm2(result1)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Symm1    " <<norm2(result2)<<std::endl; 
 | 
			
		||||
  result2=result2-result1;
 | 
			
		||||
  std::cout << GridLogMessage << " diff " <<norm2(result2) <<std::endl; 
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " Symm2    " <<norm2(result3)<<std::endl; 
 | 
			
		||||
  result3=result3-result1;
 | 
			
		||||
  std::cout << GridLogMessage << " diff " <<norm2(result3) <<std::endl; 
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -244,7 +244,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
#if 0  
 | 
			
		||||
  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
 | 
			
		||||
  typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
 | 
			
		||||
  
 | 
			
		||||
@@ -307,7 +307,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
    rh_res= Zero();
 | 
			
		||||
    mrhsCG(MrhsCoarseOp,rh_src,rh_res);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
  std::cout<<GridLogMessage<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
@@ -26,84 +26,13 @@ Author: Peter Boyle <pboyle@bnl.gov>
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#include <Grid/lattice/PaddedCell.h>
 | 
			
		||||
#include <Grid/stencil/GeneralLocalStencil.h>
 | 
			
		||||
//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/AdefGeneric.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
template<class Coarsened>
 | 
			
		||||
void SaveOperator(Coarsened &Operator,std::string file)
 | 
			
		||||
{
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  emptyUserRecord record;
 | 
			
		||||
  ScidacWriter WR(Operator.Grid()->IsBoss());
 | 
			
		||||
  assert(Operator._A.size()==Operator.geom.npoint);
 | 
			
		||||
  WR.open(file);
 | 
			
		||||
  for(int p=0;p<Operator._A.size();p++){
 | 
			
		||||
    auto tmp = Operator.Cell.Extract(Operator._A[p]);
 | 
			
		||||
    WR.writeScidacFieldRecord(tmp,record);
 | 
			
		||||
  }
 | 
			
		||||
  WR.close();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
template<class Coarsened>
 | 
			
		||||
void LoadOperator(Coarsened &Operator,std::string file)
 | 
			
		||||
{
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  emptyUserRecord record;
 | 
			
		||||
  Grid::ScidacReader RD ;
 | 
			
		||||
  RD.open(file);
 | 
			
		||||
  assert(Operator._A.size()==Operator.geom.npoint);
 | 
			
		||||
  for(int p=0;p<Operator.geom.npoint;p++){
 | 
			
		||||
    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
 | 
			
		||||
    RD.readScidacFieldRecord(Operator._A[p],record);
 | 
			
		||||
  }    
 | 
			
		||||
  RD.close();
 | 
			
		||||
  Operator.ExchangeCoarseLinks();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
template<class aggregation>
 | 
			
		||||
void SaveBasis(aggregation &Agg,std::string file)
 | 
			
		||||
{
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  emptyUserRecord record;
 | 
			
		||||
  ScidacWriter WR(Agg.FineGrid->IsBoss());
 | 
			
		||||
  WR.open(file);
 | 
			
		||||
  for(int b=0;b<Agg.subspace.size();b++){
 | 
			
		||||
    WR.writeScidacFieldRecord(Agg.subspace[b],record);
 | 
			
		||||
  }
 | 
			
		||||
  WR.close();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
template<class aggregation>
 | 
			
		||||
void LoadBasis(aggregation &Agg, std::string file)
 | 
			
		||||
{
 | 
			
		||||
#ifdef HAVE_LIME
 | 
			
		||||
  emptyUserRecord record;
 | 
			
		||||
  ScidacReader RD ;
 | 
			
		||||
  RD.open(file);
 | 
			
		||||
  for(int b=0;b<Agg.subspace.size();b++){
 | 
			
		||||
    RD.readScidacFieldRecord(Agg.subspace[b],record);
 | 
			
		||||
  }    
 | 
			
		||||
  RD.close();
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Field> class TestSolver : public LinearFunction<Field> {
 | 
			
		||||
public:
 | 
			
		||||
  TestSolver() {};
 | 
			
		||||
  void operator() (const Field &in, Field &out){    out = Zero();  }     
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
RealD InverseApproximation(RealD x){
 | 
			
		||||
  return 1.0/x;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Want Op in CoarsenOp to call MatPcDagMatPc
 | 
			
		||||
template<class Field>
 | 
			
		||||
class HermOpAdaptor : public LinearOperatorBase<Field>
 | 
			
		||||
@@ -119,33 +48,37 @@ public:
 | 
			
		||||
  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
 | 
			
		||||
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
 | 
			
		||||
};
 | 
			
		||||
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 | 
			
		||||
 | 
			
		||||
template<class Field> class CGSmoother : public LinearFunction<Field>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  using LinearFunction<Field>::operator();
 | 
			
		||||
  typedef LinearOperatorBase<Field> FineOperator;
 | 
			
		||||
  FineOperator   & _SmootherOperator;
 | 
			
		||||
  Chebyshev<Field> Cheby;
 | 
			
		||||
  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
 | 
			
		||||
  int iters;
 | 
			
		||||
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
 | 
			
		||||
    _SmootherOperator(SmootherOperator),
 | 
			
		||||
    Cheby(_lo,_hi,_ord,InverseApproximation)
 | 
			
		||||
    iters(_iters)
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
 | 
			
		||||
  };
 | 
			
		||||
  void operator() (const Field &in, Field &out) 
 | 
			
		||||
  {
 | 
			
		||||
    Field tmp(in.Grid());
 | 
			
		||||
    tmp = in;
 | 
			
		||||
    Cheby(_SmootherOperator,tmp,out);
 | 
			
		||||
    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother
 | 
			
		||||
 | 
			
		||||
    out=Zero();
 | 
			
		||||
 | 
			
		||||
    CG(_SmootherOperator,in,out);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  const int Ls=24;
 | 
			
		||||
  const int nbasis = 40;
 | 
			
		||||
  const int nbasis = 60;
 | 
			
		||||
  const int cb = 0 ;
 | 
			
		||||
  RealD mass=0.00078;
 | 
			
		||||
  RealD M5=1.8;
 | 
			
		||||
@@ -160,10 +93,12 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  // Construct a coarsened grid with 4^4 cell
 | 
			
		||||
  Coordinate Block({4,4,4,4});
 | 
			
		||||
  Coordinate clatt = GridDefaultLatt();
 | 
			
		||||
  for(int d=0;d<clatt.size();d++){
 | 
			
		||||
    clatt[d] = clatt[d]/4;
 | 
			
		||||
    clatt[d] = clatt[d]/Block[d];
 | 
			
		||||
  }
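  // NOTE: the coarse volume is now derived from Block rather than a hard-coded /4,
  // so each Block^4 patch of fine sites becomes one coarse site; with {4,4,4,4} a
  // (purely illustrative) 48^3 x 96 fine lattice coarsens to 12^3 x 24, and the
  // fifth dimension is collapsed to 1 when the coarse 5d grid is built from Coarse4d.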

  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
                                                            GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                            GridDefaultMpi());;
@@ -182,7 +117,7 @@ int main (int argc, char ** argv)
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  std::string file("ckpoint_EODWF_lat.125");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
@@ -192,15 +127,7 @@ int main (int argc, char ** argv)

  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);

  LatticeFermion result(FrbGrid); result=Zero();

  LatticeFermion    src(FrbGrid); random(RNG5,src);

  // Run power method on FineHermOp
  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);

  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
@@ -208,219 +135,170 @@ int main (int argc, char ** argv)
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
  NearestStencilGeometry5D geom_nn(Coarse5d);

  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp

  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);

  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////
  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);

  bool load=false;
  if ( load ) {
    LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
    LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
  } else {
    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
                                       95.0,0.1,
                                       //     400,200,200 -- 48 iters
                                       //     600,200,200 -- 38 iters, 162s
                                       //     600,200,100 -- 38 iters, 169s
                                       //     600,200,50  -- 88 iters. 370s 
                                       800,
                                       200,
                                       100,
                                       0.0);
    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
    SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
    SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
  }

  // Try projecting to one hop only
  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
  int refine=1;
    //    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
    //    					0.0003,1.0e-5,2000); // Lo, tol, maxit
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
  std::cout << "**************************************"<<std::endl;
  std::cout << "Create Subspace"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); 

  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
  HermMatrix CoarseOp     (LittleDiracOp);
  HermMatrix CoarseOpProj (LittleDiracOpProj);
  std::cout << "**************************************"<<std::endl;
  std::cout << "Refine Subspace"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters

  //////////////////////////////////////////
  // Build a coarse lanczos
  //////////////////////////////////////////
  Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter
  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
  int Nk=48;
  int Nm=64;
  std::cout << "**************************************"<<std::endl;
  std::cout << "Coarsen after refine"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  Aggregates.Orthogonalise();

  std::cout << "**************************************"<<std::endl;
  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);

  const int nrhs=12;

  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});

  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
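  // NOTE: CoarseMrhs packs the nrhs coarse right-hand sides into one lattice by
  // prepending the rhs index as an extra dimension (rhLatt = {nrhs,1,clatt...}),
  // leaving it un-decomposed over MPI (rhMpi = {1,1,mpi...}) and giving all SIMD
  // lanes to the rhs direction (rhSimd = {Nsimd,1,1,1,1,1}); one application of
  // the multi-RHS coarse operator below therefore advances all nrhs systems at once.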
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);

  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << "         Coarse Lanczos               "<<std::endl;
  std::cout << "**************************************"<<std::endl;

  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  Chebyshev<CoarseVector>      IRLCheby(0.01,42.0,301);  // 1 iter
  MrhsHermMatrix MrhsCoarseOp     (mrhs);

  CoarseVector pm_src(CoarseMrhs);
  pm_src = ComplexD(1.0);
  PowerMethod<CoarseVector>       cPM;
  cPM(MrhsCoarseOp,pm_src);

  int Nk=192;
  int Nm=384;
  int Nstop=Nk;
  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20);
  int Nconv_test_interval=1;

  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
                                                          Coarse5d,
                                                          CoarseMrhs,
                                                          nrhs,
                                                          IRLCheby,
                                                          Nstop,
                                                          Nconv_test_interval,
                                                          nrhs,
                                                          Nk,
                                                          Nm,
                                                          1e-5,10);
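  // NOTE: the single-vector ImplicitlyRestartedLanczos (Nk=48, Nm=64, Chebyshev
  // 0.2-40.0 of order 71) is replaced by the block variant driven through the
  // multi-RHS operator: the Chebyshev filter (0.01,42.0,301) suppresses the bulk
  // of the coarse spectrum so the low-lying modes dominate and converge first,
  // with Nstop=Nk=192 vectors kept out of an Nm=384 restart window and nrhs=12
  // vectors advanced per block step.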

  int Nconv;
  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
  CoarseVector c_src(Coarse5d);
  //c_src=1.0;
  random(CRNG,c_src);

  CoarseVector c_res(Coarse5d); 
  CoarseVector c_ref(Coarse5d); 

  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);

  IRL.calc(eval,evec,c_src,Nconv);
  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
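  // NOTE: DeflatedGuesser seeds the coarse solve with the deflated guess
  //   x0 = sum_i v_i (v_i,b)/lambda_i
  // over the Lanczos pairs (evec,eval), i.e. the exact solution within the
  // deflation subspace, so the subsequent CG only works on the remaining
  // high-mode component of the right-hand side.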
  std::vector<CoarseVector>     c_src(nrhs,Coarse5d);

  //////////////////////////////////////////
  // Build a coarse space solver
  // Block projector for coarse/fine
  //////////////////////////////////////////
  int maxit=20000;
  ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false);
  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false);
  ZeroGuesser<CoarseVector> CoarseZeroGuesser;

  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolve(c_src,c_res); c_ref = c_res;
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
  //////////////////////////////////////////////////////////////////////////
  // Deflated (with real op EV's) solve for the projected coarse op
  // Work towards ADEF1 in the coarse space
  //////////////////////////////////////////////////////////////////////////
  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolveProj(c_src,c_res);
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);

  //////////////////////////////////////////////////////////////////////
  // Coarse ADEF1 with deflation space
  //////////////////////////////////////////////////////////////////////
  ChebyshevSmoother<CoarseVector,HermMatrix >
    CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311

  ////////////////////////////////////////////////////////
  // CG, Cheby mode spacing 200,200
  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
  ////////////////////////////////////////////////////////
  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
  ////////////////////////////////////////////////////////
  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
  // HDCG 38 iters 162s
  //
  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
  // HDCG 38 iters 169s

  TwoLevelADEF1defl<CoarseVector>
    cADEF1(1.0e-8, 500,
           CoarseOp,
           CoarseSmoother,
           evec,eval);
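  // NOTE: TwoLevelADEF1defl pairs the Chebyshev CoarseSmoother acting on the
  // one-hop projected operator with the (evec,eval) deflation space of the full
  // coarse operator; each preconditioner application is roughly a smoother sweep
  // plus a deflated coarse correction, which is where the ~2x gain over plain
  // coarse CG quoted in the comments above comes from.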

  c_res=Zero();
  cADEF1(c_src,c_res);
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;

  //  cADEF1.Tolerance = 4.0e-2;
  //  cADEF1.Tolerance = 1.0e-1;
  cADEF1.Tolerance = 5.0e-2;
  c_res=Zero();
  cADEF1(c_src,c_res);
  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;

  //////////////////////////////////////////
  // Build a smoother
  //////////////////////////////////////////
  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383
  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328
  //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
  //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters

  //
  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
  // Need to measure cost of coarse space.
  //
  // -- i) Reduce coarse residual   -- 0.04
  // -- ii) Lanczos on coarse space -- done
  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
  //
  std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters

  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)  

  for(int l=0;l<los.size();l++){

    RealD lo = los[l];

    for(int o=0;o<ords.size();o++){

      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);

      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
      ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311

      //////////////////////////////////////////
      // Build a HDCG solver
      //////////////////////////////////////////
      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
        HDCG(1.0e-8, 100,
             FineHermOp,
             Smoother,
             HPDSolveSloppy,
             HPDSolve,
             Aggregates);

      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
        HDCGdefl(1.0e-8, 100,
                 FineHermOp,
                 Smoother,
                 cADEF1,
                 HPDSolve,
                 Aggregates);

      result=Zero();
      HDCGdefl(src,result);

      result=Zero();
      HDCG(src,result);

    }
  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }

  // Standard CG
  result=Zero();
  CGfine(HermOpEO, src, result);
  IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);

  ///////////////////////
  // Deflation guesser object
  ///////////////////////
  std::cout << "**************************************"<<std::endl;
  std::cout << " Reimport coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MultiRHSDeflation<CoarseVector> MrhsGuesser;
  MrhsGuesser.ImportEigenBasis(evec,eval);

  //////////////////////////
  // Extra HDCG parameters
  //////////////////////////
  int maxit=3000;
  ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false);
  RealD lo=2.0;
  int ord = 7;

  DoNothingGuesser<CoarseVector> DoNothing;
  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);

  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  RealD MirsShift = lo;
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
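  // NOTE: the smoother is built on the shifted operator MatPcDagMatPc + MirsShift
  // rather than on HermOpEO itself; the shift (lo=2.0) compresses the spectrum seen
  // by the ord=7 fixed CG iterations so they act as a cheap high-mode smoother,
  // while the low modes the shift distorts are handled by the coarse correction.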

  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
    HDCGmrhs(1.0e-8, 500,
             FineHermOp,
             CGsmooth,
             HPDSolveMrhs,    // Used in M1
             HPDSolveMrhs,          // Used in Vstart
             MrhsProjector,
             MrhsGuesser,
             CoarseMrhs);

  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);

  for(int r=0;r<nrhs;r++){
    random(RNG5,src_mrhs[r]);
    res_mrhs[r]=Zero();
  }

  HDCGmrhs(src_mrhs,res_mrhs);

  // Standard CG
#if 1
  {
  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling red black CG"<<std::endl;
  std::cout << "**************************************"<<std::endl;

    LatticeFermion result(FrbGrid); result=Zero();
    LatticeFermion    src(FrbGrid); random(RNG5,src);
    result=Zero();

    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
#endif  
  Grid_finalize();
  return 0;
}

@@ -145,7 +145,7 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  const int Ls=24;
  const int nbasis = 60;
  const int nbasis = 62;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
@@ -160,7 +160,7 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Construct a coarsened grid with 4^4 cell
  Coordinate Block({4,4,4,4});
  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];

 tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc | 396 (new file)
@@ -0,0 +1,396 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./tests/Test_general_coarse_hdcg.cc

    Copyright (C) 2023

Author: Peter Boyle <pboyle@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>

using namespace std;
using namespace Grid;

template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(Agg.FineGrid->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    WR.writeScidacFieldRecord(Agg.subspace[b],record);
  }
  WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
    RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
    //    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }    
  RD.close();
#endif
}
template<class CoarseVector>
void SaveEigenvectors(std::vector<RealD>            &eval,
		      std::vector<CoarseVector>     &evec,
		      std::string evec_file,
		      std::string eval_file)
{
#ifdef HAVE_LIME
  emptyUserRecord record;
  ScidacWriter WR(evec[0].Grid()->IsBoss());
  WR.open(evec_file);
  for(int b=0;b<evec.size();b++){
    WR.writeScidacFieldRecord(evec[b],record,0,0);
  }
  WR.close();
  XmlWriter WRx(eval_file);
  write(WRx,"evals",eval);
#endif
}
template<class CoarseVector>
void LoadEigenvectors(std::vector<RealD>            &eval,
		      std::vector<CoarseVector>     &evec,
		      std::string evec_file,
		      std::string eval_file)
{
#ifdef HAVE_LIME
    XmlReader RDx(eval_file);
    read(RDx,"evals",eval);
    emptyUserRecord record;

    Grid::ScidacReader RD ;
    RD.open(evec_file);
    assert(evec.size()==eval.size());
    for(int k=0;k<eval.size();k++) {
      RD.readScidacFieldRecord(evec[k],record);
    }
    RD.close();
#endif
}

// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
  LinearOperatorBase<Field> & wrapped;
public:
  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
  void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
  void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
};

template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator   & _SmootherOperator;
  int iters;
  CGSmoother(int _iters, FineOperator &SmootherOperator) :
    _SmootherOperator(SmootherOperator),
    iters(_iters)
  {
    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
  };
  void operator() (const Field &in, Field &out) 
  {
    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother

    out=Zero();

    CG(_SmootherOperator,in,out);
  }
};


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=24;
  const int nbasis = 60;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
								   GridDefaultSimd(Nd,vComplex::Nsimd()),
								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Construct a coarsened grid with 4^4 cell
  //  Coordinate Block({4,4,6,4});
  Coordinate Block({4,4,4,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/Block[d];
  }

  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
							    GridDefaultSimd(Nd,vComplex::Nsimd()),
							    GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);

  ///////////////////////// RNGs /////////////////////////////////
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});

  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);

  ///////////////////////// Configuration /////////////////////////////////
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
  std::string file("/lustre/orion/phy157/proj-shared/phy157_dwf/lehner/ensemble-Ha/ckpoint_lat.2250");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);

  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);

  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);

  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);

  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);

  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////

  //  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys96.mixed.2500.60");
  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60");
  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys96.mixed.2500.60_v2");
  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys96.mixed.60");
  std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
  std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
  bool load_agg=true;
  bool load_refine=true;
  bool load_mat=false;
  bool load_evec=false;

  int refine=1;
  if ( load_agg ) {
    if ( !(refine) || (!load_refine) ) { 
      LoadBasis(Aggregates,subspace_file);
    }
  } else {
    Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); 
    SaveBasis(Aggregates,subspace_file);
  }

  if ( load_refine ) {
    std::cout << " Load Refine "<< refine_file <<std::endl;
    LoadBasis(Aggregates,refine_file);
  } else {
    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,3000); // 172 iters
    //    Aggregates.RefineSubspace(HermOpEO,0.001,3.0e-4,2500); // 172 iters
    SaveBasis(Aggregates,refine_file);
  }
  Aggregates.Orthogonalise();

  std::cout << "**************************************"<<std::endl;
  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
  std::cout << "**************************************"<<std::endl;

  const int nrhs=12;

  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});

  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);

  ///////////////////////
  // Deflation guesser object
  ///////////////////////
  MultiRHSDeflation<CoarseVector> MrhsGuesser;

  //////////////////////////////////////////
  // Block projector for coarse/fine
  //////////////////////////////////////////
  MultiRHSBlockProject<LatticeFermionD> MrhsProjector;

  std::cout << "**************************************"<<std::endl;
  std::cout << "Coarsen after refine"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << "         Coarse Lanczos               "<<std::endl;
  std::cout << "**************************************"<<std::endl;

  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,301);  // 4.4.6.4
  //  Chebyshev<CoarseVector>      IRLCheby(0.0012,42.0,501);  // for 4.4.4.4 blocking 350 evs
  //  Chebyshev<CoarseVector>      IRLCheby(0.0014,42.0,501);  // for 4.4.4.4 blocking 700 evs
  //  Chebyshev<CoarseVector>      IRLCheby(0.002,42.0,501);  // for 4.4.4.4 blocking 1226 evs
  //  Chebyshev<CoarseVector>      IRLCheby(0.0025,42.0,501);  // for 4.4.4.4 blocking 1059 evs
  //  3e-4,2);
  Chebyshev<CoarseVector>      IRLCheby(0.0018,42.0,301);  // for 4.4.4.4 blocking  // 790 evs

  MrhsHermMatrix MrhsCoarseOp     (mrhs);

  CoarseVector pm_src(CoarseMrhs);
  pm_src = ComplexD(1.0);
  PowerMethod<CoarseVector>       cPM;   cPM(MrhsCoarseOp,pm_src);
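  // NOTE: the power method gives an estimate of the largest eigenvalue of the
  // multi-RHS coarse operator, which the upper edge (42.0) of the IRLCheby
  // filters above has to cover; the lower edge (0.0018 here) sets how much of
  // the low end is amplified and hence how many eigenvectors the block Lanczos
  // resolves (the comments record ~790 for the 4.4.4.4 blocking).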

  //  int Nk=nrhs*30; // 4.4.6.4
  //  int Nk=nrhs*80;
  int Nk=nrhs*60; // 720
  int Nm=Nk*4;    // 2880 ; generally finishes at 1440
  int Nstop=512;
  int Nconv_test_interval=1;

  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
							  Coarse5d,
							  CoarseMrhs,
							  nrhs,
							  IRLCheby,
							  Nstop,
							  Nconv_test_interval,
							  nrhs,
							  Nk,
							  Nm,
							  3e-4,2);
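  // NOTE: the Lanczos sizing is expressed per right-hand side: Nk = nrhs*60 = 720
  // basis vectors, a restart window Nm = 4*Nk = 2880 (the comment above notes it
  // generally terminates near 1440), and only Nstop = 512 converged vectors are
  // retained for deflation; the loose 3e-4 residual appears to be acceptable here
  // because the vectors feed a deflation guess rather than an exact solve.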

  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
  std::vector<CoarseVector>     c_src(nrhs,Coarse5d);

  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }
  int Nconv;
  IRL.calc(eval,evec,c_src,Nconv,LanczosType::rbl);
  Nconv = eval.size();
  std::cout << "**************************************"<<std::endl;
  std::cout << " import coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsGuesser.ImportEigenBasis(evec,eval);

  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling mRHS HDCG"<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
  MrhsProjector.ImportBasis(Aggregates.subspace);

  //////////////////////////
  // Extra HDCG parameters
  //////////////////////////
  int maxit=3000;
  ConjugateGradient<CoarseVector>  CG(7.5e-2,maxit,false);
  RealD lo=2.0;
  int ord = 7;

  DoNothingGuesser<CoarseVector> DoNothing;
  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
  HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing);

  /////////////////////////////////////////////////
  // Mirs smoother
  /////////////////////////////////////////////////
  RealD MirsShift = lo;
  ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
  CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;


  TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
    HDCGmrhs(1.0e-8, 500,
	     FineHermOp,
	     CGsmooth,
	     HPDSolveMrhs,    // Used in M1
	     HPDSolveMrhs,          // Used in Vstart
	     MrhsProjector,
	     MrhsGuesser,
	     CoarseMrhs);

  std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
  std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);

  for(int r=0;r<nrhs;r++){
    random(RNG5,src_mrhs[r]);
    res_mrhs[r]=Zero();
  }

  HDCGmrhs(src_mrhs,res_mrhs);
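  // NOTE: this one call runs the two-level ADEF2 solver to 1e-8 for all nrhs=12
  // sources together: the shifted-operator CG smoother acts on the fine
  // red-black grid, the coarse correction is the deflated multi-RHS CG
  // (HPDSolveMrhs with MrhsGuesser) on CoarseMrhs, and MrhsProjector moves the
  // block of vectors between the two levels.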

  // Standard CG
#if 0
  {
  std::cout << "**************************************"<<std::endl;
  std::cout << "Calling red black CG"<<std::endl;
  std::cout << "**************************************"<<std::endl;

    LatticeFermion result(FrbGrid); result=Zero();
    LatticeFermion    src(FrbGrid); random(RNG5,src);
    result=Zero();

    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
#endif  
  Grid_finalize();
  return 0;
}