Working version of Lanczos without the extra copy.

2025-11-11 17:19:31 +00:00 · 2017-04-06 23:35:30 -04:00
parent 9e48b7dfda
commit 93cb5d4e97
1 changed files with 55 additions and 64 deletions
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -8,6 +8,7 @@

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -45,6 +46,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
 #include "DenseMatrix.h"
 #include "EigenSort.h"

+// eliminate temporary vector in calc()
+#define MEM_SAVE
+
 namespace Grid {

 /////////////////////////////////////////////////////////////
@@ -496,8 +500,6 @@ until convergence
 */

 // alternate implementation for minimizing memory usage.  May affect the performance
-#define MEM_SAVE
-#undef MEM_SAVE2
    void calc(DenseVector<RealD>& eval,
 	      DenseVector<Field>& evec,
 	      const Field& src,
@@ -520,13 +522,12 @@ until convergence
 	DenseVector<RealD> Qt(Nm*Nm);
 	DenseVector<int>   Iconv(Nm);

-#if (!defined MEM_SAVE ) || (!defined MEM_SAVE2)
+#if (!defined MEM_SAVE )
 	DenseVector<Field>  B(Nm,grid); // waste of space replicating
 #endif
 	
 	Field f(grid);
 	Field v(grid);
-//	auto B2 = evec[0]._odata[0];
  
 	int k1 = 1;
 	int k2 = Nk;
@@ -612,7 +613,7 @@ until convergence
 	assert(k2<Nm);

 #ifndef MEM_SAVE
-if (0) {  
+	if (0) {   // old implementation without blocking
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 	  
 	  for(int j=k1-1; j<k2+1; ++j){
@@ -621,12 +622,7 @@ if (0) {
 	      B[j] += Qt[k+Nm*j] * evec[k];
 	    }
 	  }
-	t1=usecond()/1e6;
-	std::cout<<GridLogMessage <<"IRL::QR Rotate: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
 	}
-#endif
-
-#ifndef MEM_SAVE
 	{
 	for(int i=0; i<(Nk+1); ++i) {
 		B[i] = 0.0;
@@ -651,25 +647,24 @@ PARALLEL_FOR_LOOP
 	
 	assert(k2<Nm);
 	assert(k1>0);
-//	DenseVector < decltype(B2) > B(Nm);
-//	std::vector < decltype( B2 ) > B(Nm*thr,B2);
 	Field B(grid);
 PARALLEL_FOR_LOOP
 	for(int ss=0;ss < grid->oSites();ss++){
+//		auto B2 = evec[0]._odata[0];
+//		std::vector < decltype( B2 ) > B(Nm*thr,B2);
 		int thr=GridThread::GetThreads();
 		int me = GridThread::ThreadBarrier();
-		printf("thr=%d ss=%d me=%d\n",thr,ss,me);fflush(stdout);
+//		printf("thr=%d ss=%d me=%d\n",thr,ss,me);fflush(stdout);
+		assert(Nm*thr<grid->oSites());
 		for(int j=0; j<Nm; ++j) B._odata[j+Nm*me]=0.;
 		for(int j=k1-1; j<(k2+1); ++j){
 			for(int k=0; k<Nm ; ++k){
 			    B._odata[j+Nm*me] +=Qt[k+Nm*j] * evec[k]._odata[ss]; 
 			}
 		}
-#if 1
 		for(int j=k1-1; j<(k2+1); ++j){
 			evec[j]._odata[ss] = B._odata[j+Nm*me];
 		}
-#endif
 	}
 }
 #endif
@@ -697,7 +692,7 @@ PARALLEL_FOR_LOOP
 	t1=usecond()/1e6;
 	std::cout<<GridLogMessage <<"IRL::diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
 	  
-#ifndef MEM_SAVE2
+#ifndef MEM_SAVE
 if (0) {
 	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
 	  
@@ -769,7 +764,6 @@ PARALLEL_FOR_LOOP
 			B.checkerboard = evec[0].checkerboard;
 		    for(int k = 0; k<Nk; ++k){
 		    	B += Qt[k+j*Nm] * evec[k];
-//		B[Iconv[j]] +=Qt[k+Nm*Iconv[j]] * evec[k]._odata[ss]; 
 		    }
 		    std::cout<<GridLogMessage << "norm(B["<<j<<"])="<<norm2(B)<<std::endl;
 //		    _poly(_Linop,B,v);
@@ -794,8 +788,6 @@ PARALLEL_FOR_LOOP
 		      ++Nconv;
 		    }
 		}
-//	t1=usecond()/1e6;
-//	std::cout<<GridLogMessage <<"IRL::Convergence rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
 	}
 #endif
 	t1=usecond()/1e6;
@@ -816,35 +808,34 @@ PARALLEL_FOR_LOOP
       // Sorting
       eval.resize(Nconv);
       evec.resize(Nconv,grid);
-#ifndef MEM_SAVE2
+#ifndef MEM_SAVE
       for(int i=0; i<Nconv; ++i){
         eval[i] = eval2[Iconv[i]];
         evec[i] = B[Iconv[i]];
       }
 #else
-#if 0
-	Field B(grid);
-	int thr=GridThread::GetThreads();
-	int me = GridThread::ThreadBarrier();
-	printf("thr=%d ss=%d me=%d\n",thr,ss,me);fflush(stdout);
-#endif
 {
       for(int i=0; i<Nconv; ++i)
         eval[i] = eval2[Iconv[i]];
 //	int thr=GridThread::GetThreads();
 //	printf("thr=%d\n",thr);
+	Field B(grid);
 PARALLEL_FOR_LOOP
 	for(int ss=0;ss < grid->oSites();ss++){
-		auto B2 = evec[0]._odata[0];
-		std::vector < decltype( B2 ) > B(Nm,B2);
-		for(int j=0; j<Nconv; ++j) B[Iconv[j]]=0.;
+		int thr=GridThread::GetThreads();
+		int me = GridThread::ThreadBarrier();
+//		printf("thr=%d ss=%d me=%d\n",thr,ss,me);fflush(stdout);
+//		auto B2 = evec[0]._odata[0];
+//		std::vector < decltype( B2 ) > B(Nm,B2);
+		assert( (Nm*thr)<grid->oSites());
+		for(int j=0; j<Nconv; ++j) B._odata[Iconv[j]+Nm*me]=0.;
 		for(int j=0; j<Nconv; ++j){
 			for(int k=0; k<Nm ; ++k){
-			    B[Iconv[j]] +=Qt[k+Nm*Iconv[j]] * evec[k]._odata[ss]; 
+			    B._odata[Iconv[j]+Nm*me] +=Qt[k+Nm*Iconv[j]] * evec[k]._odata[ss]; 
 			}
 		}
 		for(int j=0; j<Nconv; ++j){
-			evec[Iconv[j]]._odata[ss] = B[Iconv[j]];
+			evec[Iconv[j]]._odata[ss] = B._odata[Iconv[j]+Nm*me];
 		}
 	}
 }